cctally 1.11.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,21 +41,18 @@ Holds:
41
41
  - ``cmd_cache_sync`` — entry point for ``cctally cache-sync
42
42
  [--source {claude,codex,all}] [--rebuild]``.
43
43
 
44
- What stays in bin/cctally:
44
+ What lives in bin/_cctally_core (promoted 2026-05-22, #84):
45
45
  - Path constants ``APP_DIR``, ``CACHE_DB_PATH``, ``CACHE_LOCK_PATH``,
46
- ``CACHE_LOCK_CODEX_PATH``, ``CODEX_SESSIONS_DIR`` referenced from
47
- the moved bodies via the ``c = _cctally()`` call-time accessor
48
- pattern (spec §5.5, same as ``bin/_lib_subscription_weeks.py`` and
49
- ``bin/_lib_aggregators.py``). The accessor resolves
50
- ``sys.modules['cctally'].X`` on every call, so
51
- ``monkeypatch.setitem(ns, "CACHE_DB_PATH", tmp)`` and conftest
52
- ``redirect_paths`` HOME redirects propagate transparently with NO
53
- test-side changes (tests already patch ``ns["CACHE_DB_PATH"]`` etc.
54
- by setitem on the dict-as-module bridge). We chose ``c.X`` over the
55
- ``_cctally_db.py``-style seed block here because cache tests are
56
- widely scattered (record-usage tick, dashboard panels, share render
57
- kernel, block tests, every JSONL-reading subcommand fixture) and
58
- Phase C-style inline patching would touch dozens of sites.
46
+ ``CACHE_LOCK_CODEX_PATH``. Moved bodies read these via call-time
47
+ ``_cctally_core.X`` and tests patch via
48
+ ``monkeypatch.setattr(_cctally_core, "X", v)`` (or the conftest
49
+ ``redirect_paths()`` helper). The legacy
50
+ ``setitem(ns, "CACHE_DB_PATH", …)`` pattern is forbidden by
51
+ ``test_no_old_style_test_patches_for_promoted_globals``.
52
+
53
+ What stays in bin/cctally:
54
+ - ``CODEX_SESSIONS_DIR`` — out of scope for #84; still read via the
55
+ ``c = _cctally()`` call-time accessor (spec §5.5).
59
56
  - ``_sum_cost_for_range`` — sits at the cache↔report boundary; 6+
60
57
  callers outside cache (forecast, weekly, report, project, doctor),
61
58
  so the directive keeps it on the bin/cctally side.
@@ -122,6 +119,7 @@ def _cctally():
122
119
  # Spec 2026-05-17-cctally-core-kernel-extraction.md §3.3: kernel symbols
123
120
  # (Z-leaf + Z-mid) import from _cctally_core. The legacy shim function
124
121
  # for ``eprint`` is deleted.
122
+ import _cctally_core
125
123
  from _cctally_core import eprint
126
124
 
127
125
 
@@ -167,6 +165,7 @@ _CodexIterState = _lib_jsonl._CodexIterState
167
165
  _iter_jsonl_entries_with_offsets = _lib_jsonl._iter_jsonl_entries_with_offsets
168
166
  _iter_codex_jsonl_entries_with_offsets = _lib_jsonl._iter_codex_jsonl_entries_with_offsets
169
167
  _parse_usage_entries = _lib_jsonl._parse_usage_entries
168
+ _should_replace = _lib_jsonl._should_replace
170
169
 
171
170
  _cctally_db_sib = _load_lib("_cctally_db")
172
171
  add_column_if_missing = _cctally_db_sib.add_column_if_missing
@@ -175,11 +174,12 @@ _CACHE_MIGRATIONS = _cctally_db_sib._CACHE_MIGRATIONS
175
174
 
176
175
 
177
176
  # === BEGIN MOVED REGIONS ===
178
- # Path constants (APP_DIR, CACHE_DB_PATH, CACHE_LOCK_PATH,
179
- # CACHE_LOCK_CODEX_PATH, CODEX_SESSIONS_DIR) are accessed via the
180
- # `c = _cctally()` call-time accessor inside each function that
181
- # needs them — so ``monkeypatch.setitem(ns, "CACHE_DB_PATH", tmp)``
182
- # in tests resolves on every read (no stale module-level binding).
177
+ # Path constants APP_DIR / CACHE_DB_PATH / CACHE_LOCK_PATH /
178
+ # CACHE_LOCK_CODEX_PATH live in _cctally_core (promoted 2026-05-22, #84);
179
+ # moved bodies read them via call-time ``_cctally_core.X`` and tests
180
+ # patch via ``monkeypatch.setattr(_cctally_core, "X", v)``.
181
+ # CODEX_SESSIONS_DIR stays in bin/cctally (out of scope for #84) and is
182
+ # still accessed via the ``c = _cctally()`` call-time accessor.
183
183
 
184
184
  # === Region 1: ProjectKey + _resolve_project_key (was bin/cctally:1994-2069) ===
185
185
 
@@ -305,7 +305,16 @@ class IngestStats:
305
305
  files_processed: int = 0
306
306
  files_skipped_unchanged: int = 0
307
307
  files_reset_truncated: int = 0
308
- rows_inserted: int = 0
308
+ # Count of session_entries rows written by this sync — both genuinely-
309
+ # new INSERTs and ccusage-parity ON CONFLICT DO UPDATE replacements
310
+ # (the dedup tiebreaker swaps a streaming-intermediate row for the
311
+ # post-stream finalization). SQLite's `total_changes` counter
312
+ # increments on both, so this field is "rows changed", not "rows
313
+ # newly inserted". Pre-dedup builds used INSERT OR IGNORE where
314
+ # conflicts did NOT bump the counter; the name change preserves the
315
+ # observability metric without misrepresenting UPSERT updates as
316
+ # new inserts.
317
+ rows_changed: int = 0
309
318
  lock_contended: bool = False
310
319
 
311
320
 
@@ -315,7 +324,7 @@ def _progress_stderr(stats: IngestStats, *, force: bool = False) -> None:
315
324
  return
316
325
  eprint(
317
326
  f"[cache-sync] {stats.files_processed}/{stats.files_total} files, "
318
- f"{stats.rows_inserted} new rows"
327
+ f"{stats.rows_changed} rows changed"
319
328
  )
320
329
 
321
330
 
@@ -422,10 +431,10 @@ def sync_cache(
422
431
  """
423
432
  stats = IngestStats()
424
433
  c = _cctally()
425
- c.APP_DIR.mkdir(parents=True, exist_ok=True)
426
- c.CACHE_LOCK_PATH.touch()
434
+ _cctally_core.APP_DIR.mkdir(parents=True, exist_ok=True)
435
+ _cctally_core.CACHE_LOCK_PATH.touch()
427
436
 
428
- lock_fh = open(c.CACHE_LOCK_PATH, "w")
437
+ lock_fh = open(_cctally_core.CACHE_LOCK_PATH, "w")
429
438
  try:
430
439
  try:
431
440
  fcntl.flock(lock_fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
@@ -434,6 +443,30 @@ def sync_cache(
434
443
  stats.lock_contended = True
435
444
  return stats
436
445
 
446
+ # Walk-complete sentinel gating (cctally-dev#93, D5b/D6b). Capture
447
+ # whether cache 001 was already applied at the moment this sync
448
+ # acquired the lock. The end-of-loop marker write is gated on this so
449
+ # a walk whose baseline predates the 001 wipe (the "straddle" run)
450
+ # withholds the marker — it cannot vouch for a cache that 001 wiped
451
+ # underneath it. On the normal first-upgrade flow open_cache_db runs
452
+ # the dispatcher (001 applies in-process) BEFORE sync_cache is ever
453
+ # called, so this is True and the marker is written as expected. If
454
+ # schema_migrations doesn't exist yet, treat as not-applied (False).
455
+ try:
456
+ applied_at_start = conn.execute(
457
+ "SELECT 1 FROM schema_migrations WHERE name='001_dedup_highest_wins'"
458
+ ).fetchone() is not None
459
+ except sqlite3.OperationalError:
460
+ applied_at_start = False
461
+
462
+ # Tracks whether every file in this walk was either ingested cleanly
463
+ # or confirmed-current. Any per-file error-skip (stat/read failure or
464
+ # a DB error that rolls back + continues) flips it False so the marker
465
+ # is withheld — an incomplete walk must not look complete. The
466
+ # unchanged-file early-exit (`size == prev_size`) does NOT flip it: a
467
+ # confirmed-current file still counts as walked.
468
+ walk_clean = True
469
+
437
470
  if rebuild:
438
471
  # Clear INSIDE the lock — a concurrent rebuild that lost the
439
472
  # race would otherwise have wiped this cache before bailing,
@@ -442,6 +475,11 @@ def sync_cache(
442
475
  # empty baseline.
443
476
  conn.execute("DELETE FROM session_entries")
444
477
  conn.execute("DELETE FROM session_files")
478
+ # Clear the walk-complete sentinel atomically with the wipe
479
+ # (cctally-dev#93, D5/D2): a stale "complete" marker must never
480
+ # survive a destructive rebuild. The end-of-loop write below
481
+ # re-establishes it only after this rebuild's clean walk.
482
+ conn.execute("DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'")
445
483
  conn.commit()
446
484
  eprint("[cache-sync] rebuild: cleared Claude cached entries")
447
485
 
@@ -465,6 +503,120 @@ def sync_cache(
465
503
  )
466
504
  }
467
505
 
506
+ # Orphaned-tracked-file detection (cctally-dev#93 review). A path
507
+ # tracked in session_files (with data already ingested) but no
508
+ # longer present on disk leaves orphaned session_entries rows that
509
+ # the per-file loop below never visits — it iterates only on-disk
510
+ # `paths`. sync_cache deliberately does NOT prune those orphans
511
+ # in-place: a deleted file shares the truncation hazard (under the
512
+ # sticky source_path dedup a surviving file may carry the same
513
+ # (msg_id, req_id) yet keep its size_bytes, so a per-orphan DELETE
514
+ # could drop a row the survivor still owns without re-ingesting
515
+ # it), and a blanket full-reset would wrongly fire on the
516
+ # legitimate "cache seeded with synthetic source paths" fixture
517
+ # pattern. Instead we INVALIDATE the walk-complete marker: an
518
+ # orphaned cache no longer faithfully mirrors disk, so it is — by
519
+ # the marker's own definition — not a complete walk. We must
520
+ # actively DELETE any marker a PRIOR clean walk left behind (not
521
+ # merely withhold THIS run's end-of-loop rewrite — that rewrite is
522
+ # gated on walk_clean, but a stale marker from a previous sync
523
+ # would otherwise survive and keep vouching for completeness).
524
+ # Setting walk_clean=False additionally suppresses the end-of-loop
525
+ # rewrite so the marker stays absent for this run. With the marker
526
+ # gone the upgrade gate DEFERs the 008/009/010 recomputes (rather
527
+ # than certifying aggregates that still include data from files no
528
+ # longer on disk); the operator clears the orphans by running
529
+ # `cache-sync --rebuild` (the documented re-derive path), which
530
+ # re-establishes the marker. Only paths whose row carried ingested
531
+ # bytes (size_bytes > 0) count — a size_bytes=0 row holds no
532
+ # session_entries, so its absence leaves no orphan. The DELETE +
533
+ # commit lands BEFORE the per-file read+parse loop, so no write
534
+ # lock is held into that loop (same discipline as the truncation
535
+ # escalation just below).
536
+ on_disk_paths = {str(jp) for jp in paths}
537
+ orphaned_tracked_paths = [
538
+ p for p, (size_bytes, _, _) in existing.items()
539
+ if size_bytes and p not in on_disk_paths
540
+ ]
541
+ if orphaned_tracked_paths:
542
+ eprint(
543
+ f"[cache] {len(orphaned_tracked_paths)} tracked file(s) no "
544
+ f"longer on disk; invalidating walk-complete marker "
545
+ f"(run `cache-sync --rebuild` to prune orphaned entries)"
546
+ )
547
+ conn.execute(
548
+ "DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'"
549
+ )
550
+ conn.commit()
551
+ walk_clean = False # orphaned rows -> cache doesn't mirror disk (D5a)
552
+
553
+ # Pre-scan for any truncation among tracked files. Under the
554
+ # ccusage-parity ON CONFLICT DO UPDATE, source_path is PINNED to
555
+ # whichever file first inserted a (msg_id, req_id) row (see U1
556
+ # in this file). Later UPSERTs from a DIFFERENT file may have
557
+ # updated the token columns on that row while leaving source_path
558
+ # pointing at the original (now possibly truncated) file. A
559
+ # naive per-file truncation path then deletes by source_path and
560
+ # loses data the other file is still carrying — but that other
561
+ # file's `size_bytes` is unchanged, so the per-file early-exit
562
+ # at `if size == prev_size: continue` skips its re-ingest.
563
+ #
564
+ # Escalation: when any file's size has shrunk, drop the entire
565
+ # session_entries cache and force every file to re-ingest from
566
+ # offset 0. The cache is fully re-derivable, this is rare (only
567
+ # on JSONL rotation / manual edits), and it sidesteps the
568
+ # per-key contributing-file bookkeeping that would otherwise be
569
+ # required. The lock is already held, so this is atomic with
570
+ # the subsequent per-file ingest.
571
+ truncated_paths: set[str] = set()
572
+ for jp in paths:
573
+ prev = existing.get(str(jp))
574
+ if prev is None:
575
+ continue
576
+ try:
577
+ st = jp.stat()
578
+ except OSError:
579
+ continue
580
+ if st.st_size < prev[0]:
581
+ truncated_paths.add(str(jp))
582
+
583
+ if truncated_paths:
584
+ eprint(
585
+ f"[cache-sync] truncation detected on {len(truncated_paths)} "
586
+ f"file(s) — re-ingesting all files (safe under ccusage-parity "
587
+ f"dedup)"
588
+ )
589
+ conn.execute("DELETE FROM session_entries")
590
+ # Clear the walk-complete sentinel atomically with the truncation
591
+ # full-reset (cctally-dev#93, D5/D2): the cache is being wiped, so
592
+ # any "complete" marker is now stale. The end-of-loop write below
593
+ # re-establishes it only after this run's clean re-ingest walk.
594
+ conn.execute("DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'")
595
+ # Crash-safety: also clear session_files's size/offset tracking
596
+ # so a partial-state recovery on the NEXT sync forces every
597
+ # file's per-file branch to take the fresh-ingest path. Without
598
+ # this, if the process is killed (kill -9, power loss) between
599
+ # this DELETE commit and the per-file re-ingest commits below,
600
+ # the next sync would only re-detect the originally-truncated
601
+ # file(s); other files still have matching size_bytes and the
602
+ # `if size == prev_size: continue` early-exit would leave them
603
+ # missing from session_entries until file size changes or an
604
+ # operator runs `cache-sync --rebuild`. UPDATE (not DELETE)
605
+ # preserves session_id / project_path columns lazy-backfilled
606
+ # by _ensure_session_files_row (used by the `session`
607
+ # subcommand's JOIN).
608
+ conn.execute(
609
+ "UPDATE session_files SET size_bytes = 0, last_byte_offset = 0"
610
+ )
611
+ conn.commit()
612
+ stats.files_reset_truncated += len(truncated_paths)
613
+ # Force every file to re-ingest from offset 0: clearing the
614
+ # `existing` map makes `prev is None` true downstream, so the
615
+ # per-file branch takes the fresh-ingest path (start_offset=0,
616
+ # truncated=False since we already wiped the table above —
617
+ # avoids a redundant per-file DELETE that would be a no-op).
618
+ existing = {}
619
+
468
620
  for jp in paths:
469
621
  path_str = str(jp)
470
622
  # Backfill session_id/project_path for A2 `session` subcommand.
@@ -478,6 +630,7 @@ def sync_cache(
478
630
  st = jp.stat()
479
631
  except OSError as exc:
480
632
  eprint(f"[cache] stat failed for {jp}: {exc}")
633
+ walk_clean = False # skipped a file without ingesting (D5a)
481
634
  continue
482
635
 
483
636
  size = st.st_size
@@ -536,6 +689,7 @@ def sync_cache(
536
689
  final_offset = fh.tell()
537
690
  except OSError as exc:
538
691
  eprint(f"[cache] could not read {jp}: {exc}")
692
+ walk_clean = False # skipped a file without ingesting (D5a)
539
693
  continue
540
694
 
541
695
  # Python's sqlite3 module starts an implicit transaction on the
@@ -553,16 +707,65 @@ def sync_cache(
553
707
  stats.files_reset_truncated += 1
554
708
  if rows:
555
709
  before = conn.total_changes
710
+ # ccusage-parity ON CONFLICT DO UPDATE: higher-token total
711
+ # wins on conflict; speed-set breaks ties. The partial
712
+ # UNIQUE index `idx_entries_dedup` restricts the conflict
713
+ # target to (msg_id IS NOT NULL AND req_id IS NOT NULL),
714
+ # so the WHERE clause on the conflict target MUST repeat
715
+ # that predicate verbatim — bare `ON CONFLICT(msg_id,
716
+ # req_id)` raises OperationalError. NULL-keyed rows fall
717
+ # through to a plain INSERT, unchanged.
718
+ #
719
+ # `source_path` is INTENTIONALLY OMITTED from the DO
720
+ # UPDATE SET clause: it stays pinned to whichever JSONL
721
+ # FIRST INSERTed the (msg_id, req_id) row. The
722
+ # downstream `LEFT JOIN session_files ON sf.path =
723
+ # se.source_path` uses source_path to attribute tokens
724
+ # to a `project_path`. If a later UPSERT from a
725
+ # different file flipped source_path, the row's
726
+ # project attribution would move with the winner —
727
+ # `cctally project` would mis-aggregate. Sticky
728
+ # source_path matches pre-dedup INSERT OR IGNORE
729
+ # behavior and the operator's mental model.
730
+ # (`line_offset` is similarly sticky for the same
731
+ # reason — the offset only makes sense within the
732
+ # file that originally wrote the row.)
556
733
  conn.executemany(
557
- """INSERT OR IGNORE INTO session_entries
734
+ """INSERT INTO session_entries
558
735
  (source_path, line_offset, timestamp_utc, model,
559
736
  msg_id, req_id, input_tokens, output_tokens,
560
737
  cache_create_tokens, cache_read_tokens,
561
738
  usage_extra_json, cost_usd_raw)
562
- VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
739
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
740
+ ON CONFLICT(msg_id, req_id)
741
+ WHERE msg_id IS NOT NULL AND req_id IS NOT NULL
742
+ DO UPDATE SET
743
+ timestamp_utc = excluded.timestamp_utc,
744
+ model = excluded.model,
745
+ input_tokens = excluded.input_tokens,
746
+ output_tokens = excluded.output_tokens,
747
+ cache_create_tokens = excluded.cache_create_tokens,
748
+ cache_read_tokens = excluded.cache_read_tokens,
749
+ usage_extra_json = excluded.usage_extra_json,
750
+ cost_usd_raw = excluded.cost_usd_raw
751
+ WHERE
752
+ (excluded.input_tokens + excluded.output_tokens
753
+ + excluded.cache_create_tokens + excluded.cache_read_tokens)
754
+ >
755
+ (session_entries.input_tokens + session_entries.output_tokens
756
+ + session_entries.cache_create_tokens + session_entries.cache_read_tokens)
757
+ OR (
758
+ (excluded.input_tokens + excluded.output_tokens
759
+ + excluded.cache_create_tokens + excluded.cache_read_tokens)
760
+ =
761
+ (session_entries.input_tokens + session_entries.output_tokens
762
+ + session_entries.cache_create_tokens + session_entries.cache_read_tokens)
763
+ AND json_extract(excluded.usage_extra_json, '$.speed') IS NOT NULL
764
+ AND json_extract(session_entries.usage_extra_json, '$.speed') IS NULL
765
+ )""",
563
766
  rows,
564
767
  )
565
- stats.rows_inserted += conn.total_changes - before
768
+ stats.rows_changed += conn.total_changes - before
566
769
  # UPSERT preserves session_id / project_path columns populated
567
770
  # by _ensure_session_files_row at the top of this loop. A plain
568
771
  # INSERT OR REPLACE would wipe them on every changed-file sync.
@@ -585,6 +788,7 @@ def sync_cache(
585
788
  except sqlite3.DatabaseError as exc:
586
789
  eprint(f"[cache] db error on {jp}: {exc}")
587
790
  conn.rollback()
791
+ walk_clean = False # rolled back this file without ingesting (D5a)
588
792
  continue
589
793
 
590
794
  if progress is not None:
@@ -592,6 +796,22 @@ def sync_cache(
592
796
 
593
797
  if progress is not None:
594
798
  progress(stats)
799
+
800
+ # Walk-complete sentinel write (cctally-dev#93, D5a). Still inside the
801
+ # held fcntl lock, before the finally-unlock. Only when the entire walk
802
+ # was clean AND cache 001 was already applied at the start of this run
803
+ # (D5b): an unclean walk or a straddle run must not vouch for cache
804
+ # completeness. A lock-contended sync returned early above and never
805
+ # reaches here. Presence (not the timestamp) is the gate signal; the
806
+ # value stores the completion instant for doctor/debugging.
807
+ if walk_clean and applied_at_start:
808
+ conn.execute(
809
+ "INSERT INTO cache_meta(key, value) "
810
+ "VALUES('claude_ingest_walk_complete', ?) "
811
+ "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
812
+ (dt.datetime.now(dt.timezone.utc).isoformat(),),
813
+ )
814
+ conn.commit()
595
815
  return stats
596
816
  finally:
597
817
  try:
@@ -665,15 +885,26 @@ def _collect_entries_direct(
665
885
  *,
666
886
  project: str | None = None,
667
887
  ) -> list[UsageEntry]:
668
- """Legacy direct-parse fallback used when the cache DB can't be opened."""
888
+ """Legacy direct-parse fallback used when the cache DB can't be opened.
889
+
890
+ Uses the ccusage-parity dict-keyed accumulator: dedup-keyed entries
891
+ live in `dedupe_map` and are tiebroken via `_should_replace` (higher
892
+ token total wins, speed-set breaks ties). Entries with NULL msg_id or
893
+ req_id bypass the map and land verbatim — partial UNIQUE index on the
894
+ cache mirrors this behavior. Flattened + sorted once at the end.
895
+ """
669
896
  files = _discover_session_files(range_start, project=project)
670
- seen_hashes: set[str] = set()
671
- entries: list[UsageEntry] = []
897
+ dedupe_map: dict[str, UsageEntry] = {}
898
+ no_key: list[UsageEntry] = []
672
899
  for fp in files:
673
- entries.extend(
674
- _parse_usage_entries(fp, range_start, range_end, seen_hashes=seen_hashes)
900
+ no_key.extend(
901
+ _parse_usage_entries(
902
+ fp, range_start, range_end, dedupe_map=dedupe_map,
903
+ )
675
904
  )
676
- return entries
905
+ all_entries = list(dedupe_map.values()) + no_key
906
+ all_entries.sort(key=lambda e: e.timestamp)
907
+ return all_entries
677
908
 
678
909
 
679
910
  # === Region 4: _JoinedClaudeEntry + get_claude_session_entries (was bin/cctally:2478-2668) ===
@@ -809,10 +1040,23 @@ def _direct_parse_claude_session_entries(
809
1040
  scan the file for the first `sessionId` / `cwd` value, else fall
810
1041
  back to the filename UUID and the decoded-escaped parent directory
811
1042
  — same logic as `_ensure_session_files_row`.
1043
+
1044
+ Uses the ccusage-parity dict-keyed accumulator. Each per-file parse
1045
+ contributes into a global `(entry, source_path)` map keyed by
1046
+ `msg_id:req_id`; ties broken by `_should_replace`. NULL-keyed entries
1047
+ bypass dedup. After all files are walked, results are stamped with
1048
+ their owning file's session_id/cwd metadata and emitted in
1049
+ timestamp order.
812
1050
  """
813
- results: list[_JoinedClaudeEntry] = []
814
1051
  files = _discover_session_files(range_start, project=project)
815
- seen_hashes: set[str] = set()
1052
+
1053
+ # File metadata: source_path -> (session_id, project_path/cwd).
1054
+ meta_by_path: dict[str, tuple[str, str]] = {}
1055
+
1056
+ # Global accumulator: (msg_id:req_id) -> (UsageEntry, source_path).
1057
+ dedupe_map: dict[str, tuple[UsageEntry, str]] = {}
1058
+ # Null-key entries (rare; same as the cache's partial-index fallthrough).
1059
+ no_key_with_meta: list[tuple[UsageEntry, str]] = []
816
1060
 
817
1061
  for fp in files:
818
1062
  source_path = str(fp)
@@ -847,27 +1091,67 @@ def _direct_parse_claude_session_entries(
847
1091
  session_id = os.path.splitext(os.path.basename(source_path))[0]
848
1092
  if cwd is None:
849
1093
  cwd = _decode_escaped_cwd(os.path.basename(os.path.dirname(source_path)))
1094
+ meta_by_path[source_path] = (session_id, cwd)
1095
+
1096
+ # Parse this file with a fresh per-file dedupe_map so we can attach
1097
+ # the source_path provenance to whatever wins this file's local
1098
+ # contests. Then merge into the global map using the same
1099
+ # `_should_replace` rule. (A shared dedupe_map across files would
1100
+ # lose the source_path of the winning entry — _parse_usage_entries
1101
+ # has no awareness of per-file metadata.)
1102
+ file_dedupe_map: dict[str, UsageEntry] = {}
1103
+ file_no_key = _parse_usage_entries(
1104
+ fp, range_start, range_end, dedupe_map=file_dedupe_map,
1105
+ )
850
1106
 
851
- for entry in _parse_usage_entries(
852
- fp, range_start, range_end, seen_hashes=seen_hashes
853
- ):
854
- usage = entry.usage
855
- results.append(_JoinedClaudeEntry(
856
- timestamp=entry.timestamp,
857
- model=entry.model,
858
- input_tokens=int(usage.get("input_tokens", 0) or 0),
859
- output_tokens=int(usage.get("output_tokens", 0) or 0),
860
- cache_creation_tokens=int(
861
- usage.get("cache_creation_input_tokens", 0) or 0
862
- ),
863
- cache_read_tokens=int(
864
- usage.get("cache_read_input_tokens", 0) or 0
865
- ),
866
- source_path=source_path,
867
- session_id=session_id,
868
- project_path=cwd,
869
- cost_usd=entry.cost_usd,
870
- ))
1107
+ # Merge file-local no-key entries directly (no dedup contest).
1108
+ for entry in file_no_key:
1109
+ no_key_with_meta.append((entry, source_path))
1110
+
1111
+ # Merge file-local dedup-keyed entries into the global map.
1112
+ # Same tiebreaker as the cache's ON CONFLICT DO UPDATE clause:
1113
+ # higher-token total wins the entry DATA. But `source_path` is
1114
+ # STICKY to whichever file FIRST contributed the key — it is NOT
1115
+ # flipped to the winner. This mirrors the cache ingest path, where
1116
+ # `source_path` is intentionally OMITTED from the ON CONFLICT DO
1117
+ # UPDATE SET clause (see the UPSERT in ``sync_cache``) so the
1118
+ # downstream `LEFT JOIN session_files ON sf.path = se.source_path`
1119
+ # attributes tokens to the project of the file that first wrote the
1120
+ # row. Replacing it here would move project attribution to the
1121
+ # winner's file — `cctally project` (and any session_files join)
1122
+ # would then disagree with the normal cached behavior exactly when
1123
+ # this fallback path is exercised.
1124
+ for key, entry in file_dedupe_map.items():
1125
+ existing = dedupe_map.get(key)
1126
+ if existing is None:
1127
+ dedupe_map[key] = (entry, source_path)
1128
+ elif _should_replace(entry, existing[0]):
1129
+ # Winner's DATA, first contributor's source_path (sticky).
1130
+ dedupe_map[key] = (entry, existing[1])
1131
+
1132
+ # Flatten + emit.
1133
+ results: list[_JoinedClaudeEntry] = []
1134
+ flat: list[tuple[UsageEntry, str]] = list(dedupe_map.values()) + no_key_with_meta
1135
+ flat.sort(key=lambda pair: pair[0].timestamp)
1136
+ for entry, source_path in flat:
1137
+ usage = entry.usage
1138
+ sid, cwd = meta_by_path[source_path]
1139
+ results.append(_JoinedClaudeEntry(
1140
+ timestamp=entry.timestamp,
1141
+ model=entry.model,
1142
+ input_tokens=int(usage.get("input_tokens", 0) or 0),
1143
+ output_tokens=int(usage.get("output_tokens", 0) or 0),
1144
+ cache_creation_tokens=int(
1145
+ usage.get("cache_creation_input_tokens", 0) or 0
1146
+ ),
1147
+ cache_read_tokens=int(
1148
+ usage.get("cache_read_input_tokens", 0) or 0
1149
+ ),
1150
+ source_path=source_path,
1151
+ session_id=sid,
1152
+ project_path=cwd,
1153
+ cost_usd=entry.cost_usd,
1154
+ ))
871
1155
 
872
1156
  return results
873
1157
 
@@ -881,7 +1165,13 @@ class CodexIngestStats:
881
1165
  files_processed: int = 0
882
1166
  files_skipped_unchanged: int = 0
883
1167
  files_reset_truncated: int = 0
884
- rows_inserted: int = 0
1168
+ # Count of codex_session_entries rows written by this sync. Codex
1169
+ # ingest uses INSERT OR IGNORE — ignored conflicts do NOT bump
1170
+ # SQLite's `total_changes`, so this number is effectively "rows
1171
+ # newly inserted". Field is named ``rows_changed`` for parity with
1172
+ # ``IngestStats`` (Claude path) which carries an UPSERT and
1173
+ # therefore counts both new INSERTs and DO UPDATE replacements.
1174
+ rows_changed: int = 0
885
1175
  lock_contended: bool = False
886
1176
 
887
1177
 
@@ -891,7 +1181,7 @@ def _progress_codex_stderr(stats: CodexIngestStats, *, force: bool = False) -> N
891
1181
  return
892
1182
  eprint(
893
1183
  f"[codex-cache] {stats.files_processed}/{stats.files_total} files, "
894
- f"{stats.rows_inserted} new rows"
1184
+ f"{stats.rows_changed} rows changed"
895
1185
  )
896
1186
 
897
1187
 
@@ -914,10 +1204,10 @@ def sync_codex_cache(
914
1204
  """
915
1205
  stats = CodexIngestStats()
916
1206
  c = _cctally()
917
- c.APP_DIR.mkdir(parents=True, exist_ok=True)
918
- c.CACHE_LOCK_CODEX_PATH.touch()
1207
+ _cctally_core.APP_DIR.mkdir(parents=True, exist_ok=True)
1208
+ _cctally_core.CACHE_LOCK_CODEX_PATH.touch()
919
1209
 
920
- lock_fh = open(c.CACHE_LOCK_CODEX_PATH, "w")
1210
+ lock_fh = open(_cctally_core.CACHE_LOCK_CODEX_PATH, "w")
921
1211
  try:
922
1212
  try:
923
1213
  fcntl.flock(lock_fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
@@ -1096,7 +1386,7 @@ def sync_codex_cache(
1096
1386
  VALUES (?,?,?,?,?,?,?,?,?,?)""",
1097
1387
  rows,
1098
1388
  )
1099
- stats.rows_inserted += conn.total_changes - before
1389
+ stats.rows_changed += conn.total_changes - before
1100
1390
  conn.execute(
1101
1391
  """INSERT OR REPLACE INTO codex_session_files
1102
1392
  (path, size_bytes, mtime_ns, last_byte_offset,
@@ -1263,94 +1553,30 @@ def open_cache_db() -> sqlite3.Connection:
1263
1553
  recreated — the cache is fully re-derivable from JSONL, so this is safe.
1264
1554
  """
1265
1555
  c = _cctally()
1266
- c.APP_DIR.mkdir(parents=True, exist_ok=True)
1556
+ _cctally_core.APP_DIR.mkdir(parents=True, exist_ok=True)
1267
1557
  try:
1268
- conn = sqlite3.connect(c.CACHE_DB_PATH)
1558
+ conn = sqlite3.connect(_cctally_core.CACHE_DB_PATH)
1269
1559
  conn.execute("SELECT 1").fetchone()
1270
1560
  except sqlite3.DatabaseError as exc:
1271
1561
  eprint(f"[cache] corrupt cache DB ({exc}); recreating")
1272
1562
  try:
1273
- c.CACHE_DB_PATH.unlink()
1563
+ _cctally_core.CACHE_DB_PATH.unlink()
1274
1564
  except FileNotFoundError:
1275
1565
  pass
1276
- conn = sqlite3.connect(c.CACHE_DB_PATH)
1566
+ conn = sqlite3.connect(_cctally_core.CACHE_DB_PATH)
1277
1567
 
1278
1568
  conn.execute("PRAGMA journal_mode=WAL")
1279
1569
  conn.execute("PRAGMA busy_timeout=5000")
1280
1570
 
1281
- conn.executescript(
1282
- """
1283
- CREATE TABLE IF NOT EXISTS session_files (
1284
- path TEXT PRIMARY KEY,
1285
- size_bytes INTEGER NOT NULL,
1286
- mtime_ns INTEGER NOT NULL,
1287
- last_byte_offset INTEGER NOT NULL,
1288
- last_ingested_at TEXT NOT NULL
1289
- );
1290
- CREATE TABLE IF NOT EXISTS session_entries (
1291
- id INTEGER PRIMARY KEY AUTOINCREMENT,
1292
- source_path TEXT NOT NULL,
1293
- line_offset INTEGER NOT NULL,
1294
- timestamp_utc TEXT NOT NULL,
1295
- model TEXT NOT NULL,
1296
- msg_id TEXT,
1297
- req_id TEXT,
1298
- input_tokens INTEGER NOT NULL DEFAULT 0,
1299
- output_tokens INTEGER NOT NULL DEFAULT 0,
1300
- cache_create_tokens INTEGER NOT NULL DEFAULT 0,
1301
- cache_read_tokens INTEGER NOT NULL DEFAULT 0,
1302
- usage_extra_json TEXT,
1303
- cost_usd_raw REAL
1304
- );
1305
- CREATE INDEX IF NOT EXISTS idx_entries_timestamp
1306
- ON session_entries(timestamp_utc);
1307
- CREATE INDEX IF NOT EXISTS idx_entries_source
1308
- ON session_entries(source_path);
1309
- CREATE UNIQUE INDEX IF NOT EXISTS idx_entries_dedup
1310
- ON session_entries(msg_id, req_id)
1311
- WHERE msg_id IS NOT NULL AND req_id IS NOT NULL;
1312
-
1313
- CREATE TABLE IF NOT EXISTS codex_session_files (
1314
- path TEXT PRIMARY KEY,
1315
- size_bytes INTEGER NOT NULL,
1316
- mtime_ns INTEGER NOT NULL,
1317
- last_byte_offset INTEGER NOT NULL,
1318
- last_ingested_at TEXT NOT NULL,
1319
- last_session_id TEXT,
1320
- last_model TEXT
1321
- );
1322
- CREATE TABLE IF NOT EXISTS codex_session_entries (
1323
- id INTEGER PRIMARY KEY AUTOINCREMENT,
1324
- source_path TEXT NOT NULL,
1325
- line_offset INTEGER NOT NULL,
1326
- timestamp_utc TEXT NOT NULL,
1327
- session_id TEXT NOT NULL,
1328
- model TEXT NOT NULL,
1329
- input_tokens INTEGER NOT NULL DEFAULT 0,
1330
- cached_input_tokens INTEGER NOT NULL DEFAULT 0,
1331
- output_tokens INTEGER NOT NULL DEFAULT 0,
1332
- reasoning_output_tokens INTEGER NOT NULL DEFAULT 0,
1333
- total_tokens INTEGER NOT NULL DEFAULT 0,
1334
- UNIQUE(source_path, line_offset)
1335
- );
1336
- CREATE INDEX IF NOT EXISTS idx_codex_entries_timestamp
1337
- ON codex_session_entries(timestamp_utc);
1338
- CREATE INDEX IF NOT EXISTS idx_codex_entries_session
1339
- ON codex_session_entries(session_id);
1340
- CREATE INDEX IF NOT EXISTS idx_codex_entries_source
1341
- ON codex_session_entries(source_path);
1342
- """
1343
- )
1344
-
1345
- # Inline migration: add session_id / project_path columns to session_files
1346
- # if they're missing. These were added for A2 `session` subcommand metadata;
1347
- # populated lazily in sync_cache() / _ensure_session_files_row().
1348
- add_column_if_missing(conn, "session_files", "session_id", "TEXT")
1349
- add_column_if_missing(conn, "session_files", "project_path", "TEXT")
1350
- conn.execute(
1351
- "CREATE INDEX IF NOT EXISTS idx_session_files_session_id "
1352
- "ON session_files(session_id)"
1353
- )
1571
+ # Apply the shared cache.db schema (cctally-dev#93, D4): Claude tables +
1572
+ # indexes, the session_id / project_path column adds on session_files
1573
+ # (A2 `session` metadata, populated lazily in sync_cache() /
1574
+ # _ensure_session_files_row()), the Codex base tables + indexes, and the
1575
+ # cache_meta sentinel table. This is the single cache.db schema source —
1576
+ # the eager-apply path (_eagerly_apply_cache_migrations) uses the SAME
1577
+ # helper, so the two can no longer drift. The Codex last_total_tokens
1578
+ # ALTER + purge stays below (out of the shared helper — D4/P1#3).
1579
+ _cctally_db_sib._apply_cache_schema(conn)
1354
1580
 
1355
1581
  # Migration: add last_total_tokens to codex_session_files. When the column
1356
1582
  # is newly added (i.e. this is the first run after upgrade), purge the
@@ -1409,7 +1635,7 @@ def cmd_cache_sync(args: argparse.Namespace) -> int:
1409
1635
  f"[cache-sync] claude done: {stats.files_processed} processed, "
1410
1636
  f"{stats.files_skipped_unchanged} skipped, "
1411
1637
  f"{stats.files_reset_truncated} reset, "
1412
- f"{stats.rows_inserted} rows inserted"
1638
+ f"{stats.rows_changed} rows changed"
1413
1639
  )
1414
1640
 
1415
1641
  if source in ("codex", "all"):
@@ -1427,7 +1653,7 @@ def cmd_cache_sync(args: argparse.Namespace) -> int:
1427
1653
  f"[cache-sync] codex done: {stats.files_processed} processed, "
1428
1654
  f"{stats.files_skipped_unchanged} skipped, "
1429
1655
  f"{stats.files_reset_truncated} reset, "
1430
- f"{stats.rows_inserted} rows inserted"
1656
+ f"{stats.rows_changed} rows changed"
1431
1657
  )
1432
1658
 
1433
1659
  return 0