cctally 1.11.1 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -165,6 +165,7 @@ _CodexIterState = _lib_jsonl._CodexIterState
165
165
  _iter_jsonl_entries_with_offsets = _lib_jsonl._iter_jsonl_entries_with_offsets
166
166
  _iter_codex_jsonl_entries_with_offsets = _lib_jsonl._iter_codex_jsonl_entries_with_offsets
167
167
  _parse_usage_entries = _lib_jsonl._parse_usage_entries
168
+ _should_replace = _lib_jsonl._should_replace
168
169
 
169
170
  _cctally_db_sib = _load_lib("_cctally_db")
170
171
  add_column_if_missing = _cctally_db_sib.add_column_if_missing
@@ -304,7 +305,16 @@ class IngestStats:
304
305
  files_processed: int = 0
305
306
  files_skipped_unchanged: int = 0
306
307
  files_reset_truncated: int = 0
307
- rows_inserted: int = 0
308
+ # Count of session_entries rows written by this sync — both genuinely-
309
+ # new INSERTs and ccusage-parity ON CONFLICT DO UPDATE replacements
310
+ # (the dedup tiebreaker swaps a streaming-intermediate row for the
311
+ # post-stream finalization). SQLite's `total_changes` counter
312
+ # increments on both, so this field is "rows changed", not "rows
313
+ # newly inserted". Pre-dedup builds used INSERT OR IGNORE where
314
+ # conflicts did NOT bump the counter; the name change preserves the
315
+ # observability metric without misrepresenting UPSERT updates as
316
+ # new inserts.
317
+ rows_changed: int = 0
308
318
  lock_contended: bool = False
309
319
 
310
320
 
@@ -314,7 +324,7 @@ def _progress_stderr(stats: IngestStats, *, force: bool = False) -> None:
314
324
  return
315
325
  eprint(
316
326
  f"[cache-sync] {stats.files_processed}/{stats.files_total} files, "
317
- f"{stats.rows_inserted} new rows"
327
+ f"{stats.rows_changed} rows changed"
318
328
  )
319
329
 
320
330
 
@@ -433,6 +443,30 @@ def sync_cache(
433
443
  stats.lock_contended = True
434
444
  return stats
435
445
 
446
+ # Walk-complete sentinel gating (cctally-dev#93, D5b/D6b). Capture
447
+ # whether cache 001 was already applied at the moment this sync
448
+ # acquired the lock. The end-of-loop marker write is gated on this so
449
+ # a walk whose baseline predates the 001 wipe (the "straddle" run)
450
+ # withholds the marker — it cannot vouch for a cache 001 wiped
451
+ # underneath it. On the normal first-upgrade flow open_cache_db runs
452
+ # the dispatcher (001 applies in-process) BEFORE sync_cache is ever
453
+ # called, so this is True and the marker is written as expected. If
454
+ # schema_migrations doesn't exist yet, treat as not-applied (False).
455
+ try:
456
+ applied_at_start = conn.execute(
457
+ "SELECT 1 FROM schema_migrations WHERE name='001_dedup_highest_wins'"
458
+ ).fetchone() is not None
459
+ except sqlite3.OperationalError:
460
+ applied_at_start = False
461
+
462
+ # Tracks whether every file in this walk was either ingested cleanly
463
+ # or confirmed-current. Any per-file error-skip (stat/read failure or
464
+ # a DB error that rolls back + continues) flips it False so the marker
465
+ # is withheld — an incomplete walk must not look complete. The
466
+ # unchanged-file early-exit (`size == prev_size`) does NOT flip it: a
467
+ # confirmed-current file still counts as walked.
468
+ walk_clean = True
469
+
436
470
  if rebuild:
437
471
  # Clear INSIDE the lock — a concurrent rebuild that lost the
438
472
  # race would otherwise have wiped this cache before bailing,
@@ -441,6 +475,11 @@ def sync_cache(
441
475
  # empty baseline.
442
476
  conn.execute("DELETE FROM session_entries")
443
477
  conn.execute("DELETE FROM session_files")
478
+ # Clear the walk-complete sentinel atomically with the wipe
479
+ # (cctally-dev#93, D5/D2): a stale "complete" marker must never
480
+ # survive a destructive rebuild. The end-of-loop write below
481
+ # re-establishes it only after this rebuild's clean walk.
482
+ conn.execute("DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'")
444
483
  conn.commit()
445
484
  eprint("[cache-sync] rebuild: cleared Claude cached entries")
446
485
 
@@ -464,6 +503,120 @@ def sync_cache(
464
503
  )
465
504
  }
466
505
 
506
+ # Orphaned-tracked-file detection (cctally-dev#93 review). A path
507
+ # tracked in session_files (with data already ingested) but no
508
+ # longer present on disk leaves orphaned session_entries rows that
509
+ # the per-file loop below never visits — it iterates only on-disk
510
+ # `paths`. sync_cache deliberately does NOT prune those orphans
511
+ # in-place: a deleted file shares the truncation hazard (under the
512
+ # sticky source_path dedup a surviving file may carry the same
513
+ # (msg_id, req_id) yet keep its size_bytes, so a per-orphan DELETE
514
+ # could drop a row the survivor still owns without re-ingesting
515
+ # it), and a blanket full-reset would wrongly fire on the
516
+ # legitimate "cache seeded with synthetic source paths" fixture
517
+ # pattern. Instead we INVALIDATE the walk-complete marker: an
518
+ # orphaned cache no longer faithfully mirrors disk, so it is — by
519
+ # the marker's own definition — not a complete walk. We must
520
+ # actively DELETE any marker a PRIOR clean walk left behind (not
521
+ # merely withhold THIS run's end-of-loop rewrite — that rewrite is
522
+ # gated on walk_clean, but a stale marker from a previous sync
523
+ # would otherwise survive and keep vouching for completeness).
524
+ # Setting walk_clean=False additionally suppresses the end-of-loop
525
+ # rewrite so the marker stays absent for this run. With the marker
526
+ # gone the upgrade gate DEFERs the 008/009/010 recomputes (rather
527
+ # than certifying aggregates that still include data from files no
528
+ # longer on disk); the operator clears the orphans by running
529
+ # `cache-sync --rebuild` (the documented re-derive path), which
530
+ # re-establishes the marker. Only paths whose row carried ingested
531
+ # bytes (size_bytes > 0) count — a size_bytes=0 row holds no
532
+ # session_entries, so its absence leaves no orphan. The DELETE +
533
+ # commit lands BEFORE the per-file read+parse loop, so no write
534
+ # lock is held into that loop (same discipline as the truncation
535
+ # escalation just below).
536
+ on_disk_paths = {str(jp) for jp in paths}
537
+ orphaned_tracked_paths = [
538
+ p for p, (size_bytes, _, _) in existing.items()
539
+ if size_bytes and p not in on_disk_paths
540
+ ]
541
+ if orphaned_tracked_paths:
542
+ eprint(
543
+ f"[cache] {len(orphaned_tracked_paths)} tracked file(s) no "
544
+ f"longer on disk; invalidating walk-complete marker "
545
+ f"(run `cache-sync --rebuild` to prune orphaned entries)"
546
+ )
547
+ conn.execute(
548
+ "DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'"
549
+ )
550
+ conn.commit()
551
+ walk_clean = False # orphaned rows -> cache doesn't mirror disk (D5a)
552
+
553
+ # Pre-scan for any truncation among tracked files. Under the
554
+ # ccusage-parity ON CONFLICT DO UPDATE, source_path is PINNED to
555
+ # whichever file first inserted a (msg_id, req_id) row (see U1
556
+ # in this file). Later UPSERTs from a DIFFERENT file may have
557
+ # updated the token columns on that row while leaving source_path
558
+ # pointing at the original (now possibly truncated) file. A
559
+ # naive per-file truncation path then deletes by source_path and
560
+ # loses data the other file is still carrying — but that other
561
+ # file's `size_bytes` is unchanged, so the per-file early-exit
562
+ # at `if size == prev_size: continue` skips its re-ingest.
563
+ #
564
+ # Escalation: when any file's size has shrunk, drop the entire
565
+ # session_entries cache and force every file to re-ingest from
566
+ # offset 0. The cache is fully re-derivable, this is rare (only
567
+ # on JSONL rotation / manual edits), and it sidesteps the
568
+ # per-key contributing-file bookkeeping that would otherwise be
569
+ # required. The lock is already held, so this is atomic with
570
+ # the subsequent per-file ingest.
571
+ truncated_paths: set[str] = set()
572
+ for jp in paths:
573
+ prev = existing.get(str(jp))
574
+ if prev is None:
575
+ continue
576
+ try:
577
+ st = jp.stat()
578
+ except OSError:
579
+ continue
580
+ if st.st_size < prev[0]:
581
+ truncated_paths.add(str(jp))
582
+
583
+ if truncated_paths:
584
+ eprint(
585
+ f"[cache-sync] truncation detected on {len(truncated_paths)} "
586
+ f"file(s) — re-ingesting all files (safe under ccusage-parity "
587
+ f"dedup)"
588
+ )
589
+ conn.execute("DELETE FROM session_entries")
590
+ # Clear the walk-complete sentinel atomically with the truncation
591
+ # full-reset (cctally-dev#93, D5/D2): the cache is being wiped, so
592
+ # any "complete" marker is now stale. The end-of-loop write below
593
+ # re-establishes it only after this run's clean re-ingest walk.
594
+ conn.execute("DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'")
595
+ # Crash-safety: also clear session_files's size/offset tracking
596
+ # so a partial-state recovery on the NEXT sync forces every
597
+ # file's per-file branch to take the fresh-ingest path. Without
598
+ # this, if the process is killed (kill -9, power loss) between
599
+ # this DELETE commit and the per-file re-ingest commits below,
600
+ # the next sync would only re-detect the originally-truncated
601
+ # file(s); other files still have matching size_bytes and the
602
+ # `if size == prev_size: continue` early-exit would leave them
603
+ # missing from session_entries until file size changes or an
604
+ # operator runs `cache-sync --rebuild`. UPDATE (not DELETE)
605
+ # preserves session_id / project_path columns lazy-backfilled
606
+ # by _ensure_session_files_row (used by the `session`
607
+ # subcommand's JOIN).
608
+ conn.execute(
609
+ "UPDATE session_files SET size_bytes = 0, last_byte_offset = 0"
610
+ )
611
+ conn.commit()
612
+ stats.files_reset_truncated += len(truncated_paths)
613
+ # Force every file to re-ingest from offset 0: clearing the
614
+ # `existing` map makes `prev is None` true downstream, so the
615
+ # per-file branch takes the fresh-ingest path (start_offset=0,
616
+ # truncated=False since we already wiped the table above —
617
+ # avoids a redundant per-file DELETE that would be a no-op).
618
+ existing = {}
619
+
467
620
  for jp in paths:
468
621
  path_str = str(jp)
469
622
  # Backfill session_id/project_path for A2 `session` subcommand.
@@ -477,6 +630,7 @@ def sync_cache(
477
630
  st = jp.stat()
478
631
  except OSError as exc:
479
632
  eprint(f"[cache] stat failed for {jp}: {exc}")
633
+ walk_clean = False # skipped a file without ingesting (D5a)
480
634
  continue
481
635
 
482
636
  size = st.st_size
@@ -535,6 +689,7 @@ def sync_cache(
535
689
  final_offset = fh.tell()
536
690
  except OSError as exc:
537
691
  eprint(f"[cache] could not read {jp}: {exc}")
692
+ walk_clean = False # skipped a file without ingesting (D5a)
538
693
  continue
539
694
 
540
695
  # Python's sqlite3 module starts an implicit transaction on the
@@ -552,16 +707,65 @@ def sync_cache(
552
707
  stats.files_reset_truncated += 1
553
708
  if rows:
554
709
  before = conn.total_changes
710
+ # ccusage-parity ON CONFLICT DO UPDATE: higher-token total
711
+ # wins on conflict; speed-set breaks ties. The partial
712
+ # UNIQUE index `idx_entries_dedup` restricts the conflict
713
+ # target to (msg_id IS NOT NULL AND req_id IS NOT NULL),
714
+ # so the WHERE clause on the conflict target MUST repeat
715
+ # that predicate verbatim — bare `ON CONFLICT(msg_id,
716
+ # req_id)` raises OperationalError. NULL-keyed rows fall
717
+ # through to a plain INSERT, unchanged.
718
+ #
719
+ # `source_path` is INTENTIONALLY OMITTED from the DO
720
+ # UPDATE SET clause: it stays pinned to whichever JSONL
721
+ # FIRST INSERTed the (msg_id, req_id) row. The
722
+ # downstream `LEFT JOIN session_files ON sf.path =
723
+ # se.source_path` uses source_path to attribute tokens
724
+ # to a `project_path`. If a later UPSERT from a
725
+ # different file flipped source_path, the row's
726
+ # project attribution would move with the winner —
727
+ # `cctally project` would mis-aggregate. Sticky
728
+ # source_path matches pre-dedup INSERT OR IGNORE
729
+ # behavior and the operator's mental model.
730
+ # (`line_offset` is similarly sticky for the same
731
+ # reason — the offset only makes sense within the
732
+ # file that originally wrote the row.)
555
733
  conn.executemany(
556
- """INSERT OR IGNORE INTO session_entries
734
+ """INSERT INTO session_entries
557
735
  (source_path, line_offset, timestamp_utc, model,
558
736
  msg_id, req_id, input_tokens, output_tokens,
559
737
  cache_create_tokens, cache_read_tokens,
560
738
  usage_extra_json, cost_usd_raw)
561
- VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
739
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
740
+ ON CONFLICT(msg_id, req_id)
741
+ WHERE msg_id IS NOT NULL AND req_id IS NOT NULL
742
+ DO UPDATE SET
743
+ timestamp_utc = excluded.timestamp_utc,
744
+ model = excluded.model,
745
+ input_tokens = excluded.input_tokens,
746
+ output_tokens = excluded.output_tokens,
747
+ cache_create_tokens = excluded.cache_create_tokens,
748
+ cache_read_tokens = excluded.cache_read_tokens,
749
+ usage_extra_json = excluded.usage_extra_json,
750
+ cost_usd_raw = excluded.cost_usd_raw
751
+ WHERE
752
+ (excluded.input_tokens + excluded.output_tokens
753
+ + excluded.cache_create_tokens + excluded.cache_read_tokens)
754
+ >
755
+ (session_entries.input_tokens + session_entries.output_tokens
756
+ + session_entries.cache_create_tokens + session_entries.cache_read_tokens)
757
+ OR (
758
+ (excluded.input_tokens + excluded.output_tokens
759
+ + excluded.cache_create_tokens + excluded.cache_read_tokens)
760
+ =
761
+ (session_entries.input_tokens + session_entries.output_tokens
762
+ + session_entries.cache_create_tokens + session_entries.cache_read_tokens)
763
+ AND json_extract(excluded.usage_extra_json, '$.speed') IS NOT NULL
764
+ AND json_extract(session_entries.usage_extra_json, '$.speed') IS NULL
765
+ )""",
562
766
  rows,
563
767
  )
564
- stats.rows_inserted += conn.total_changes - before
768
+ stats.rows_changed += conn.total_changes - before
565
769
  # UPSERT preserves session_id / project_path columns populated
566
770
  # by _ensure_session_files_row at the top of this loop. A plain
567
771
  # INSERT OR REPLACE would wipe them on every changed-file sync.
@@ -584,6 +788,7 @@ def sync_cache(
584
788
  except sqlite3.DatabaseError as exc:
585
789
  eprint(f"[cache] db error on {jp}: {exc}")
586
790
  conn.rollback()
791
+ walk_clean = False # rolled back this file without ingesting (D5a)
587
792
  continue
588
793
 
589
794
  if progress is not None:
@@ -591,6 +796,22 @@ def sync_cache(
591
796
 
592
797
  if progress is not None:
593
798
  progress(stats)
799
+
800
+ # Walk-complete sentinel write (cctally-dev#93, D5a). Still inside the
801
+ # held fcntl lock, before the finally-unlock. Only when the entire walk
802
+ # was clean AND cache 001 was already applied at the start of this run
803
+ # (D5b): an unclean walk or a straddle run must not vouch for cache
804
+ # completeness. A lock-contended sync returned early above and never
805
+ # reaches here. Presence (not the timestamp) is the gate signal; the
806
+ # value stores the completion instant for doctor/debugging.
807
+ if walk_clean and applied_at_start:
808
+ conn.execute(
809
+ "INSERT INTO cache_meta(key, value) "
810
+ "VALUES('claude_ingest_walk_complete', ?) "
811
+ "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
812
+ (dt.datetime.now(dt.timezone.utc).isoformat(),),
813
+ )
814
+ conn.commit()
594
815
  return stats
595
816
  finally:
596
817
  try:
@@ -664,15 +885,26 @@ def _collect_entries_direct(
664
885
  *,
665
886
  project: str | None = None,
666
887
  ) -> list[UsageEntry]:
667
- """Legacy direct-parse fallback used when the cache DB can't be opened."""
888
+ """Legacy direct-parse fallback used when the cache DB can't be opened.
889
+
890
+ Uses the ccusage-parity dict-keyed accumulator: dedup-keyed entries
891
+ live in `dedupe_map` and are tiebroken via `_should_replace` (higher
892
+ token total wins, speed-set breaks ties). Entries with NULL msg_id or
893
+ req_id bypass the map and land verbatim — partial UNIQUE index on the
894
+ cache mirrors this behavior. Flattened + sorted once at the end.
895
+ """
668
896
  files = _discover_session_files(range_start, project=project)
669
- seen_hashes: set[str] = set()
670
- entries: list[UsageEntry] = []
897
+ dedupe_map: dict[str, UsageEntry] = {}
898
+ no_key: list[UsageEntry] = []
671
899
  for fp in files:
672
- entries.extend(
673
- _parse_usage_entries(fp, range_start, range_end, seen_hashes=seen_hashes)
900
+ no_key.extend(
901
+ _parse_usage_entries(
902
+ fp, range_start, range_end, dedupe_map=dedupe_map,
903
+ )
674
904
  )
675
- return entries
905
+ all_entries = list(dedupe_map.values()) + no_key
906
+ all_entries.sort(key=lambda e: e.timestamp)
907
+ return all_entries
676
908
 
677
909
 
678
910
  # === Region 4: _JoinedClaudeEntry + get_claude_session_entries (was bin/cctally:2478-2668) ===
@@ -808,10 +1040,23 @@ def _direct_parse_claude_session_entries(
808
1040
  scan the file for the first `sessionId` / `cwd` value, else fall
809
1041
  back to the filename UUID and the decoded-escaped parent directory
810
1042
  — same logic as `_ensure_session_files_row`.
1043
+
1044
+ Uses the ccusage-parity dict-keyed accumulator. Each per-file parse
1045
+ contributes into a global `(entry, source_path)` map keyed by
1046
+ `msg_id:req_id`; ties broken by `_should_replace`. NULL-keyed entries
1047
+ bypass dedup. After all files are walked, results are stamped with
1048
+ their owning file's session_id/cwd metadata and emitted in
1049
+ timestamp order.
811
1050
  """
812
- results: list[_JoinedClaudeEntry] = []
813
1051
  files = _discover_session_files(range_start, project=project)
814
- seen_hashes: set[str] = set()
1052
+
1053
+ # File metadata: source_path -> (session_id, project_path/cwd).
1054
+ meta_by_path: dict[str, tuple[str, str]] = {}
1055
+
1056
+ # Global accumulator: (msg_id:req_id) -> (UsageEntry, source_path).
1057
+ dedupe_map: dict[str, tuple[UsageEntry, str]] = {}
1058
+ # Null-key entries (rare; same as the cache's partial-index fallthrough).
1059
+ no_key_with_meta: list[tuple[UsageEntry, str]] = []
815
1060
 
816
1061
  for fp in files:
817
1062
  source_path = str(fp)
@@ -846,27 +1091,67 @@ def _direct_parse_claude_session_entries(
846
1091
  session_id = os.path.splitext(os.path.basename(source_path))[0]
847
1092
  if cwd is None:
848
1093
  cwd = _decode_escaped_cwd(os.path.basename(os.path.dirname(source_path)))
1094
+ meta_by_path[source_path] = (session_id, cwd)
1095
+
1096
+ # Parse this file with a fresh per-file dedupe_map so we can attach
1097
+ # the source_path provenance to whatever wins this file's local
1098
+ # contests. Then merge into the global map using the same
1099
+ # `_should_replace` rule. (A shared dedupe_map across files would
1100
+ # lose the source_path of the winning entry — _parse_usage_entries
1101
+ # has no awareness of per-file metadata.)
1102
+ file_dedupe_map: dict[str, UsageEntry] = {}
1103
+ file_no_key = _parse_usage_entries(
1104
+ fp, range_start, range_end, dedupe_map=file_dedupe_map,
1105
+ )
849
1106
 
850
- for entry in _parse_usage_entries(
851
- fp, range_start, range_end, seen_hashes=seen_hashes
852
- ):
853
- usage = entry.usage
854
- results.append(_JoinedClaudeEntry(
855
- timestamp=entry.timestamp,
856
- model=entry.model,
857
- input_tokens=int(usage.get("input_tokens", 0) or 0),
858
- output_tokens=int(usage.get("output_tokens", 0) or 0),
859
- cache_creation_tokens=int(
860
- usage.get("cache_creation_input_tokens", 0) or 0
861
- ),
862
- cache_read_tokens=int(
863
- usage.get("cache_read_input_tokens", 0) or 0
864
- ),
865
- source_path=source_path,
866
- session_id=session_id,
867
- project_path=cwd,
868
- cost_usd=entry.cost_usd,
869
- ))
1107
+ # Merge file-local no-key entries directly (no dedup contest).
1108
+ for entry in file_no_key:
1109
+ no_key_with_meta.append((entry, source_path))
1110
+
1111
+ # Merge file-local dedup-keyed entries into the global map.
1112
+ # Same tiebreaker as the cache's ON CONFLICT DO UPDATE clause:
1113
+ # higher-token total wins the entry DATA. But `source_path` is
1114
+ # STICKY to whichever file FIRST contributed the key — it is NOT
1115
+ # flipped to the winner. This mirrors the cache ingest path, where
1116
+ # `source_path` is intentionally OMITTED from the ON CONFLICT DO
1117
+ # UPDATE SET clause (see the UPSERT in sync_cache above) so the
1118
+ # downstream `LEFT JOIN session_files ON sf.path = se.source_path`
1119
+ # attributes tokens to the project of the file that first wrote the
1120
+ # row. Replacing it here would move project attribution to the
1121
+ # winner's file — `cctally project` (and any session_files join)
1122
+ # would then disagree with the normal cached behavior exactly when
1123
+ # this fallback path is exercised.
1124
+ for key, entry in file_dedupe_map.items():
1125
+ existing = dedupe_map.get(key)
1126
+ if existing is None:
1127
+ dedupe_map[key] = (entry, source_path)
1128
+ elif _should_replace(entry, existing[0]):
1129
+ # Winner's DATA, first contributor's source_path (sticky).
1130
+ dedupe_map[key] = (entry, existing[1])
1131
+
1132
+ # Flatten + emit.
1133
+ results: list[_JoinedClaudeEntry] = []
1134
+ flat: list[tuple[UsageEntry, str]] = list(dedupe_map.values()) + no_key_with_meta
1135
+ flat.sort(key=lambda pair: pair[0].timestamp)
1136
+ for entry, source_path in flat:
1137
+ usage = entry.usage
1138
+ sid, cwd = meta_by_path[source_path]
1139
+ results.append(_JoinedClaudeEntry(
1140
+ timestamp=entry.timestamp,
1141
+ model=entry.model,
1142
+ input_tokens=int(usage.get("input_tokens", 0) or 0),
1143
+ output_tokens=int(usage.get("output_tokens", 0) or 0),
1144
+ cache_creation_tokens=int(
1145
+ usage.get("cache_creation_input_tokens", 0) or 0
1146
+ ),
1147
+ cache_read_tokens=int(
1148
+ usage.get("cache_read_input_tokens", 0) or 0
1149
+ ),
1150
+ source_path=source_path,
1151
+ session_id=sid,
1152
+ project_path=cwd,
1153
+ cost_usd=entry.cost_usd,
1154
+ ))
870
1155
 
871
1156
  return results
872
1157
 
@@ -880,7 +1165,13 @@ class CodexIngestStats:
880
1165
  files_processed: int = 0
881
1166
  files_skipped_unchanged: int = 0
882
1167
  files_reset_truncated: int = 0
883
- rows_inserted: int = 0
1168
+ # Count of codex_session_entries rows written by this sync. Codex
1169
+ # ingest uses INSERT OR IGNORE — ignored conflicts do NOT bump
1170
+ # SQLite's `total_changes`, so this number is effectively "rows
1171
+ # newly inserted". Field is named ``rows_changed`` for parity with
1172
+ # ``IngestStats`` (Claude path) which carries an UPSERT and
1173
+ # therefore counts both new INSERTs and DO UPDATE replacements.
1174
+ rows_changed: int = 0
884
1175
  lock_contended: bool = False
885
1176
 
886
1177
 
@@ -890,7 +1181,7 @@ def _progress_codex_stderr(stats: CodexIngestStats, *, force: bool = False) -> N
890
1181
  return
891
1182
  eprint(
892
1183
  f"[codex-cache] {stats.files_processed}/{stats.files_total} files, "
893
- f"{stats.rows_inserted} new rows"
1184
+ f"{stats.rows_changed} rows changed"
894
1185
  )
895
1186
 
896
1187
 
@@ -1095,7 +1386,7 @@ def sync_codex_cache(
1095
1386
  VALUES (?,?,?,?,?,?,?,?,?,?)""",
1096
1387
  rows,
1097
1388
  )
1098
- stats.rows_inserted += conn.total_changes - before
1389
+ stats.rows_changed += conn.total_changes - before
1099
1390
  conn.execute(
1100
1391
  """INSERT OR REPLACE INTO codex_session_files
1101
1392
  (path, size_bytes, mtime_ns, last_byte_offset,
@@ -1277,79 +1568,15 @@ def open_cache_db() -> sqlite3.Connection:
1277
1568
  conn.execute("PRAGMA journal_mode=WAL")
1278
1569
  conn.execute("PRAGMA busy_timeout=5000")
1279
1570
 
1280
- conn.executescript(
1281
- """
1282
- CREATE TABLE IF NOT EXISTS session_files (
1283
- path TEXT PRIMARY KEY,
1284
- size_bytes INTEGER NOT NULL,
1285
- mtime_ns INTEGER NOT NULL,
1286
- last_byte_offset INTEGER NOT NULL,
1287
- last_ingested_at TEXT NOT NULL
1288
- );
1289
- CREATE TABLE IF NOT EXISTS session_entries (
1290
- id INTEGER PRIMARY KEY AUTOINCREMENT,
1291
- source_path TEXT NOT NULL,
1292
- line_offset INTEGER NOT NULL,
1293
- timestamp_utc TEXT NOT NULL,
1294
- model TEXT NOT NULL,
1295
- msg_id TEXT,
1296
- req_id TEXT,
1297
- input_tokens INTEGER NOT NULL DEFAULT 0,
1298
- output_tokens INTEGER NOT NULL DEFAULT 0,
1299
- cache_create_tokens INTEGER NOT NULL DEFAULT 0,
1300
- cache_read_tokens INTEGER NOT NULL DEFAULT 0,
1301
- usage_extra_json TEXT,
1302
- cost_usd_raw REAL
1303
- );
1304
- CREATE INDEX IF NOT EXISTS idx_entries_timestamp
1305
- ON session_entries(timestamp_utc);
1306
- CREATE INDEX IF NOT EXISTS idx_entries_source
1307
- ON session_entries(source_path);
1308
- CREATE UNIQUE INDEX IF NOT EXISTS idx_entries_dedup
1309
- ON session_entries(msg_id, req_id)
1310
- WHERE msg_id IS NOT NULL AND req_id IS NOT NULL;
1311
-
1312
- CREATE TABLE IF NOT EXISTS codex_session_files (
1313
- path TEXT PRIMARY KEY,
1314
- size_bytes INTEGER NOT NULL,
1315
- mtime_ns INTEGER NOT NULL,
1316
- last_byte_offset INTEGER NOT NULL,
1317
- last_ingested_at TEXT NOT NULL,
1318
- last_session_id TEXT,
1319
- last_model TEXT
1320
- );
1321
- CREATE TABLE IF NOT EXISTS codex_session_entries (
1322
- id INTEGER PRIMARY KEY AUTOINCREMENT,
1323
- source_path TEXT NOT NULL,
1324
- line_offset INTEGER NOT NULL,
1325
- timestamp_utc TEXT NOT NULL,
1326
- session_id TEXT NOT NULL,
1327
- model TEXT NOT NULL,
1328
- input_tokens INTEGER NOT NULL DEFAULT 0,
1329
- cached_input_tokens INTEGER NOT NULL DEFAULT 0,
1330
- output_tokens INTEGER NOT NULL DEFAULT 0,
1331
- reasoning_output_tokens INTEGER NOT NULL DEFAULT 0,
1332
- total_tokens INTEGER NOT NULL DEFAULT 0,
1333
- UNIQUE(source_path, line_offset)
1334
- );
1335
- CREATE INDEX IF NOT EXISTS idx_codex_entries_timestamp
1336
- ON codex_session_entries(timestamp_utc);
1337
- CREATE INDEX IF NOT EXISTS idx_codex_entries_session
1338
- ON codex_session_entries(session_id);
1339
- CREATE INDEX IF NOT EXISTS idx_codex_entries_source
1340
- ON codex_session_entries(source_path);
1341
- """
1342
- )
1343
-
1344
- # Inline migration: add session_id / project_path columns to session_files
1345
- # if they're missing. These were added for A2 `session` subcommand metadata;
1346
- # populated lazily in sync_cache() / _ensure_session_files_row().
1347
- add_column_if_missing(conn, "session_files", "session_id", "TEXT")
1348
- add_column_if_missing(conn, "session_files", "project_path", "TEXT")
1349
- conn.execute(
1350
- "CREATE INDEX IF NOT EXISTS idx_session_files_session_id "
1351
- "ON session_files(session_id)"
1352
- )
1571
+ # Apply the shared cache.db schema (cctally-dev#93, D4): Claude tables +
1572
+ # indexes, the session_id / project_path column adds on session_files
1573
+ # (A2 `session` metadata, populated lazily in sync_cache() /
1574
+ # _ensure_session_files_row()), the Codex base tables + indexes, and the
1575
+ # cache_meta sentinel table. This is the single cache.db schema source —
1576
+ # the eager-apply path (_eagerly_apply_cache_migrations) uses the SAME
1577
+ # helper, so the two can no longer drift. The Codex last_total_tokens
1578
+ # ALTER + purge stays below (out of the shared helper — D4/P1#3).
1579
+ _cctally_db_sib._apply_cache_schema(conn)
1353
1580
 
1354
1581
  # Migration: add last_total_tokens to codex_session_files. When the column
1355
1582
  # is newly added (i.e. this is the first run after upgrade), purge the
@@ -1408,7 +1635,7 @@ def cmd_cache_sync(args: argparse.Namespace) -> int:
1408
1635
  f"[cache-sync] claude done: {stats.files_processed} processed, "
1409
1636
  f"{stats.files_skipped_unchanged} skipped, "
1410
1637
  f"{stats.files_reset_truncated} reset, "
1411
- f"{stats.rows_inserted} rows inserted"
1638
+ f"{stats.rows_changed} rows changed"
1412
1639
  )
1413
1640
 
1414
1641
  if source in ("codex", "all"):
@@ -1426,7 +1653,7 @@ def cmd_cache_sync(args: argparse.Namespace) -> int:
1426
1653
  f"[cache-sync] codex done: {stats.files_processed} processed, "
1427
1654
  f"{stats.files_skipped_unchanged} skipped, "
1428
1655
  f"{stats.files_reset_truncated} reset, "
1429
- f"{stats.rows_inserted} rows inserted"
1656
+ f"{stats.rows_changed} rows changed"
1430
1657
  )
1431
1658
 
1432
1659
  return 0
@@ -67,6 +67,7 @@ def _init_paths_from_env() -> None:
67
67
  global UPDATE_STATE_PATH, UPDATE_SUPPRESS_PATH
68
68
  global UPDATE_LOCK_PATH, UPDATE_LOG_PATH, UPDATE_LOG_ROTATED_PATH
69
69
  global UPDATE_CHECK_LAST_FETCH_PATH, CLAUDE_SETTINGS_PATH
70
+ global CLAUDE_PROJECTS_DIR
70
71
 
71
72
  home = pathlib.Path.home()
72
73
  APP_DIR = home / ".local" / "share" / "cctally"
@@ -108,10 +109,60 @@ def _init_paths_from_env() -> None:
108
109
 
109
110
  CLAUDE_SETTINGS_PATH = home / ".claude" / "settings.json"
110
111
 
112
+ # Claude session JSONL root. Production path is `~/.claude/projects`;
113
+ # exposed as a module-level constant so cross-DB migrations (e.g.
114
+ # stats migration 008) and the dispatcher's empty-disk fallback can
115
+ # honor a fixture override via tests' `monkeypatch.setattr(
116
+ # _cctally_core, "CLAUDE_PROJECTS_DIR", tmp_path / "...")`. The
117
+ # `_get_claude_data_dirs()` helper in bin/cctally remains the
118
+ # authoritative resolver for ad-hoc reads (multi-root + env-aware);
119
+ # this constant is the single-rooted production default that 99% of
120
+ # callers want. For multi-root, env-aware resolution (mirroring
121
+ # `_get_claude_data_dirs`), use `_resolve_claude_projects_dirs()`.
122
+ CLAUDE_PROJECTS_DIR = home / ".claude" / "projects"
123
+
111
124
 
112
125
  _init_paths_from_env()
113
126
 
114
127
 
128
def _resolve_claude_projects_dirs() -> list[pathlib.Path]:
    """List the Claude Code ``projects/`` directories present on disk.

    Resolution order:

    1. ``CLAUDE_CONFIG_DIR`` — a comma-separated list of Claude data
       roots. Blank items are dropped and surrounding whitespace is
       trimmed. If at least one listed root exists and contains a
       ``projects`` directory, only those ``projects`` paths are
       returned, in the order they were listed.
    2. Otherwise (variable unset, blank, or naming no usable root) the
       defaults ``~/.config/claude`` then ``~/.claude`` are probed the
       same way.

    The returned paths are the ``projects`` subdirectories themselves,
    not their parent data roots. The list is empty when nothing
    qualifies. Duplicated roots are not collapsed.

    Intended as the env-aware, multi-root counterpart of the
    single-rooted ``CLAUDE_PROJECTS_DIR`` constant, so that a gate
    counting JSONL files does not see an empty disk merely because the
    user relocated their Claude data via ``CLAUDE_CONFIG_DIR``.
    """

    def _existing_projects(roots):
        # Keep `<root>/projects` for every root where both the root and
        # its `projects` child are directories; input order is preserved.
        found = []
        for root in roots:
            projects = root / "projects"
            if root.is_dir() and projects.is_dir():
                found.append(projects)
        return found

    raw = os.environ.get("CLAUDE_CONFIG_DIR", "").strip()
    if raw:
        pieces = (piece.strip() for piece in raw.split(","))
        from_env = _existing_projects(
            pathlib.Path(piece) for piece in pieces if piece
        )
        if from_env:
            return from_env

    # Env override absent or unusable: fall back to the stock locations.
    user_home = pathlib.Path.home()
    return _existing_projects(
        (user_home / ".config" / "claude", user_home / ".claude")
    )
164
+
165
+
115
166
  # === Logging =========================================================
116
167
 
117
168