cctally 1.11.1 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +47 -0
- package/bin/_cctally_cache.py +338 -111
- package/bin/_cctally_core.py +51 -0
- package/bin/_cctally_db.py +1654 -5
- package/bin/_cctally_record.py +1 -1
- package/bin/_cctally_setup.py +11 -1
- package/bin/_lib_jsonl.py +80 -16
- package/package.json +1 -1
package/bin/_cctally_cache.py
CHANGED
|
@@ -165,6 +165,7 @@ _CodexIterState = _lib_jsonl._CodexIterState
|
|
|
165
165
|
_iter_jsonl_entries_with_offsets = _lib_jsonl._iter_jsonl_entries_with_offsets
|
|
166
166
|
_iter_codex_jsonl_entries_with_offsets = _lib_jsonl._iter_codex_jsonl_entries_with_offsets
|
|
167
167
|
_parse_usage_entries = _lib_jsonl._parse_usage_entries
|
|
168
|
+
_should_replace = _lib_jsonl._should_replace
|
|
168
169
|
|
|
169
170
|
_cctally_db_sib = _load_lib("_cctally_db")
|
|
170
171
|
add_column_if_missing = _cctally_db_sib.add_column_if_missing
|
|
@@ -304,7 +305,16 @@ class IngestStats:
|
|
|
304
305
|
files_processed: int = 0
|
|
305
306
|
files_skipped_unchanged: int = 0
|
|
306
307
|
files_reset_truncated: int = 0
|
|
307
|
-
|
|
308
|
+
# Count of session_entries rows written by this sync — both genuinely-
|
|
309
|
+
# new INSERTs and ccusage-parity ON CONFLICT DO UPDATE replacements
|
|
310
|
+
# (the dedup tiebreaker swaps a streaming-intermediate row for the
|
|
311
|
+
# post-stream finalization). SQLite's `total_changes` counter
|
|
312
|
+
# increments on both, so this field is "rows changed", not "rows
|
|
313
|
+
# newly inserted". Pre-dedup builds used INSERT OR IGNORE where
|
|
314
|
+
# conflicts did NOT bump the counter; the name change preserves the
|
|
315
|
+
# observability metric without misrepresenting UPSERT updates as
|
|
316
|
+
# new inserts.
|
|
317
|
+
rows_changed: int = 0
|
|
308
318
|
lock_contended: bool = False
|
|
309
319
|
|
|
310
320
|
|
|
@@ -314,7 +324,7 @@ def _progress_stderr(stats: IngestStats, *, force: bool = False) -> None:
|
|
|
314
324
|
return
|
|
315
325
|
eprint(
|
|
316
326
|
f"[cache-sync] {stats.files_processed}/{stats.files_total} files, "
|
|
317
|
-
f"{stats.
|
|
327
|
+
f"{stats.rows_changed} rows changed"
|
|
318
328
|
)
|
|
319
329
|
|
|
320
330
|
|
|
@@ -433,6 +443,30 @@ def sync_cache(
|
|
|
433
443
|
stats.lock_contended = True
|
|
434
444
|
return stats
|
|
435
445
|
|
|
446
|
+
# Walk-complete sentinel gating (cctally-dev#93, D5b/D6b). Capture
|
|
447
|
+
# whether cache 001 was already applied at the moment this sync
|
|
448
|
+
# acquired the lock. The end-of-loop marker write is gated on this so
|
|
449
|
+
# a walk whose baseline predates the 001 wipe (the "straddle" run)
|
|
450
|
+
# withholds the marker — it cannot vouch for a cache 001 wiped
|
|
451
|
+
# underneath it. On the normal first-upgrade flow open_cache_db runs
|
|
452
|
+
# the dispatcher (001 applies in-process) BEFORE sync_cache is ever
|
|
453
|
+
# called, so this is True and the marker is written as expected. If
|
|
454
|
+
# schema_migrations doesn't exist yet, treat as not-applied (False).
|
|
455
|
+
try:
|
|
456
|
+
applied_at_start = conn.execute(
|
|
457
|
+
"SELECT 1 FROM schema_migrations WHERE name='001_dedup_highest_wins'"
|
|
458
|
+
).fetchone() is not None
|
|
459
|
+
except sqlite3.OperationalError:
|
|
460
|
+
applied_at_start = False
|
|
461
|
+
|
|
462
|
+
# Tracks whether every file in this walk was either ingested cleanly
|
|
463
|
+
# or confirmed-current. Any per-file error-skip (stat/read failure or
|
|
464
|
+
# a DB error that rolls back + continues) flips it False so the marker
|
|
465
|
+
# is withheld — an incomplete walk must not look complete. The
|
|
466
|
+
# unchanged-file early-exit (`size == prev_size`) does NOT flip it: a
|
|
467
|
+
# confirmed-current file still counts as walked.
|
|
468
|
+
walk_clean = True
|
|
469
|
+
|
|
436
470
|
if rebuild:
|
|
437
471
|
# Clear INSIDE the lock — a concurrent rebuild that lost the
|
|
438
472
|
# race would otherwise have wiped this cache before bailing,
|
|
@@ -441,6 +475,11 @@ def sync_cache(
|
|
|
441
475
|
# empty baseline.
|
|
442
476
|
conn.execute("DELETE FROM session_entries")
|
|
443
477
|
conn.execute("DELETE FROM session_files")
|
|
478
|
+
# Clear the walk-complete sentinel atomically with the wipe
|
|
479
|
+
# (cctally-dev#93, D5/D2): a stale "complete" marker must never
|
|
480
|
+
# survive a destructive rebuild. The end-of-loop write below
|
|
481
|
+
# re-establishes it only after this rebuild's clean walk.
|
|
482
|
+
conn.execute("DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'")
|
|
444
483
|
conn.commit()
|
|
445
484
|
eprint("[cache-sync] rebuild: cleared Claude cached entries")
|
|
446
485
|
|
|
@@ -464,6 +503,120 @@ def sync_cache(
|
|
|
464
503
|
)
|
|
465
504
|
}
|
|
466
505
|
|
|
506
|
+
# Orphaned-tracked-file detection (cctally-dev#93 review). A path
|
|
507
|
+
# tracked in session_files (with data already ingested) but no
|
|
508
|
+
# longer present on disk leaves orphaned session_entries rows that
|
|
509
|
+
# the per-file loop below never visits — it iterates only on-disk
|
|
510
|
+
# `paths`. sync_cache deliberately does NOT prune those orphans
|
|
511
|
+
# in-place: a deleted file shares the truncation hazard (under the
|
|
512
|
+
# sticky source_path dedup a surviving file may carry the same
|
|
513
|
+
# (msg_id, req_id) yet keep its size_bytes, so a per-orphan DELETE
|
|
514
|
+
# could drop a row the survivor still owns without re-ingesting
|
|
515
|
+
# it), and a blanket full-reset would wrongly fire on the
|
|
516
|
+
# legitimate "cache seeded with synthetic source paths" fixture
|
|
517
|
+
# pattern. Instead we INVALIDATE the walk-complete marker: an
|
|
518
|
+
# orphaned cache no longer faithfully mirrors disk, so it is — by
|
|
519
|
+
# the marker's own definition — not a complete walk. We must
|
|
520
|
+
# actively DELETE any marker a PRIOR clean walk left behind (not
|
|
521
|
+
# merely withhold THIS run's end-of-loop rewrite — that rewrite is
|
|
522
|
+
# gated on walk_clean, but a stale marker from a previous sync
|
|
523
|
+
# would otherwise survive and keep vouching for completeness).
|
|
524
|
+
# Setting walk_clean=False additionally suppresses the end-of-loop
|
|
525
|
+
# rewrite so the marker stays absent for this run. With the marker
|
|
526
|
+
# gone the upgrade gate DEFERs the 008/009/010 recomputes (rather
|
|
527
|
+
# than certifying aggregates that still include data from files no
|
|
528
|
+
# longer on disk); the operator clears the orphans by running
|
|
529
|
+
# `cache-sync --rebuild` (the documented re-derive path), which
|
|
530
|
+
# re-establishes the marker. Only paths whose row carried ingested
|
|
531
|
+
# bytes (size_bytes > 0) count — a size_bytes=0 row holds no
|
|
532
|
+
# session_entries, so its absence leaves no orphan. The DELETE +
|
|
533
|
+
# commit lands BEFORE the per-file read+parse loop, so no write
|
|
534
|
+
# lock is held into that loop (same discipline as the truncation
|
|
535
|
+
# escalation just below).
|
|
536
|
+
on_disk_paths = {str(jp) for jp in paths}
|
|
537
|
+
orphaned_tracked_paths = [
|
|
538
|
+
p for p, (size_bytes, _, _) in existing.items()
|
|
539
|
+
if size_bytes and p not in on_disk_paths
|
|
540
|
+
]
|
|
541
|
+
if orphaned_tracked_paths:
|
|
542
|
+
eprint(
|
|
543
|
+
f"[cache] {len(orphaned_tracked_paths)} tracked file(s) no "
|
|
544
|
+
f"longer on disk; invalidating walk-complete marker "
|
|
545
|
+
f"(run `cache-sync --rebuild` to prune orphaned entries)"
|
|
546
|
+
)
|
|
547
|
+
conn.execute(
|
|
548
|
+
"DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'"
|
|
549
|
+
)
|
|
550
|
+
conn.commit()
|
|
551
|
+
walk_clean = False # orphaned rows -> cache doesn't mirror disk (D5a)
|
|
552
|
+
|
|
553
|
+
# Pre-scan for any truncation among tracked files. Under the
|
|
554
|
+
# ccusage-parity ON CONFLICT DO UPDATE, source_path is PINNED to
|
|
555
|
+
# whichever file first inserted a (msg_id, req_id) row (see U1
|
|
556
|
+
# in this file). Later UPSERTs from a DIFFERENT file may have
|
|
557
|
+
# updated the token columns on that row while leaving source_path
|
|
558
|
+
# pointing at the original (now possibly truncated) file. A
|
|
559
|
+
# naive per-file truncation path then deletes by source_path and
|
|
560
|
+
# loses data the other file is still carrying — but that other
|
|
561
|
+
# file's `size_bytes` is unchanged, so the per-file early-exit
|
|
562
|
+
# at `if size == prev_size: continue` skips its re-ingest.
|
|
563
|
+
#
|
|
564
|
+
# Escalation: when any file's size has shrunk, drop the entire
|
|
565
|
+
# session_entries cache and force every file to re-ingest from
|
|
566
|
+
# offset 0. The cache is fully re-derivable, this is rare (only
|
|
567
|
+
# on JSONL rotation / manual edits), and it sidesteps the
|
|
568
|
+
# per-key contributing-file bookkeeping that would otherwise be
|
|
569
|
+
# required. The lock is already held, so this is atomic with
|
|
570
|
+
# the subsequent per-file ingest.
|
|
571
|
+
truncated_paths: set[str] = set()
|
|
572
|
+
for jp in paths:
|
|
573
|
+
prev = existing.get(str(jp))
|
|
574
|
+
if prev is None:
|
|
575
|
+
continue
|
|
576
|
+
try:
|
|
577
|
+
st = jp.stat()
|
|
578
|
+
except OSError:
|
|
579
|
+
continue
|
|
580
|
+
if st.st_size < prev[0]:
|
|
581
|
+
truncated_paths.add(str(jp))
|
|
582
|
+
|
|
583
|
+
if truncated_paths:
|
|
584
|
+
eprint(
|
|
585
|
+
f"[cache-sync] truncation detected on {len(truncated_paths)} "
|
|
586
|
+
f"file(s) — re-ingesting all files (safe under ccusage-parity "
|
|
587
|
+
f"dedup)"
|
|
588
|
+
)
|
|
589
|
+
conn.execute("DELETE FROM session_entries")
|
|
590
|
+
# Clear the walk-complete sentinel atomically with the truncation
|
|
591
|
+
# full-reset (cctally-dev#93, D5/D2): the cache is being wiped, so
|
|
592
|
+
# any "complete" marker is now stale. The end-of-loop write below
|
|
593
|
+
# re-establishes it only after this run's clean re-ingest walk.
|
|
594
|
+
conn.execute("DELETE FROM cache_meta WHERE key='claude_ingest_walk_complete'")
|
|
595
|
+
# Crash-safety: also clear session_files's size/offset tracking
|
|
596
|
+
# so a partial-state recovery on the NEXT sync forces every
|
|
597
|
+
# file's per-file branch to take the fresh-ingest path. Without
|
|
598
|
+
# this, if the process is killed (kill -9, power loss) between
|
|
599
|
+
# this DELETE commit and the per-file re-ingest commits below,
|
|
600
|
+
# the next sync would only re-detect the originally-truncated
|
|
601
|
+
# file(s); other files still have matching size_bytes and the
|
|
602
|
+
# `if size == prev_size: continue` early-exit would leave them
|
|
603
|
+
# missing from session_entries until file size changes or an
|
|
604
|
+
# operator runs `cache-sync --rebuild`. UPDATE (not DELETE)
|
|
605
|
+
# preserves session_id / project_path columns lazy-backfilled
|
|
606
|
+
# by _ensure_session_files_row (used by the `session`
|
|
607
|
+
# subcommand's JOIN).
|
|
608
|
+
conn.execute(
|
|
609
|
+
"UPDATE session_files SET size_bytes = 0, last_byte_offset = 0"
|
|
610
|
+
)
|
|
611
|
+
conn.commit()
|
|
612
|
+
stats.files_reset_truncated += len(truncated_paths)
|
|
613
|
+
# Force every file to re-ingest from offset 0: clearing the
|
|
614
|
+
# `existing` map makes `prev is None` true downstream, so the
|
|
615
|
+
# per-file branch takes the fresh-ingest path (start_offset=0,
|
|
616
|
+
# truncated=False since we already wiped the table above —
|
|
617
|
+
# avoids a redundant per-file DELETE that would be a no-op).
|
|
618
|
+
existing = {}
|
|
619
|
+
|
|
467
620
|
for jp in paths:
|
|
468
621
|
path_str = str(jp)
|
|
469
622
|
# Backfill session_id/project_path for A2 `session` subcommand.
|
|
@@ -477,6 +630,7 @@ def sync_cache(
|
|
|
477
630
|
st = jp.stat()
|
|
478
631
|
except OSError as exc:
|
|
479
632
|
eprint(f"[cache] stat failed for {jp}: {exc}")
|
|
633
|
+
walk_clean = False # skipped a file without ingesting (D5a)
|
|
480
634
|
continue
|
|
481
635
|
|
|
482
636
|
size = st.st_size
|
|
@@ -535,6 +689,7 @@ def sync_cache(
|
|
|
535
689
|
final_offset = fh.tell()
|
|
536
690
|
except OSError as exc:
|
|
537
691
|
eprint(f"[cache] could not read {jp}: {exc}")
|
|
692
|
+
walk_clean = False # skipped a file without ingesting (D5a)
|
|
538
693
|
continue
|
|
539
694
|
|
|
540
695
|
# Python's sqlite3 module starts an implicit transaction on the
|
|
@@ -552,16 +707,65 @@ def sync_cache(
|
|
|
552
707
|
stats.files_reset_truncated += 1
|
|
553
708
|
if rows:
|
|
554
709
|
before = conn.total_changes
|
|
710
|
+
# ccusage-parity ON CONFLICT DO UPDATE: higher-token total
|
|
711
|
+
# wins on conflict; speed-set breaks ties. The partial
|
|
712
|
+
# UNIQUE index `idx_entries_dedup` restricts the conflict
|
|
713
|
+
# target to (msg_id IS NOT NULL AND req_id IS NOT NULL),
|
|
714
|
+
# so the WHERE clause on the conflict target MUST repeat
|
|
715
|
+
# that predicate verbatim — bare `ON CONFLICT(msg_id,
|
|
716
|
+
# req_id)` raises OperationalError. NULL-keyed rows fall
|
|
717
|
+
# through to a plain INSERT, unchanged.
|
|
718
|
+
#
|
|
719
|
+
# `source_path` is INTENTIONALLY OMITTED from the DO
|
|
720
|
+
# UPDATE SET clause: it stays pinned to whichever JSONL
|
|
721
|
+
# FIRST INSERTed the (msg_id, req_id) row. The
|
|
722
|
+
# downstream `LEFT JOIN session_files ON sf.path =
|
|
723
|
+
# se.source_path` uses source_path to attribute tokens
|
|
724
|
+
# to a `project_path`. If a later UPSERT from a
|
|
725
|
+
# different file flipped source_path, the row's
|
|
726
|
+
# project attribution would move with the winner —
|
|
727
|
+
# `cctally project` would mis-aggregate. Sticky
|
|
728
|
+
# source_path matches pre-dedup INSERT OR IGNORE
|
|
729
|
+
# behavior and the operator's mental model.
|
|
730
|
+
# (`line_offset` is similarly sticky for the same
|
|
731
|
+
# reason — the offset only makes sense within the
|
|
732
|
+
# file that originally wrote the row.)
|
|
555
733
|
conn.executemany(
|
|
556
|
-
"""INSERT
|
|
734
|
+
"""INSERT INTO session_entries
|
|
557
735
|
(source_path, line_offset, timestamp_utc, model,
|
|
558
736
|
msg_id, req_id, input_tokens, output_tokens,
|
|
559
737
|
cache_create_tokens, cache_read_tokens,
|
|
560
738
|
usage_extra_json, cost_usd_raw)
|
|
561
|
-
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
|
|
739
|
+
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
|
|
740
|
+
ON CONFLICT(msg_id, req_id)
|
|
741
|
+
WHERE msg_id IS NOT NULL AND req_id IS NOT NULL
|
|
742
|
+
DO UPDATE SET
|
|
743
|
+
timestamp_utc = excluded.timestamp_utc,
|
|
744
|
+
model = excluded.model,
|
|
745
|
+
input_tokens = excluded.input_tokens,
|
|
746
|
+
output_tokens = excluded.output_tokens,
|
|
747
|
+
cache_create_tokens = excluded.cache_create_tokens,
|
|
748
|
+
cache_read_tokens = excluded.cache_read_tokens,
|
|
749
|
+
usage_extra_json = excluded.usage_extra_json,
|
|
750
|
+
cost_usd_raw = excluded.cost_usd_raw
|
|
751
|
+
WHERE
|
|
752
|
+
(excluded.input_tokens + excluded.output_tokens
|
|
753
|
+
+ excluded.cache_create_tokens + excluded.cache_read_tokens)
|
|
754
|
+
>
|
|
755
|
+
(session_entries.input_tokens + session_entries.output_tokens
|
|
756
|
+
+ session_entries.cache_create_tokens + session_entries.cache_read_tokens)
|
|
757
|
+
OR (
|
|
758
|
+
(excluded.input_tokens + excluded.output_tokens
|
|
759
|
+
+ excluded.cache_create_tokens + excluded.cache_read_tokens)
|
|
760
|
+
=
|
|
761
|
+
(session_entries.input_tokens + session_entries.output_tokens
|
|
762
|
+
+ session_entries.cache_create_tokens + session_entries.cache_read_tokens)
|
|
763
|
+
AND json_extract(excluded.usage_extra_json, '$.speed') IS NOT NULL
|
|
764
|
+
AND json_extract(session_entries.usage_extra_json, '$.speed') IS NULL
|
|
765
|
+
)""",
|
|
562
766
|
rows,
|
|
563
767
|
)
|
|
564
|
-
stats.
|
|
768
|
+
stats.rows_changed += conn.total_changes - before
|
|
565
769
|
# UPSERT preserves session_id / project_path columns populated
|
|
566
770
|
# by _ensure_session_files_row at the top of this loop. A plain
|
|
567
771
|
# INSERT OR REPLACE would wipe them on every changed-file sync.
|
|
@@ -584,6 +788,7 @@ def sync_cache(
|
|
|
584
788
|
except sqlite3.DatabaseError as exc:
|
|
585
789
|
eprint(f"[cache] db error on {jp}: {exc}")
|
|
586
790
|
conn.rollback()
|
|
791
|
+
walk_clean = False # rolled back this file without ingesting (D5a)
|
|
587
792
|
continue
|
|
588
793
|
|
|
589
794
|
if progress is not None:
|
|
@@ -591,6 +796,22 @@ def sync_cache(
|
|
|
591
796
|
|
|
592
797
|
if progress is not None:
|
|
593
798
|
progress(stats)
|
|
799
|
+
|
|
800
|
+
# Walk-complete sentinel write (cctally-dev#93, D5a). Still inside the
|
|
801
|
+
# held fcntl lock, before the finally-unlock. Only when the entire walk
|
|
802
|
+
# was clean AND cache 001 was already applied at the start of this run
|
|
803
|
+
# (D5b): an unclean walk or a straddle run must not vouch for cache
|
|
804
|
+
# completeness. A lock-contended sync returned early above and never
|
|
805
|
+
# reaches here. Presence (not the timestamp) is the gate signal; the
|
|
806
|
+
# value stores the completion instant for doctor/debugging.
|
|
807
|
+
if walk_clean and applied_at_start:
|
|
808
|
+
conn.execute(
|
|
809
|
+
"INSERT INTO cache_meta(key, value) "
|
|
810
|
+
"VALUES('claude_ingest_walk_complete', ?) "
|
|
811
|
+
"ON CONFLICT(key) DO UPDATE SET value = excluded.value",
|
|
812
|
+
(dt.datetime.now(dt.timezone.utc).isoformat(),),
|
|
813
|
+
)
|
|
814
|
+
conn.commit()
|
|
594
815
|
return stats
|
|
595
816
|
finally:
|
|
596
817
|
try:
|
|
@@ -664,15 +885,26 @@ def _collect_entries_direct(
|
|
|
664
885
|
*,
|
|
665
886
|
project: str | None = None,
|
|
666
887
|
) -> list[UsageEntry]:
|
|
667
|
-
"""Legacy direct-parse fallback used when the cache DB can't be opened.
|
|
888
|
+
"""Legacy direct-parse fallback used when the cache DB can't be opened.
|
|
889
|
+
|
|
890
|
+
Uses the ccusage-parity dict-keyed accumulator: dedup-keyed entries
|
|
891
|
+
live in `dedupe_map` and are tiebroken via `_should_replace` (higher
|
|
892
|
+
token total wins, speed-set breaks ties). Entries with NULL msg_id or
|
|
893
|
+
req_id bypass the map and land verbatim — partial UNIQUE index on the
|
|
894
|
+
cache mirrors this behavior. Flattened + sorted once at the end.
|
|
895
|
+
"""
|
|
668
896
|
files = _discover_session_files(range_start, project=project)
|
|
669
|
-
|
|
670
|
-
|
|
897
|
+
dedupe_map: dict[str, UsageEntry] = {}
|
|
898
|
+
no_key: list[UsageEntry] = []
|
|
671
899
|
for fp in files:
|
|
672
|
-
|
|
673
|
-
_parse_usage_entries(
|
|
900
|
+
no_key.extend(
|
|
901
|
+
_parse_usage_entries(
|
|
902
|
+
fp, range_start, range_end, dedupe_map=dedupe_map,
|
|
903
|
+
)
|
|
674
904
|
)
|
|
675
|
-
|
|
905
|
+
all_entries = list(dedupe_map.values()) + no_key
|
|
906
|
+
all_entries.sort(key=lambda e: e.timestamp)
|
|
907
|
+
return all_entries
|
|
676
908
|
|
|
677
909
|
|
|
678
910
|
# === Region 4: _JoinedClaudeEntry + get_claude_session_entries (was bin/cctally:2478-2668) ===
|
|
@@ -808,10 +1040,23 @@ def _direct_parse_claude_session_entries(
|
|
|
808
1040
|
scan the file for the first `sessionId` / `cwd` value, else fall
|
|
809
1041
|
back to the filename UUID and the decoded-escaped parent directory
|
|
810
1042
|
— same logic as `_ensure_session_files_row`.
|
|
1043
|
+
|
|
1044
|
+
Uses the ccusage-parity dict-keyed accumulator. Each per-file parse
|
|
1045
|
+
contributes into a global `(entry, source_path)` map keyed by
|
|
1046
|
+
`msg_id:req_id`; ties broken by `_should_replace`. NULL-keyed entries
|
|
1047
|
+
bypass dedup. After all files are walked, results are stamped with
|
|
1048
|
+
their owning file's session_id/cwd metadata and emitted in
|
|
1049
|
+
timestamp order.
|
|
811
1050
|
"""
|
|
812
|
-
results: list[_JoinedClaudeEntry] = []
|
|
813
1051
|
files = _discover_session_files(range_start, project=project)
|
|
814
|
-
|
|
1052
|
+
|
|
1053
|
+
# File metadata: source_path -> (session_id, project_path/cwd).
|
|
1054
|
+
meta_by_path: dict[str, tuple[str, str]] = {}
|
|
1055
|
+
|
|
1056
|
+
# Global accumulator: (msg_id:req_id) -> (UsageEntry, source_path).
|
|
1057
|
+
dedupe_map: dict[str, tuple[UsageEntry, str]] = {}
|
|
1058
|
+
# Null-key entries (rare; same as the cache's partial-index fallthrough).
|
|
1059
|
+
no_key_with_meta: list[tuple[UsageEntry, str]] = []
|
|
815
1060
|
|
|
816
1061
|
for fp in files:
|
|
817
1062
|
source_path = str(fp)
|
|
@@ -846,27 +1091,67 @@ def _direct_parse_claude_session_entries(
|
|
|
846
1091
|
session_id = os.path.splitext(os.path.basename(source_path))[0]
|
|
847
1092
|
if cwd is None:
|
|
848
1093
|
cwd = _decode_escaped_cwd(os.path.basename(os.path.dirname(source_path)))
|
|
1094
|
+
meta_by_path[source_path] = (session_id, cwd)
|
|
1095
|
+
|
|
1096
|
+
# Parse this file with a fresh per-file dedupe_map so we can attach
|
|
1097
|
+
# the source_path provenance to whatever wins this file's local
|
|
1098
|
+
# contests. Then merge into the global map using the same
|
|
1099
|
+
# `_should_replace` rule. (A shared dedupe_map across files would
|
|
1100
|
+
# lose the source_path of the winning entry — _parse_usage_entries
|
|
1101
|
+
# has no awareness of per-file metadata.)
|
|
1102
|
+
file_dedupe_map: dict[str, UsageEntry] = {}
|
|
1103
|
+
file_no_key = _parse_usage_entries(
|
|
1104
|
+
fp, range_start, range_end, dedupe_map=file_dedupe_map,
|
|
1105
|
+
)
|
|
849
1106
|
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
1107
|
+
# Merge file-local no-key entries directly (no dedup contest).
|
|
1108
|
+
for entry in file_no_key:
|
|
1109
|
+
no_key_with_meta.append((entry, source_path))
|
|
1110
|
+
|
|
1111
|
+
# Merge file-local dedup-keyed entries into the global map.
|
|
1112
|
+
# Same tiebreaker as the cache's ON CONFLICT DO UPDATE clause:
|
|
1113
|
+
# higher-token total wins the entry DATA. But `source_path` is
|
|
1114
|
+
# STICKY to whichever file FIRST contributed the key — it is NOT
|
|
1115
|
+
# flipped to the winner. This mirrors the cache ingest path, where
|
|
1116
|
+
# `source_path` is intentionally OMITTED from the ON CONFLICT DO
|
|
1117
|
+
# UPDATE SET clause (see the session_entries UPSERT in sync_cache) so the
|
|
1118
|
+
# downstream `LEFT JOIN session_files ON sf.path = se.source_path`
|
|
1119
|
+
# attributes tokens to the project of the file that first wrote the
|
|
1120
|
+
# row. Replacing it here would move project attribution to the
|
|
1121
|
+
# winner's file — `cctally project` (and any session_files join)
|
|
1122
|
+
# would then disagree with the normal cached behavior exactly when
|
|
1123
|
+
# this fallback path is exercised.
|
|
1124
|
+
for key, entry in file_dedupe_map.items():
|
|
1125
|
+
existing = dedupe_map.get(key)
|
|
1126
|
+
if existing is None:
|
|
1127
|
+
dedupe_map[key] = (entry, source_path)
|
|
1128
|
+
elif _should_replace(entry, existing[0]):
|
|
1129
|
+
# Winner's DATA, first contributor's source_path (sticky).
|
|
1130
|
+
dedupe_map[key] = (entry, existing[1])
|
|
1131
|
+
|
|
1132
|
+
# Flatten + emit.
|
|
1133
|
+
results: list[_JoinedClaudeEntry] = []
|
|
1134
|
+
flat: list[tuple[UsageEntry, str]] = list(dedupe_map.values()) + no_key_with_meta
|
|
1135
|
+
flat.sort(key=lambda pair: pair[0].timestamp)
|
|
1136
|
+
for entry, source_path in flat:
|
|
1137
|
+
usage = entry.usage
|
|
1138
|
+
sid, cwd = meta_by_path[source_path]
|
|
1139
|
+
results.append(_JoinedClaudeEntry(
|
|
1140
|
+
timestamp=entry.timestamp,
|
|
1141
|
+
model=entry.model,
|
|
1142
|
+
input_tokens=int(usage.get("input_tokens", 0) or 0),
|
|
1143
|
+
output_tokens=int(usage.get("output_tokens", 0) or 0),
|
|
1144
|
+
cache_creation_tokens=int(
|
|
1145
|
+
usage.get("cache_creation_input_tokens", 0) or 0
|
|
1146
|
+
),
|
|
1147
|
+
cache_read_tokens=int(
|
|
1148
|
+
usage.get("cache_read_input_tokens", 0) or 0
|
|
1149
|
+
),
|
|
1150
|
+
source_path=source_path,
|
|
1151
|
+
session_id=sid,
|
|
1152
|
+
project_path=cwd,
|
|
1153
|
+
cost_usd=entry.cost_usd,
|
|
1154
|
+
))
|
|
870
1155
|
|
|
871
1156
|
return results
|
|
872
1157
|
|
|
@@ -880,7 +1165,13 @@ class CodexIngestStats:
|
|
|
880
1165
|
files_processed: int = 0
|
|
881
1166
|
files_skipped_unchanged: int = 0
|
|
882
1167
|
files_reset_truncated: int = 0
|
|
883
|
-
|
|
1168
|
+
# Count of codex_session_entries rows written by this sync. Codex
|
|
1169
|
+
# ingest uses INSERT OR IGNORE — ignored conflicts do NOT bump
|
|
1170
|
+
# SQLite's `total_changes`, so this number is effectively "rows
|
|
1171
|
+
# newly inserted". Field is named ``rows_changed`` for parity with
|
|
1172
|
+
# ``IngestStats`` (Claude path) which carries an UPSERT and
|
|
1173
|
+
# therefore counts both new INSERTs and DO UPDATE replacements.
|
|
1174
|
+
rows_changed: int = 0
|
|
884
1175
|
lock_contended: bool = False
|
|
885
1176
|
|
|
886
1177
|
|
|
@@ -890,7 +1181,7 @@ def _progress_codex_stderr(stats: CodexIngestStats, *, force: bool = False) -> N
|
|
|
890
1181
|
return
|
|
891
1182
|
eprint(
|
|
892
1183
|
f"[codex-cache] {stats.files_processed}/{stats.files_total} files, "
|
|
893
|
-
f"{stats.
|
|
1184
|
+
f"{stats.rows_changed} rows changed"
|
|
894
1185
|
)
|
|
895
1186
|
|
|
896
1187
|
|
|
@@ -1095,7 +1386,7 @@ def sync_codex_cache(
|
|
|
1095
1386
|
VALUES (?,?,?,?,?,?,?,?,?,?)""",
|
|
1096
1387
|
rows,
|
|
1097
1388
|
)
|
|
1098
|
-
stats.
|
|
1389
|
+
stats.rows_changed += conn.total_changes - before
|
|
1099
1390
|
conn.execute(
|
|
1100
1391
|
"""INSERT OR REPLACE INTO codex_session_files
|
|
1101
1392
|
(path, size_bytes, mtime_ns, last_byte_offset,
|
|
@@ -1277,79 +1568,15 @@ def open_cache_db() -> sqlite3.Connection:
|
|
|
1277
1568
|
conn.execute("PRAGMA journal_mode=WAL")
|
|
1278
1569
|
conn.execute("PRAGMA busy_timeout=5000")
|
|
1279
1570
|
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
CREATE TABLE IF NOT EXISTS session_entries (
|
|
1290
|
-
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1291
|
-
source_path TEXT NOT NULL,
|
|
1292
|
-
line_offset INTEGER NOT NULL,
|
|
1293
|
-
timestamp_utc TEXT NOT NULL,
|
|
1294
|
-
model TEXT NOT NULL,
|
|
1295
|
-
msg_id TEXT,
|
|
1296
|
-
req_id TEXT,
|
|
1297
|
-
input_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1298
|
-
output_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1299
|
-
cache_create_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1300
|
-
cache_read_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1301
|
-
usage_extra_json TEXT,
|
|
1302
|
-
cost_usd_raw REAL
|
|
1303
|
-
);
|
|
1304
|
-
CREATE INDEX IF NOT EXISTS idx_entries_timestamp
|
|
1305
|
-
ON session_entries(timestamp_utc);
|
|
1306
|
-
CREATE INDEX IF NOT EXISTS idx_entries_source
|
|
1307
|
-
ON session_entries(source_path);
|
|
1308
|
-
CREATE UNIQUE INDEX IF NOT EXISTS idx_entries_dedup
|
|
1309
|
-
ON session_entries(msg_id, req_id)
|
|
1310
|
-
WHERE msg_id IS NOT NULL AND req_id IS NOT NULL;
|
|
1311
|
-
|
|
1312
|
-
CREATE TABLE IF NOT EXISTS codex_session_files (
|
|
1313
|
-
path TEXT PRIMARY KEY,
|
|
1314
|
-
size_bytes INTEGER NOT NULL,
|
|
1315
|
-
mtime_ns INTEGER NOT NULL,
|
|
1316
|
-
last_byte_offset INTEGER NOT NULL,
|
|
1317
|
-
last_ingested_at TEXT NOT NULL,
|
|
1318
|
-
last_session_id TEXT,
|
|
1319
|
-
last_model TEXT
|
|
1320
|
-
);
|
|
1321
|
-
CREATE TABLE IF NOT EXISTS codex_session_entries (
|
|
1322
|
-
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1323
|
-
source_path TEXT NOT NULL,
|
|
1324
|
-
line_offset INTEGER NOT NULL,
|
|
1325
|
-
timestamp_utc TEXT NOT NULL,
|
|
1326
|
-
session_id TEXT NOT NULL,
|
|
1327
|
-
model TEXT NOT NULL,
|
|
1328
|
-
input_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1329
|
-
cached_input_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1330
|
-
output_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1331
|
-
reasoning_output_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1332
|
-
total_tokens INTEGER NOT NULL DEFAULT 0,
|
|
1333
|
-
UNIQUE(source_path, line_offset)
|
|
1334
|
-
);
|
|
1335
|
-
CREATE INDEX IF NOT EXISTS idx_codex_entries_timestamp
|
|
1336
|
-
ON codex_session_entries(timestamp_utc);
|
|
1337
|
-
CREATE INDEX IF NOT EXISTS idx_codex_entries_session
|
|
1338
|
-
ON codex_session_entries(session_id);
|
|
1339
|
-
CREATE INDEX IF NOT EXISTS idx_codex_entries_source
|
|
1340
|
-
ON codex_session_entries(source_path);
|
|
1341
|
-
"""
|
|
1342
|
-
)
|
|
1343
|
-
|
|
1344
|
-
# Inline migration: add session_id / project_path columns to session_files
|
|
1345
|
-
# if they're missing. These were added for A2 `session` subcommand metadata;
|
|
1346
|
-
# populated lazily in sync_cache() / _ensure_session_files_row().
|
|
1347
|
-
add_column_if_missing(conn, "session_files", "session_id", "TEXT")
|
|
1348
|
-
add_column_if_missing(conn, "session_files", "project_path", "TEXT")
|
|
1349
|
-
conn.execute(
|
|
1350
|
-
"CREATE INDEX IF NOT EXISTS idx_session_files_session_id "
|
|
1351
|
-
"ON session_files(session_id)"
|
|
1352
|
-
)
|
|
1571
|
+
# Apply the shared cache.db schema (cctally-dev#93, D4): Claude tables +
|
|
1572
|
+
# indexes, the session_id / project_path column adds on session_files
|
|
1573
|
+
# (A2 `session` metadata, populated lazily in sync_cache() /
|
|
1574
|
+
# _ensure_session_files_row()), the Codex base tables + indexes, and the
|
|
1575
|
+
# cache_meta sentinel table. This is the single cache.db schema source —
|
|
1576
|
+
# the eager-apply path (_eagerly_apply_cache_migrations) uses the SAME
|
|
1577
|
+
# helper, so the two can no longer drift. The Codex last_total_tokens
|
|
1578
|
+
# ALTER + purge stays below (out of the shared helper — D4/P1#3).
|
|
1579
|
+
_cctally_db_sib._apply_cache_schema(conn)
|
|
1353
1580
|
|
|
1354
1581
|
# Migration: add last_total_tokens to codex_session_files. When the column
|
|
1355
1582
|
# is newly added (i.e. this is the first run after upgrade), purge the
|
|
@@ -1408,7 +1635,7 @@ def cmd_cache_sync(args: argparse.Namespace) -> int:
|
|
|
1408
1635
|
f"[cache-sync] claude done: {stats.files_processed} processed, "
|
|
1409
1636
|
f"{stats.files_skipped_unchanged} skipped, "
|
|
1410
1637
|
f"{stats.files_reset_truncated} reset, "
|
|
1411
|
-
f"{stats.
|
|
1638
|
+
f"{stats.rows_changed} rows changed"
|
|
1412
1639
|
)
|
|
1413
1640
|
|
|
1414
1641
|
if source in ("codex", "all"):
|
|
@@ -1426,7 +1653,7 @@ def cmd_cache_sync(args: argparse.Namespace) -> int:
|
|
|
1426
1653
|
f"[cache-sync] codex done: {stats.files_processed} processed, "
|
|
1427
1654
|
f"{stats.files_skipped_unchanged} skipped, "
|
|
1428
1655
|
f"{stats.files_reset_truncated} reset, "
|
|
1429
|
-
f"{stats.
|
|
1656
|
+
f"{stats.rows_changed} rows changed"
|
|
1430
1657
|
)
|
|
1431
1658
|
|
|
1432
1659
|
return 0
|
package/bin/_cctally_core.py
CHANGED
|
@@ -67,6 +67,7 @@ def _init_paths_from_env() -> None:
|
|
|
67
67
|
global UPDATE_STATE_PATH, UPDATE_SUPPRESS_PATH
|
|
68
68
|
global UPDATE_LOCK_PATH, UPDATE_LOG_PATH, UPDATE_LOG_ROTATED_PATH
|
|
69
69
|
global UPDATE_CHECK_LAST_FETCH_PATH, CLAUDE_SETTINGS_PATH
|
|
70
|
+
global CLAUDE_PROJECTS_DIR
|
|
70
71
|
|
|
71
72
|
home = pathlib.Path.home()
|
|
72
73
|
APP_DIR = home / ".local" / "share" / "cctally"
|
|
@@ -108,10 +109,60 @@ def _init_paths_from_env() -> None:
|
|
|
108
109
|
|
|
109
110
|
CLAUDE_SETTINGS_PATH = home / ".claude" / "settings.json"
|
|
110
111
|
|
|
112
|
+
# Claude session JSONL root. Production path is `~/.claude/projects`;
|
|
113
|
+
# exposed as a module-level constant so cross-DB migrations (e.g.
|
|
114
|
+
# stats migration 008) and the dispatcher's empty-disk fallback can
|
|
115
|
+
# honor a fixture override via tests' `monkeypatch.setattr(
|
|
116
|
+
# _cctally_core, "CLAUDE_PROJECTS_DIR", tmp_path / "...")`. The
|
|
117
|
+
# `_get_claude_data_dirs()` helper in bin/cctally remains the
|
|
118
|
+
# authoritative resolver for ad-hoc reads (multi-root + env-aware);
|
|
119
|
+
# this constant is the single-rooted production default that 99% of
|
|
120
|
+
# callers want. For multi-root, env-aware resolution (mirroring
|
|
121
|
+
# `_get_claude_data_dirs`), use `_resolve_claude_projects_dirs()`.
|
|
122
|
+
CLAUDE_PROJECTS_DIR = home / ".claude" / "projects"
|
|
123
|
+
|
|
111
124
|
|
|
112
125
|
_init_paths_from_env()
|
|
113
126
|
|
|
114
127
|
|
|
128
|
+
def _resolve_claude_projects_dirs() -> list[pathlib.Path]:
    """Return the Claude Code ``projects/`` dirs that exist on disk, env-aware.

    Mirrors `_get_claude_data_dirs()` in bin/cctally but returns the
    `projects/` subdir directly (cross-DB migrations only care about the
    JSONL root, not the parent Claude data dir).

    Resolution order:

    1. ``CLAUDE_CONFIG_DIR`` — a comma-separated list of roots. Blank
       items are ignored. If at least one listed root contains a
       ``projects`` directory, only those are returned.
    2. Otherwise (variable unset/empty, or none of its roots qualify) the
       defaults ``~/.config/claude`` then ``~/.claude``.

    Each ``projects`` directory is returned at most once, in first-seen
    order and with its first-seen spelling: a root listed twice, or two
    roots that resolve to the same directory (e.g. via a symlink), would
    otherwise make a caller that counts JSONL files per returned root
    count the same files twice.

    Used by stats migration 008's gate helper to avoid falsely
    short-circuiting the empty-disk fallback when the user has
    ``CLAUDE_CONFIG_DIR=/other/path`` set AND no ``~/.claude/projects``
    dir on disk. Tests can also feed an explicit list to the gate helper
    directly, skipping this resolver.

    Returns:
        Existing ``projects`` directories; an empty list when none exist.
    """

    def _existing_projects(roots: list[pathlib.Path]) -> list[pathlib.Path]:
        # Keep roots whose `projects` child is a directory, de-duplicated
        # by resolved path while preserving the caller-visible spelling.
        found: list[pathlib.Path] = []
        seen: set[pathlib.Path] = set()
        for root in roots:
            projects = root / "projects"
            # `projects` being a directory implies `root` is one too, so a
            # separate check on the parent is unnecessary.
            if not projects.is_dir():
                continue
            try:
                identity = projects.resolve()
            except (OSError, RuntimeError):
                # Unresolvable (e.g. symlink loop): fall back to the
                # literal path so the directory is still reported.
                identity = projects
            if identity in seen:
                continue
            seen.add(identity)
            found.append(projects)
        return found

    env_val = os.environ.get("CLAUDE_CONFIG_DIR", "").strip()
    if env_val:
        candidates = [
            pathlib.Path(item.strip())
            for item in env_val.split(",")
            if item.strip()
        ]
        from_env = _existing_projects(candidates)
        if from_env:
            return from_env

    home = pathlib.Path.home()
    return _existing_projects([home / ".config" / "claude", home / ".claude"])
|
|
164
|
+
|
|
165
|
+
|
|
115
166
|
# === Logging =========================================================
|
|
116
167
|
|
|
117
168
|
|