cctally 1.7.0 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/bin/_cctally_alerts.py +231 -0
- package/bin/_cctally_cache.py +1432 -0
- package/bin/_cctally_config.py +560 -0
- package/bin/_cctally_dashboard.py +5218 -0
- package/bin/_cctally_db.py +1729 -0
- package/bin/_cctally_record.py +2120 -0
- package/bin/_cctally_refresh.py +812 -0
- package/bin/_cctally_release.py +751 -0
- package/bin/_cctally_setup.py +1571 -0
- package/bin/_cctally_sync_week.py +110 -0
- package/bin/_cctally_tui.py +4381 -0
- package/bin/_cctally_update.py +2132 -0
- package/bin/_lib_aggregators.py +712 -0
- package/bin/_lib_alerts_payload.py +194 -0
- package/bin/_lib_blocks.py +414 -0
- package/bin/_lib_diff_kernel.py +1618 -0
- package/bin/_lib_display_tz.py +361 -0
- package/bin/_lib_doctor.py +58 -0
- package/bin/_lib_five_hour.py +82 -0
- package/bin/_lib_jsonl.py +403 -0
- package/bin/_lib_pricing.py +520 -0
- package/bin/_lib_render.py +2785 -0
- package/bin/_lib_semver.py +105 -0
- package/bin/_lib_subscription_weeks.py +492 -0
- package/bin/cctally +11034 -35415
- package/package.json +24 -1
|
@@ -0,0 +1,1432 @@
|
|
|
1
|
+
"""Session-entry cache subsystem (Claude + Codex) for cctally.
|
|
2
|
+
|
|
3
|
+
Eager I/O sibling: bin/cctally loads this at startup. Holds the
|
|
4
|
+
session-entry cache plumbing that every JSONL-reading subcommand
|
|
5
|
+
(``daily`` / ``monthly`` / ``weekly`` / ``blocks`` / ``session`` /
|
|
6
|
+
``range-cost`` / ``cache-report`` / ``sync-week`` / ``codex-*``) routes
|
|
7
|
+
through. Hot path: ``sync_cache`` and ``open_cache_db`` are invoked on
|
|
8
|
+
every ``cctally record-usage`` tick via the statusline/hook-tick
|
|
9
|
+
pipeline.
|
|
10
|
+
|
|
11
|
+
Holds:
|
|
12
|
+
- ``ProjectKey`` (frozen dataclass) + ``_resolve_project_key`` —
|
|
13
|
+
canonical project bucket identity for the ``project`` subcommand.
|
|
14
|
+
- ``_get_codex_sessions_dir`` / ``_discover_codex_session_files`` —
|
|
15
|
+
Codex JSONL discovery primitives.
|
|
16
|
+
- ``IngestStats`` / ``CodexIngestStats`` (dataclasses), ``_progress_stderr``
|
|
17
|
+
/ ``_progress_codex_stderr`` — ingest progress + per-call telemetry.
|
|
18
|
+
- ``_ensure_session_files_row`` — idempotent backfill of
|
|
19
|
+
``session_files.session_id`` / ``.project_path`` driven by ``sync_cache``.
|
|
20
|
+
- ``sync_cache`` / ``sync_codex_cache`` — read-through delta ingest of
|
|
21
|
+
``~/.claude/projects/**/*.jsonl`` and ``~/.codex/sessions/**/*.jsonl``,
|
|
22
|
+
each gated by an exclusive ``fcntl.flock`` on its own ``.lock`` sibling
|
|
23
|
+
of ``cache.db``.
|
|
24
|
+
- ``open_cache_db`` — schema + per-DB migration dispatcher
|
|
25
|
+
(``_run_pending_migrations(_, registry=_CACHE_MIGRATIONS, …)``) +
|
|
26
|
+
WAL/busy-timeout pragmas; safe on corrupt-file recreation because the
|
|
27
|
+
cache is fully re-derivable from JSONL.
|
|
28
|
+
- ``iter_entries`` / ``iter_codex_entries`` — in-range SELECT helpers
|
|
29
|
+
returning ``UsageEntry`` / ``CodexEntry`` (defined in
|
|
30
|
+
``bin/_lib_jsonl.py``).
|
|
31
|
+
- ``_collect_entries_direct`` / ``_collect_codex_entries_direct`` /
|
|
32
|
+
``_direct_parse_claude_session_entries`` — direct-JSONL parse
|
|
33
|
+
fallbacks when cache.db can't be opened or an ingest lock is held.
|
|
34
|
+
- ``_JoinedClaudeEntry`` (dataclass) + ``get_claude_session_entries`` —
|
|
35
|
+
cache-first ``LEFT JOIN`` of ``session_entries`` ↔ ``session_files``
|
|
36
|
+
for the ``session`` / ``project`` / share-projects renderers.
|
|
37
|
+
- ``get_entries`` / ``get_codex_entries`` — top-level cache-first
|
|
38
|
+
fetches that JSONL-reading commands MUST use rather than touching
|
|
39
|
+
``open_cache_db`` directly. Transparent fallback on cache-open
|
|
40
|
+
failure or sync lock contention.
|
|
41
|
+
- ``cmd_cache_sync`` — entry point for ``cctally cache-sync
|
|
42
|
+
[--source {claude,codex,all}] [--rebuild]``.
|
|
43
|
+
|
|
44
|
+
What stays in bin/cctally:
|
|
45
|
+
- Path constants ``APP_DIR``, ``CACHE_DB_PATH``, ``CACHE_LOCK_PATH``,
|
|
46
|
+
``CACHE_LOCK_CODEX_PATH``, ``CODEX_SESSIONS_DIR`` — referenced from
|
|
47
|
+
the moved bodies via the ``c = _cctally()`` call-time accessor
|
|
48
|
+
pattern (spec §5.5, same as ``bin/_lib_subscription_weeks.py`` and
|
|
49
|
+
``bin/_lib_aggregators.py``). The accessor resolves
|
|
50
|
+
``sys.modules['cctally'].X`` on every call, so
|
|
51
|
+
``monkeypatch.setitem(ns, "CACHE_DB_PATH", tmp)`` and conftest
|
|
52
|
+
``redirect_paths`` HOME redirects propagate transparently with NO
|
|
53
|
+
test-side changes (tests already patch ``ns["CACHE_DB_PATH"]`` etc.
|
|
54
|
+
by setitem on the dict-as-module bridge). We chose ``c.X`` over the
|
|
55
|
+
``_cctally_db.py``-style seed block here because cache tests are
|
|
56
|
+
widely scattered (record-usage tick, dashboard panels, share render
|
|
57
|
+
kernel, block tests, every JSONL-reading subcommand fixture) and
|
|
58
|
+
Phase C-style inline patching would touch dozens of sites.
|
|
59
|
+
- ``_sum_cost_for_range`` — sits at the cache↔report boundary; 6+
|
|
60
|
+
callers outside cache (forecast, weekly, report, project, doctor),
|
|
61
|
+
so the directive keeps it on the bin/cctally side.
|
|
62
|
+
- ``CacheModelBreakdown`` / ``CacheRow`` and the broader cache-report
|
|
63
|
+
surface — that's Phase F territory, not the ingest/read primitives.
|
|
64
|
+
- ``_decode_escaped_cwd``, ``_discover_session_files``,
|
|
65
|
+
``_get_claude_data_dirs``, ``eprint`` — small shared helpers (JSONL
|
|
66
|
+
discovery + stderr formatter) consumed by many non-cache paths.
|
|
67
|
+
Routed through module-level callable shims (see below) so moved
|
|
68
|
+
code keeps its bare-name call shape and monkeypatches on bin/cctally
|
|
69
|
+
propagate via call-time ``sys.modules['cctally']`` lookup.
|
|
70
|
+
|
|
71
|
+
Direct sibling loads at module-load time (acyclic — both are pure leaves
|
|
72
|
+
in the sibling graph):
|
|
73
|
+
- ``_lib_jsonl`` for ``UsageEntry``, ``CodexEntry``, ``_CodexIterState``,
|
|
74
|
+
``_iter_jsonl_entries_with_offsets``, ``_iter_codex_jsonl_entries_with_offsets``,
|
|
75
|
+
``_parse_usage_entries``.
|
|
76
|
+
- ``_cctally_db`` for ``add_column_if_missing``, ``_run_pending_migrations``,
|
|
77
|
+
``_CACHE_MIGRATIONS``. Loading ``_cctally_db`` here is a no-op when
|
|
78
|
+
bin/cctally already imported it at startup (the eager-load block
|
|
79
|
+
there fires first), but the direct load makes this sibling
|
|
80
|
+
self-contained for tests that load ``_cctally_cache`` in isolation.
|
|
81
|
+
|
|
82
|
+
§5.6 audit: zero monkeypatch sites on any moved symbol. The Section
|
|
83
|
+
5.6 audit grep on the candidate-symbol inventory (``sync_cache``,
|
|
84
|
+
``sync_codex_cache``, ``open_cache_db``, ``iter_entries``,
|
|
85
|
+
``get_entries``, ``get_claude_session_entries``, ``get_codex_entries``,
|
|
86
|
+
``_resolve_project_key``, ``ProjectKey``, ``IngestStats``,
|
|
87
|
+
``CodexIngestStats``, ``_JoinedClaudeEntry``, ``_ensure_session_files_row``,
|
|
88
|
+
``_discover_codex_session_files``, ``_get_codex_sessions_dir``,
|
|
89
|
+
``cmd_cache_sync``, ``_progress_stderr``, ``_progress_codex_stderr``,
|
|
90
|
+
``_collect_entries_direct``, ``_collect_codex_entries_direct``,
|
|
91
|
+
``_direct_parse_claude_session_entries``, ``iter_codex_entries``)
|
|
92
|
+
returns no ``monkeypatch.setattr/setitem`` sites — the only test-side
|
|
93
|
+
hits are ``ns["X"](...)`` direct-callers (e.g.
|
|
94
|
+
``tests/test_share_top_projects.py`` patches ``get_claude_session_entries``
|
|
95
|
+
via ``monkeypatch.setitem(ns, ...)`` on bin/cctally's namespace, which
|
|
96
|
+
propagates through the eager re-export of the same name in bin/cctally).
|
|
97
|
+
Pure-mechanical extraction.
|
|
98
|
+
|
|
99
|
+
Spec: docs/superpowers/specs/2026-05-13-bin-cctally-split-design.md
|
|
100
|
+
"""
|
|
101
|
+
from __future__ import annotations
|
|
102
|
+
|
|
103
|
+
import argparse
|
|
104
|
+
import datetime as dt
|
|
105
|
+
import fcntl
|
|
106
|
+
import importlib.util as _ilu
|
|
107
|
+
import json
|
|
108
|
+
import os
|
|
109
|
+
import pathlib
|
|
110
|
+
import sqlite3
|
|
111
|
+
import sys
|
|
112
|
+
from dataclasses import dataclass, field
|
|
113
|
+
from typing import Any, Callable
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _cctally():
    """Return the live ``cctally`` module object (spec §5.5).

    The lookup goes through ``sys.modules`` on every call instead of being
    bound once at import time, so whatever is registered under the name
    ``"cctally"`` at call time is what the caller gets.

    Raises:
        KeyError: if nothing is registered under ``"cctally"``.
    """
    module = sys.modules["cctally"]
    return module
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Module-level back-ref shims for the four bare-name helpers consumed
|
|
122
|
+
# throughout the moved bodies. Each shim resolves
|
|
123
|
+
# ``sys.modules['cctally'].X`` at CALL TIME (not bind time), so
|
|
124
|
+
# monkeypatches on cctally's namespace propagate into the moved code
|
|
125
|
+
# unchanged. Mirrors the precedent established in ``bin/_cctally_db.py``
|
|
126
|
+
# (``now_utc_iso`` / ``parse_iso_datetime`` / ``_compute_block_totals``
|
|
127
|
+
# / ``eprint`` shims).
|
|
128
|
+
def eprint(*args, **kwargs):
    """Forward to ``cctally.eprint``, resolved at call time.

    The target is fetched from ``sys.modules['cctally']`` on every call
    (not bound once), so a replacement installed on the ``cctally``
    namespace is picked up here as well. All arguments and the return
    value pass through untouched.
    """
    target = sys.modules["cctally"].eprint
    return target(*args, **kwargs)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _decode_escaped_cwd(*args, **kwargs):
    """Forward to ``cctally._decode_escaped_cwd``, resolved at call time.

    Looking the helper up in ``sys.modules['cctally']`` per call keeps
    this bare name in step with whatever is currently installed on the
    ``cctally`` namespace. Arguments and result pass through unchanged.
    """
    target = sys.modules["cctally"]._decode_escaped_cwd
    return target(*args, **kwargs)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _discover_session_files(*args, **kwargs):
    """Forward to ``cctally._discover_session_files``, resolved at call time.

    The helper is fetched from ``sys.modules['cctally']`` on each call so
    that replacements on the ``cctally`` namespace take effect here too.
    Arguments and result pass through unchanged.
    """
    target = sys.modules["cctally"]._discover_session_files
    return target(*args, **kwargs)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _get_claude_data_dirs(*args, **kwargs):
    """Forward to ``cctally._get_claude_data_dirs``, resolved at call time.

    The helper is fetched from ``sys.modules['cctally']`` on each call so
    that replacements on the ``cctally`` namespace take effect here too.
    Arguments and result pass through unchanged.
    """
    target = sys.modules["cctally"]._get_claude_data_dirs
    return target(*args, **kwargs)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Direct sibling loads at module-load time. Both targets are
|
|
145
|
+
# self-contained: ``_lib_jsonl`` is a pure leaf (stdlib-only), and
|
|
146
|
+
# ``_cctally_db`` registers its three production migration handlers at
|
|
147
|
+
# import time — those decorators are idempotent across re-imports
|
|
148
|
+
# because the framework's ``sys.modules`` cache means each handler
|
|
149
|
+
# registers exactly once per sibling lifetime.
|
|
150
|
+
def _load_lib(name: str):
    """Load the sibling module ``<name>.py`` that lives next to this file.

    An entry already present in ``sys.modules`` is returned as-is, so each
    sibling body executes at most once per process. Otherwise the module
    is built from ``<this directory>/<name>.py``, registered in
    ``sys.modules`` *before* its body runs (so code that looks the name up
    while it is being executed finds it), executed, and returned.

    Args:
        name: Bare module name, without the ``.py`` suffix.

    Returns:
        The loaded (or previously cached) module object.

    Raises:
        ImportError: if no loadable spec can be built for the file.
        Exception: whatever the sibling's body raises while executing
            (including ``FileNotFoundError`` for a missing file).
    """
    cached = sys.modules.get(name)
    if cached is not None:
        return cached

    path = pathlib.Path(__file__).resolve().parent / f"{name}.py"
    spec = _ilu.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ImportError(
            f"cannot load sibling module {name!r} from {path}",
            name=name,
            path=str(path),
        )

    mod = _ilu.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module cached: the early-return
        # above would otherwise hand the broken object to the next caller
        # instead of retrying (or re-raising) the import.
        sys.modules.pop(name, None)
        raise
    return mod
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# JSONL parsing sibling: its entry types and iterators are re-bound here
# under their bare names so the code below can use them unqualified.
_lib_jsonl = _load_lib("_lib_jsonl")
UsageEntry = _lib_jsonl.UsageEntry
CodexEntry = _lib_jsonl.CodexEntry
_CodexIterState = _lib_jsonl._CodexIterState
_iter_jsonl_entries_with_offsets = _lib_jsonl._iter_jsonl_entries_with_offsets
_iter_codex_jsonl_entries_with_offsets = _lib_jsonl._iter_codex_jsonl_entries_with_offsets
_parse_usage_entries = _lib_jsonl._parse_usage_entries

# DB sibling: column/migration helpers and the cache migration registry,
# likewise re-bound under bare names.
_cctally_db_sib = _load_lib("_cctally_db")
add_column_if_missing = _cctally_db_sib.add_column_if_missing
_run_pending_migrations = _cctally_db_sib._run_pending_migrations
_CACHE_MIGRATIONS = _cctally_db_sib._CACHE_MIGRATIONS
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# === BEGIN MOVED REGIONS ===
|
|
177
|
+
# Path constants (APP_DIR, CACHE_DB_PATH, CACHE_LOCK_PATH,
|
|
178
|
+
# CACHE_LOCK_CODEX_PATH, CODEX_SESSIONS_DIR) are accessed via the
|
|
179
|
+
# `c = _cctally()` call-time accessor inside each function that
|
|
180
|
+
# needs them — so ``monkeypatch.setitem(ns, "CACHE_DB_PATH", tmp)``
|
|
181
|
+
# in tests resolves on every read (no stale module-level binding).
|
|
182
|
+
|
|
183
|
+
# === Region 1: ProjectKey + _resolve_project_key (was bin/cctally:1994-2069) ===
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@dataclass(frozen=True)
class ProjectKey:
    """Immutable identity of one project bucket (``project`` subcommand).

    Only ``bucket_path`` takes part in ``==`` and ``hash()``; every other
    field is declared with ``compare=False``. Aggregation therefore stays
    keyed on the canonical bucket even if the user-facing ``display_key``
    is adjusted later (for instance to disambiguate colliding basenames).
    """
    # Canonical bucket identifier; the sole field used for eq/hash.
    bucket_path: str
    # Label shown to the user; excluded from comparison.
    display_key: str = field(compare=False)
    # Directory in which `.git` was found, or None when none was recorded.
    git_root: str | None = field(compare=False)
    # True for the literal `(unknown)` bucket built from a NULL path.
    is_unknown: bool = field(default=False, compare=False)
    # True when a git-root walk finished without finding `.git`.
    is_no_git: bool = field(default=False, compare=False)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _resolve_project_key(
    project_path: str | None,
    mode: str,  # "git-root" | "full-path"
    cache: dict[str, ProjectKey],
) -> ProjectKey:
    """Resolve a raw project path to its ``ProjectKey``.

    Args:
        project_path: Path as recorded for the session, or ``None``.
        mode: ``"full-path"`` buckets by the normalized path itself. Any
            other value (normally ``"git-root"``) walks up the parents
            looking for a ``.git`` entry (file or directory) and buckets
            by the directory that contains it.
        cache: Memo dict keyed by the normalized path. It is read and
            updated in place. The key does not include ``mode``, so a
            given dict is presumably only ever used with one mode —
            verify against callers.

    Returns:
        - ``None`` input: a fresh ``(unknown)`` key with
          ``is_unknown=True`` (never cached).
        - full-path mode: bucket is the normalized path, display is the
          raw input so the user sees what they typed.
        - git-root mode: bucket and ``git_root`` are the first ancestor
          (starting at the path itself) containing ``.git``. If the walk
          reaches the home directory, ``/``, or a path that is its own
          parent first, the normalized path is used with
          ``is_no_git=True``.
    """
    if project_path is None:
        return ProjectKey(
            bucket_path="(unknown)",
            display_key="(unknown)",
            git_root=None,
            is_unknown=True,
        )

    # Normalization and the memo lookup are identical for both modes.
    normalized = os.path.realpath(os.path.expanduser(project_path))
    cached = cache.get(normalized)
    if cached is not None:
        return cached

    if mode == "full-path":
        key = ProjectKey(
            bucket_path=normalized,
            display_key=project_path,  # raw, so user sees what they typed
            git_root=None,
        )
        cache[normalized] = key
        return key

    # `normalized` has had symlinks resolved, so the home sentinel must be
    # resolved the same way. Comparing against the raw `~` expansion never
    # matches when $HOME is (or sits under) a symlink, which would let the
    # walk climb past the home directory.
    home = os.path.realpath(os.path.expanduser("~"))
    cur = normalized
    while True:
        parent = os.path.dirname(cur)
        # The stop directory itself is deliberately not probed for `.git`.
        if cur == home or cur == "/" or parent == cur:
            break
        if os.path.exists(os.path.join(cur, ".git")):
            key = ProjectKey(
                bucket_path=cur,
                display_key=os.path.basename(cur) or cur,
                git_root=cur,
            )
            cache[normalized] = key
            return key
        cur = parent

    key = ProjectKey(
        bucket_path=normalized,
        display_key=os.path.basename(project_path) or project_path,
        git_root=None,
        is_no_git=True,
    )
    cache[normalized] = key
    return key
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# === Region 2: Codex sessions-dir helpers (was bin/cctally:2072-2099) ===
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _get_codex_sessions_dir() -> pathlib.Path | None:
    """Return ``CODEX_SESSIONS_DIR`` when it is an existing directory.

    The constant is read through the call-time ``_cctally()`` accessor.
    Returns ``None`` when that path is not a directory.
    """
    cctally_mod = _cctally()
    if not cctally_mod.CODEX_SESSIONS_DIR.is_dir():
        return None
    return cctally_mod.CODEX_SESSIONS_DIR
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _discover_codex_session_files(
    range_start: dt.datetime,
) -> list[pathlib.Path]:
    """List Codex session ``*.jsonl`` files modified at or after ``range_start``.

    Recursively globs the directory returned by
    ``_get_codex_sessions_dir()``. When there is no such directory a note
    is written via ``eprint`` and an empty list is returned. Entries that
    are not regular files, or whose ``stat()`` fails, are skipped.
    """
    sessions_root = _get_codex_sessions_dir()
    if sessions_root is None:
        eprint("[codex] no ~/.codex/sessions directory found")
        return []

    cutoff = range_start.timestamp()

    def _recent_enough(candidate: pathlib.Path) -> bool:
        # Keep regular files whose mtime is not older than the cutoff.
        if not candidate.is_file():
            return False
        try:
            modified = candidate.stat().st_mtime
        except OSError:
            return False
        return not modified < cutoff

    return [jp for jp in sessions_root.glob("**/*.jsonl") if _recent_enough(jp)]
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
# === Region 3: IngestStats + Claude ingest path (was bin/cctally:2102-2400) ===
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
@dataclass
class IngestStats:
    """Mutable counters describing one ``sync_cache`` run."""
    # Number of JSONL files discovered for this run.
    files_total: int = 0
    # Files whose new rows were written and committed.
    files_processed: int = 0
    # Files skipped because their size matched the cached size.
    files_skipped_unchanged: int = 0
    # Files that shrank and were re-ingested from offset 0.
    files_reset_truncated: int = 0
    # Rows added to session_entries.
    rows_inserted: int = 0
    # True when another process held the ingest lock and nothing was done.
    lock_contended: bool = False
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _progress_stderr(stats: IngestStats, *, force: bool = False) -> None:
    """Default progress callback: write a one-line summary via ``eprint``.

    A line is emitted when ``force`` is true or when
    ``stats.files_processed`` is a multiple of 200; otherwise the call is
    silent.
    """
    if force or stats.files_processed % 200 == 0:
        eprint(
            f"[cache-sync] {stats.files_processed}/{stats.files_total} files, "
            f"{stats.rows_inserted} new rows"
        )
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _ensure_session_files_row(conn: sqlite3.Connection, source_path: str) -> None:
    """Populate ``session_files.session_id`` and ``.project_path`` for a JSONL.

    Idempotent and safe to call on every sync:

    - A cheap SELECT up front returns immediately when both columns are
      already set, so a complete row never causes the JSONL to be re-read.
    - Otherwise the file is scanned from offset 0 for the first non-empty
      string ``sessionId`` and the first non-empty string ``cwd`` (they
      may come from different lines). Lines that are not valid JSON, or
      that are valid JSON but not an object, are skipped.
    - A missing ``sessionId`` falls back to the filename stem (with a
      warning on stderr); a missing ``cwd`` falls back to decoding the
      parent directory name via ``_decode_escaped_cwd``.
    - The row is written with an UPSERT that uses COALESCE on the two
      columns, so values that are already stored are never overwritten.

    The delta-resume columns (``size_bytes``, ``mtime_ns``,
    ``last_byte_offset``, ``last_ingested_at``) are only given placeholder
    values when the row is first inserted; on conflict they are left
    untouched — they belong to the ``sync_cache`` path.

    Args:
        conn: Open connection to the cache database.
        source_path: Path of the session JSONL file, used as the row key.

    An unreadable file (``OSError``) is silently skipped so the next sync
    can retry. The write is committed before returning.
    """
    # Quick check: skip if both columns already populated.
    existing = conn.execute(
        "SELECT session_id, project_path FROM session_files WHERE path = ?",
        (source_path,),
    ).fetchone()
    if existing is not None and existing[0] is not None and existing[1] is not None:
        return

    session_id: str | None = None
    cwd: str | None = None
    try:
        with open(source_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(obj, dict):
                    # Valid JSON that is not an object (list, string,
                    # number, ...) has no `.get`; skip it rather than let
                    # an AttributeError abort the whole sync.
                    continue
                if session_id is None:
                    sid = obj.get("sessionId")
                    if isinstance(sid, str) and sid:
                        session_id = sid
                if cwd is None:
                    cwd_val = obj.get("cwd")
                    if isinstance(cwd_val, str) and cwd_val:
                        cwd = cwd_val
                if session_id is not None and cwd is not None:
                    break
    except OSError:
        return  # unreadable; retry on next sync

    # Fallbacks.
    if session_id is None:
        stem = os.path.splitext(os.path.basename(source_path))[0]
        session_id = stem
        # Kept simple: an unconditional warning each time the fallback is
        # taken (no per-path de-duplication).
        print(
            f"Warning: no sessionId in {source_path}; "
            f"falling back to filename UUID {session_id}",
            file=sys.stderr,
        )
    if cwd is None:
        parent = os.path.basename(os.path.dirname(source_path))
        cwd = _decode_escaped_cwd(parent)

    now_iso = dt.datetime.now(dt.timezone.utc).isoformat()
    conn.execute(
        """
        INSERT INTO session_files (
            path, size_bytes, mtime_ns, last_byte_offset, last_ingested_at,
            session_id, project_path
        ) VALUES (?, 0, 0, 0, ?, ?, ?)
        ON CONFLICT(path) DO UPDATE SET
            session_id = COALESCE(session_files.session_id, excluded.session_id),
            project_path = COALESCE(session_files.project_path, excluded.project_path)
        """,
        (source_path, now_iso, session_id, cwd),
    )
    # Commit per-call so the write lock is released before the caller's
    # subsequent JSONL read+parse. Leaving the implicit transaction open
    # across the per-file loop would both hold a writer lock across reads
    # and risk losing updates if a file-loop iteration `continue`s without
    # hitting the caller's own commit.
    conn.commit()
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def sync_cache(
    conn: sqlite3.Connection,
    *,
    progress: Callable[[IngestStats], None] | None = None,
    rebuild: bool = False,
) -> IngestStats:
    """Read-through delta ingest of Claude session JSONL files into the cache.

    Acquires an exclusive, non-blocking ``fcntl.flock`` on
    ``CACHE_LOCK_PATH``. If another process holds it, returns immediately
    with ``lock_contended=True`` and the caller should proceed with
    whatever data is already cached.

    When ``rebuild=True``, the cached rows are cleared AFTER the lock is
    acquired, so a lost race does not wipe a cache another process is
    actively populating. If the lock is contended on a rebuild, the cache
    is left untouched and the caller sees ``lock_contended=True``.

    Per file, size is the only change signal: equal size is skipped, a
    larger size resumes from the stored offset, and a smaller size is
    treated as truncation (existing rows for that file are deleted and it
    is re-read from offset 0). Each file's writes are committed
    atomically; a ``sqlite3.DatabaseError`` rolls that file back and moves
    on. The counters in the returned stats reflect committed work only.

    Args:
        conn: Open connection to the cache database.
        progress: Optional callback invoked with the running stats after
            each successfully committed file and once more at the end.
        rebuild: Clear ``session_entries`` / ``session_files`` first.

    Returns:
        The ``IngestStats`` for this run.
    """
    stats = IngestStats()
    c = _cctally()
    c.APP_DIR.mkdir(parents=True, exist_ok=True)
    c.CACHE_LOCK_PATH.touch()

    lock_fh = open(c.CACHE_LOCK_PATH, "w")
    try:
        try:
            fcntl.flock(lock_fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            eprint("[cache] sync already in progress; using existing cache")
            stats.lock_contended = True
            return stats

        if rebuild:
            # Clear INSIDE the lock — a concurrent rebuild that lost the
            # race would otherwise have wiped this cache before bailing,
            # leaving the user with empty state. Done before the existing
            # SELECT so the subsequent delta-detection logic sees an
            # empty baseline.
            conn.execute("DELETE FROM session_entries")
            conn.execute("DELETE FROM session_files")
            conn.commit()
            eprint("[cache-sync] rebuild: cleared Claude cached entries")

        claude_dirs = _get_claude_data_dirs()
        paths: list[pathlib.Path] = []
        for claude_dir in claude_dirs:
            for jp in (claude_dir / "projects").glob("**/*.jsonl"):
                if jp.is_file():
                    paths.append(jp)
        stats.files_total = len(paths)

        # This SELECT does NOT open an implicit transaction (Python's
        # sqlite3 module only BEGINs on DML). Do NOT add any INSERT/
        # UPDATE/DELETE/REPLACE statement between here and the per-file
        # loop below — the read+parse inside that loop must run with
        # zero cache.db write lock held.
        existing = {
            row[0]: (row[1], row[2], row[3])
            for row in conn.execute(
                "SELECT path, size_bytes, mtime_ns, last_byte_offset FROM session_files"
            )
        }

        for jp in paths:
            path_str = str(jp)
            # Backfill session_id/project_path for A2 `session` subcommand.
            # Idempotent upsert that preserves delta-resume columns.
            # Placed at the top so unchanged files (early-continue below) are
            # still covered. The downstream INSERT for session_files preserves
            # the two new columns via an explicit column list so this backfill
            # is not clobbered by delta-resume writes.
            _ensure_session_files_row(conn, path_str)
            try:
                st = jp.stat()
            except OSError as exc:
                eprint(f"[cache] stat failed for {jp}: {exc}")
                continue

            size = st.st_size
            mtime_ns = st.st_mtime_ns
            prev = existing.get(path_str)
            start_offset = 0
            truncated = False
            if prev is not None:
                # mtime_ns is stored in session_files for diagnostics but
                # intentionally NOT consulted for delta detection — size
                # is the only signal (Claude Code's JSONL sessions are
                # strictly append-only, so a size change is sufficient
                # and mtime is prone to clock-skew false-positives).
                prev_size, _, prev_offset = prev
                if size == prev_size:
                    stats.files_skipped_unchanged += 1
                    continue
                if size > prev_size:
                    start_offset = prev_offset
                else:
                    truncated = True
                    start_offset = 0

            # Read + parse is a pure read; do it OUTSIDE the write transaction
            # so a slow JSONL doesn't hold a SQLite lock.
            rows: list[tuple[Any, ...]] = []
            final_offset = start_offset
            try:
                with open(jp, "r", encoding="utf-8", errors="replace") as fh:
                    fh.seek(start_offset)
                    for offset, entry, msg_id, req_id in _iter_jsonl_entries_with_offsets(fh):
                        rows.append(
                            _session_entry_row(path_str, offset, entry, msg_id, req_id)
                        )
                    final_offset = fh.tell()
            except OSError as exc:
                eprint(f"[cache] could not read {jp}: {exc}")
                continue

            # Python's sqlite3 module starts an implicit transaction on the
            # first DML statement and commits on conn.commit(). We do NOT
            # call "BEGIN IMMEDIATE" ourselves — that would error with
            # "cannot start a transaction within a transaction" if a prior
            # statement already opened one. DELETE + INSERTs + UPDATE happen
            # atomically in a single commit.
            inserted = 0
            try:
                if truncated:
                    conn.execute(
                        "DELETE FROM session_entries WHERE source_path = ?",
                        (path_str,),
                    )
                if rows:
                    before = conn.total_changes
                    conn.executemany(
                        """INSERT OR IGNORE INTO session_entries
                           (source_path, line_offset, timestamp_utc, model,
                            msg_id, req_id, input_tokens, output_tokens,
                            cache_create_tokens, cache_read_tokens,
                            usage_extra_json, cost_usd_raw)
                           VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
                        rows,
                    )
                    inserted = conn.total_changes - before
                # UPSERT preserves session_id / project_path columns populated
                # by _ensure_session_files_row at the top of this loop. A plain
                # INSERT OR REPLACE would wipe them on every changed-file sync.
                conn.execute(
                    """INSERT INTO session_files
                       (path, size_bytes, mtime_ns, last_byte_offset, last_ingested_at)
                       VALUES (?,?,?,?,?)
                       ON CONFLICT(path) DO UPDATE SET
                           size_bytes = excluded.size_bytes,
                           mtime_ns = excluded.mtime_ns,
                           last_byte_offset = excluded.last_byte_offset,
                           last_ingested_at = excluded.last_ingested_at""",
                    (
                        path_str, size, mtime_ns, final_offset,
                        dt.datetime.now(dt.timezone.utc).isoformat(),
                    ),
                )
                conn.commit()
            except sqlite3.DatabaseError as exc:
                eprint(f"[cache] db error on {jp}: {exc}")
                conn.rollback()
                continue

            # Update counters only once the commit has succeeded, so a
            # rolled-back file is not reported as reset / inserted.
            if truncated:
                stats.files_reset_truncated += 1
            stats.rows_inserted += inserted
            stats.files_processed += 1

            if progress is not None:
                progress(stats)

        if progress is not None:
            progress(stats)
        return stats
    finally:
        try:
            fcntl.flock(lock_fh, fcntl.LOCK_UN)
        except OSError:
            pass
        lock_fh.close()


# Usage keys that have dedicated integer columns in session_entries; every
# other key of a usage dict is kept verbatim in usage_extra_json.
_SYNC_CACHE_TOKEN_KEYS = (
    "input_tokens",
    "output_tokens",
    "cache_creation_input_tokens",
    "cache_read_input_tokens",
)


def _session_entry_row(
    path_str: str, offset: Any, entry: Any, msg_id: Any, req_id: Any
) -> tuple[Any, ...]:
    """Build the 12-column session_entries tuple for one parsed JSONL entry."""
    usage = entry.usage
    inp = int(usage.get("input_tokens", 0) or 0)
    out = int(usage.get("output_tokens", 0) or 0)
    cc = int(usage.get("cache_creation_input_tokens", 0) or 0)
    cr = int(usage.get("cache_read_input_tokens", 0) or 0)
    extras = {k: v for k, v in usage.items() if k not in _SYNC_CACHE_TOKEN_KEYS}
    return (
        path_str,
        offset,
        entry.timestamp.astimezone(dt.timezone.utc).isoformat(),
        entry.model,
        msg_id,
        req_id,
        inp, out, cc, cr,
        json.dumps(extras, sort_keys=True) if extras else None,
        entry.cost_usd,
    )
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def iter_entries(
    conn: sqlite3.Connection,
    range_start: dt.datetime,
    range_end: dt.datetime,
    *,
    project: str | None = None,
) -> list[UsageEntry]:
    """Fetch cached ``UsageEntry`` rows with timestamps in [range_start, range_end].

    Both bounds are converted to UTC ISO strings and compared inclusively
    against ``timestamp_utc``; results are ordered by timestamp ascending.
    ``project``, when given, restricts rows to files under a
    ``.../projects/<project>/`` directory, matched literally. Drop-in
    replacement for the old ``_discover_session_files`` +
    ``_parse_usage_entries`` loop; dedup is enforced at write time by the
    UNIQUE(msg_id, req_id) index.
    """
    utc = dt.timezone.utc
    params: list[Any] = [
        range_start.astimezone(utc).isoformat(),
        range_end.astimezone(utc).isoformat(),
    ]
    query_parts = [
        "SELECT timestamp_utc, model, input_tokens, output_tokens, "
        "cache_create_tokens, cache_read_tokens, usage_extra_json, cost_usd_raw "
        "FROM session_entries "
        "WHERE timestamp_utc >= ? AND timestamp_utc <= ?"
    ]
    if project is not None:
        # LIKE treats `_` and `%` as wildcards. Prefix them (and the escape
        # character itself, which must go first) with a backslash so the
        # project name only matches literally — "foo_bar" must not also
        # match "fooxbar".
        literal = project
        for special in ("\\", "%", "_"):
            literal = literal.replace(special, "\\" + special)
        query_parts.append(r" AND source_path LIKE ? ESCAPE '\'")
        params.append(f"%/projects/{literal}/%")
    query_parts.append(" ORDER BY timestamp_utc ASC")
    sql = "".join(query_parts)

    entries: list[UsageEntry] = []
    for row in conn.execute(sql, params):
        (
            ts_text, model, inp, out,
            cache_create, cache_read, extra_json, raw_cost,
        ) = row
        usage: dict[str, Any] = {
            "input_tokens": inp,
            "output_tokens": out,
            "cache_creation_input_tokens": cache_create,
            "cache_read_input_tokens": cache_read,
        }
        if extra_json:
            # Merging is safe because sync_cache strips the four token keys
            # before storing usage_extra_json. If that write-side invariant
            # ever changes, extras could shadow the int-normalized columns.
            usage.update(json.loads(extra_json))
        entries.append(
            UsageEntry(
                timestamp=dt.datetime.fromisoformat(ts_text),
                model=model,
                usage=usage,
                cost_usd=raw_cost,
            )
        )
    return entries
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def _collect_entries_direct(
    range_start: dt.datetime,
    range_end: dt.datetime,
    *,
    project: str | None = None,
) -> list[UsageEntry]:
    """Parse usage entries straight from the JSONL files, bypassing the cache.

    Legacy fallback for when the cache DB cannot be used. Session files are
    located with ``_discover_session_files`` and each one is parsed with
    ``_parse_usage_entries``. One ``seen_hashes`` set is handed to every
    parse call, so state recorded while parsing one file is visible while
    parsing the next.
    """
    shared_hashes: set[str] = set()
    collected: list[UsageEntry] = []
    for session_file in _discover_session_files(range_start, project=project):
        parsed = _parse_usage_entries(
            session_file, range_start, range_end, seen_hashes=shared_hashes
        )
        collected.extend(parsed)
    return collected
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
# === Region 4: _JoinedClaudeEntry + get_claude_session_entries (was bin/cctally:2478-2668) ===
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
@dataclass
|
|
682
|
+
class _JoinedClaudeEntry:
|
|
683
|
+
"""session_entries row LEFT JOIN session_files metadata.
|
|
684
|
+
|
|
685
|
+
Row shape returned by `get_claude_session_entries`. `session_id` and
|
|
686
|
+
`project_path` are both nullable — a LEFT JOIN preserves entries whose
|
|
687
|
+
`session_files` metadata has not yet been backfilled by sync_cache's
|
|
688
|
+
`_ensure_session_files_row` hook. The aggregator (Task 19) handles
|
|
689
|
+
`session_id is None` by falling back to the filename UUID and emitting
|
|
690
|
+
a one-shot warning.
|
|
691
|
+
"""
|
|
692
|
+
timestamp: dt.datetime
|
|
693
|
+
model: str
|
|
694
|
+
input_tokens: int
|
|
695
|
+
output_tokens: int
|
|
696
|
+
cache_creation_tokens: int
|
|
697
|
+
cache_read_tokens: int
|
|
698
|
+
source_path: str
|
|
699
|
+
session_id: str | None
|
|
700
|
+
project_path: str | None
|
|
701
|
+
# Raw `costUSD` from the JSONL entry when present (None otherwise).
|
|
702
|
+
# Honored by downstream aggregators so `cache-report --by-session`
|
|
703
|
+
# reconciles with daily/range-cost paths that already pass
|
|
704
|
+
# `entry.cost_usd` into `_calculate_entry_cost`.
|
|
705
|
+
cost_usd: float | None = None
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def get_claude_session_entries(
    range_start: dt.datetime,
    range_end: dt.datetime,
    *,
    project: str | None = None,
    skip_sync: bool = False,
) -> list[_JoinedClaudeEntry]:
    """Return in-range Claude entries, each joined to its per-file metadata.

    Cache-first, in the same shape as `get_entries`: open the cache DB, run
    `sync_cache` (delta ingest + metadata backfill) and then query. The
    query LEFT JOINs `session_entries` to `session_files` (whose PK column
    is `path`, NOT `source_path`), so rows are still returned while the
    Task 16 backfill of `session_id` / `project_path` is pending for a file.

    A direct JSONL parse is used instead when the cache DB cannot be opened
    or when `sync_cache` reports lock contention.

    `project`, when given, is matched literally against the escaped project
    directory name under `<claude>/projects/` using
    `source_path LIKE %/projects/<slug>/%` -- the same semantics as
    `iter_entries(project=...)`.

    `skip_sync=True` skips the JSONL ingest (and therefore the contention
    fallback) and serves whatever is already cached; the cache-open
    fallback still applies.
    """
    try:
        cache = open_cache_db()
    except (sqlite3.DatabaseError, OSError) as exc:
        eprint(f"[cache] unavailable ({exc}); falling back to direct JSONL parse")
        return _direct_parse_claude_session_entries(
            range_start, range_end, project=project
        )

    # Contention means a concurrent ingest may have committed some files but
    # not others, so the cache window could be partial. Prefer correctness
    # and re-parse the JSONL directly -- same rationale as `get_entries`.
    if not skip_sync and sync_cache(cache).lock_contended:
        eprint(
            "[cache] concurrent ingest in progress; "
            "falling back to direct JSONL parse for correctness"
        )
        return _direct_parse_claude_session_entries(
            range_start, range_end, project=project
        )

    utc = dt.timezone.utc
    bind: list[Any] = [
        range_start.astimezone(utc).isoformat(),
        range_end.astimezone(utc).isoformat(),
    ]
    query = (
        "SELECT "
        " se.timestamp_utc, se.model, "
        " se.input_tokens, se.output_tokens, "
        " se.cache_create_tokens, se.cache_read_tokens, "
        " se.source_path, "
        " sf.session_id, sf.project_path, "
        " se.cost_usd_raw "
        "FROM session_entries se "
        "LEFT JOIN session_files sf ON sf.path = se.source_path "
        "WHERE se.timestamp_utc >= ? AND se.timestamp_utc <= ?"
    )
    if project is not None:
        # Neutralize the LIKE metacharacters (and the escape character
        # itself) in a single pass so the project name matches literally.
        like_literal = project.translate(
            str.maketrans({"\\": "\\\\", "%": "\\%", "_": "\\_"})
        )
        query += r" AND se.source_path LIKE ? ESCAPE '\'"
        bind.append(f"%/projects/{like_literal}/%")
    query += " ORDER BY se.timestamp_utc ASC"

    joined: list[_JoinedClaudeEntry] = []
    for (
        ts_iso, model, tok_in, tok_out, tok_cache_create, tok_cache_read,
        src, sid, proj, raw_cost,
    ) in cache.execute(query, bind).fetchall():
        joined.append(
            _JoinedClaudeEntry(
                timestamp=dt.datetime.fromisoformat(ts_iso),
                model=model,
                input_tokens=tok_in,
                output_tokens=tok_out,
                cache_creation_tokens=tok_cache_create,
                cache_read_tokens=tok_cache_read,
                source_path=src,
                session_id=sid,
                project_path=proj,
                cost_usd=raw_cost,
            )
        )
    return joined
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def _direct_parse_claude_session_entries(
    range_start: dt.datetime,
    range_end: dt.datetime,
    *,
    project: str | None = None,
) -> list[_JoinedClaudeEntry]:
    """Fallback when the cache DB is unavailable -- direct JSONL scan.

    Returns `_JoinedClaudeEntry` rows. Unlike the cache-backed path,
    session_id/project_path are derived per file here (not via a JOIN):
    the file is scanned for the first non-empty string `sessionId` / `cwd`
    value; when none is found the filename stem is used as the session id
    and the parent directory name is decoded with `_decode_escaped_cwd` --
    same logic as `_ensure_session_files_row`.

    Args:
        range_start: Lower bound handed to file discovery and to the parser.
        range_end: Upper bound handed to the parser.
        project: Optional project filter forwarded to file discovery.

    Returns:
        One `_JoinedClaudeEntry` per usage entry yielded by
        `_parse_usage_entries`, in file-discovery order.
    """
    results: list[_JoinedClaudeEntry] = []
    files = _discover_session_files(range_start, project=project)
    # Shared across files and passed to every `_parse_usage_entries` call.
    seen_hashes: set[str] = set()

    for fp in files:
        source_path = str(fp)

        # Pull sessionId / cwd from the JSONL (cheap: stops at first hit).
        session_id: str | None = None
        cwd: str | None = None
        try:
            with open(source_path, "r", encoding="utf-8", errors="replace") as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(obj, dict):
                        # Valid JSON that is not an object (e.g. `[]`, `1`,
                        # `null`) has no `.get`; without this guard the
                        # whole fallback would die with AttributeError.
                        continue
                    if session_id is None:
                        sid = obj.get("sessionId")
                        if isinstance(sid, str) and sid:
                            session_id = sid
                    if cwd is None:
                        cwd_val = obj.get("cwd")
                        if isinstance(cwd_val, str) and cwd_val:
                            cwd = cwd_val
                    if session_id is not None and cwd is not None:
                        break
        except OSError:
            # Best effort: keep whatever was found before the read failed
            # and let the filename-based fallbacks below fill the rest.
            pass

        if session_id is None:
            session_id = os.path.splitext(os.path.basename(source_path))[0]
        if cwd is None:
            cwd = _decode_escaped_cwd(os.path.basename(os.path.dirname(source_path)))

        for entry in _parse_usage_entries(
            fp, range_start, range_end, seen_hashes=seen_hashes
        ):
            usage = entry.usage
            results.append(_JoinedClaudeEntry(
                timestamp=entry.timestamp,
                model=entry.model,
                # `or 0` maps an explicit null/None token count to zero.
                input_tokens=int(usage.get("input_tokens", 0) or 0),
                output_tokens=int(usage.get("output_tokens", 0) or 0),
                cache_creation_tokens=int(
                    usage.get("cache_creation_input_tokens", 0) or 0
                ),
                cache_read_tokens=int(
                    usage.get("cache_read_input_tokens", 0) or 0
                ),
                source_path=source_path,
                session_id=session_id,
                project_path=cwd,
                cost_usd=entry.cost_usd,
            ))

    return results
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
# === Region 5: CodexIngestStats + Codex ingest path (was bin/cctally:2671-2923) ===
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
@dataclass
class CodexIngestStats:
    """Counters describing one `sync_codex_cache` pass.

    Every counter starts at zero and `lock_contended` starts False.
    """
    # Number of Codex JSONL files discovered for this pass.
    files_total: int = 0
    # Files whose rows and bookkeeping were committed during this pass.
    files_processed: int = 0
    # Files skipped because their size matched the cached size.
    files_skipped_unchanged: int = 0
    # Files that shrank and had their cached rows deleted before re-ingest.
    files_reset_truncated: int = 0
    # Change-count delta accumulated across the entry INSERTs.
    rows_inserted: int = 0
    # True when the ingest lock was held elsewhere and nothing was synced.
    lock_contended: bool = False
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def _progress_codex_stderr(stats: CodexIngestStats, *, force: bool = False) -> None:
    """Report Codex ingest progress through `eprint`.

    A line is emitted when `force` is set, or when `stats.files_processed`
    is a multiple of 200; otherwise the call does nothing.
    """
    if force or stats.files_processed % 200 == 0:
        eprint(
            f"[codex-cache] {stats.files_processed}/{stats.files_total} files, "
            f"{stats.rows_inserted} new rows"
        )
|
|
895
|
+
|
|
896
|
+
|
|
897
|
+
def sync_codex_cache(
    conn: sqlite3.Connection,
    *,
    progress: Callable[[CodexIngestStats], None] | None = None,
    rebuild: bool = False,
) -> CodexIngestStats:
    """Read-through delta ingest of ~/.codex/sessions/**/*.jsonl.

    Acquires an exclusive fcntl.flock on cache.db.codex.lock (separate from
    the Claude sync lock so the two ingests can run concurrently). On
    contention returns immediately with lock_contended=True.

    When `rebuild=True`, clears the cached rows AFTER acquiring the lock
    so a lost race does not wipe a cache another process is actively
    populating. If the lock is contended on a rebuild, the cache is left
    untouched and the caller sees `lock_contended=True`.

    Per file, the stored size decides what happens: same size -> skipped;
    larger -> parsing resumes from the stored byte offset with the stored
    session id / model / cumulative token total as the starting state;
    smaller -> the file is treated as truncated, its cached rows are
    deleted and it is re-read from offset 0. Each file is committed on its
    own; a read or DB error on one file is reported and the loop moves on.

    Args:
        conn: Open cache DB connection; written to and committed per file.
        progress: Optional callback invoked with the running stats after
            each successfully committed file and once more after the loop.
        rebuild: Delete all cached Codex rows (under the lock) first.

    Returns:
        The `CodexIngestStats` accumulated by this call.
    """
    stats = CodexIngestStats()
    c = _cctally()
    c.APP_DIR.mkdir(parents=True, exist_ok=True)
    c.CACHE_LOCK_CODEX_PATH.touch()

    lock_fh = open(c.CACHE_LOCK_CODEX_PATH, "w")
    try:
        try:
            # Non-blocking: a held lock raises instead of waiting.
            fcntl.flock(lock_fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            eprint("[codex-cache] sync already in progress; using existing cache")
            stats.lock_contended = True
            return stats

        if rebuild:
            # Clear INSIDE the lock — see sync_cache() for the full
            # rationale. Done before the existing SELECT so delta
            # detection sees an empty baseline.
            conn.execute("DELETE FROM codex_session_entries")
            conn.execute("DELETE FROM codex_session_files")
            conn.commit()
            eprint("[cache-sync] rebuild: cleared Codex cached entries")

        root = _get_codex_sessions_dir()
        paths: list[pathlib.Path] = []
        if root is not None:
            for jp in root.glob("**/*.jsonl"):
                if jp.is_file():
                    paths.append(jp)
        stats.files_total = len(paths)

        # This SELECT does NOT open an implicit transaction (Python's
        # sqlite3 module only BEGINs on DML). Do NOT add any INSERT/
        # UPDATE/DELETE/REPLACE statement between here and the per-file
        # loop below — the read+parse inside that loop must run with
        # zero cache.db write lock held.
        #
        # mtime_ns is selected into `existing` for diagnostics only —
        # delta detection consults size alone (Codex rollout JSONLs are
        # append-only, so a size change is a sufficient signal and mtime
        # is prone to clock-skew false-positives).
        existing = {
            row[0]: (row[1], row[2], row[3], row[4], row[5], row[6])
            for row in conn.execute(
                "SELECT path, size_bytes, mtime_ns, last_byte_offset, "
                "last_session_id, last_model, last_total_tokens "
                "FROM codex_session_files"
            )
        }

        for jp in paths:
            path_str = str(jp)
            try:
                st = jp.stat()
            except OSError as exc:
                eprint(f"[codex-cache] stat failed for {jp}: {exc}")
                continue

            size = st.st_size
            mtime_ns = st.st_mtime_ns
            prev = existing.get(path_str)
            # Defaults describe a never-seen file: read from the start
            # with no carried-over parser state.
            start_offset = 0
            truncated = False
            initial_session_id: str | None = None
            initial_model: str | None = None
            initial_total_tokens = 0
            prev_total_tokens: int | None = None
            if prev is not None:
                (
                    prev_size, _, prev_offset, prev_sid, prev_model, prev_ttot,
                ) = prev
                prev_total_tokens = (
                    int(prev_ttot) if prev_ttot is not None else None
                )
                if size == prev_size:
                    stats.files_skipped_unchanged += 1
                    continue
                if size > prev_size:
                    # Grew: resume where the previous ingest stopped.
                    start_offset = prev_offset
                    initial_session_id = prev_sid
                    initial_model = prev_model
                    initial_total_tokens = prev_total_tokens or 0
                else:
                    # Shrank: discard carried state and start over.
                    truncated = True
                    start_offset = 0
                    initial_session_id = None
                    initial_model = None
                    initial_total_tokens = 0
                    prev_total_tokens = None

            rows: list[tuple[Any, ...]] = []
            final_offset = start_offset
            # Mutable tracker that the iterator updates on every
            # session_meta / turn_context record, regardless of whether a
            # later token_count yields. Without this, a delta window that
            # ends on a metadata-only tail would lose the terminal
            # session_id/model and the next resume would mis-attribute the
            # first post-resume token_count.
            iter_state = _CodexIterState(
                session_id=initial_session_id,
                model=initial_model,
            )
            # Track the cumulative `total_token_usage.total_tokens` across this
            # call. The iterator only yields when the cumulative strictly
            # advances by the current turn's `last_token_usage.total_tokens`,
            # so summing the per-turn totals reconstructs the final cumulative.
            running_total = initial_total_tokens
            yielded_count = 0
            try:
                with open(jp, "r", encoding="utf-8", errors="replace") as fh:
                    fh.seek(start_offset)
                    for offset, entry in _iter_codex_jsonl_entries_with_offsets(
                        fh,
                        path_str,
                        initial_session_id=initial_session_id,
                        initial_model=initial_model,
                        initial_total_tokens=initial_total_tokens,
                        state=iter_state,
                    ):
                        # Column order must match the INSERT below.
                        rows.append((
                            path_str,
                            offset,
                            entry.timestamp.astimezone(dt.timezone.utc).isoformat(),
                            entry.session_id,
                            entry.model,
                            entry.input_tokens,
                            entry.cached_input_tokens,
                            entry.output_tokens,
                            entry.reasoning_output_tokens,
                            entry.total_tokens,
                        ))
                        running_total += int(entry.total_tokens or 0)
                        yielded_count += 1
                    # Position after the iterator is exhausted; stored as
                    # the resume point for the next sync.
                    final_offset = fh.tell()
            except OSError as exc:
                eprint(f"[codex-cache] could not read {jp}: {exc}")
                continue

            # Pull terminal session_id/model from the iterator's tracker.
            # This picks up updates from session_meta / turn_context events
            # that occurred AFTER the last yielded token_count (or when no
            # token_count yielded at all), which the in-loop assignment
            # would have missed.
            new_last_session_id: str | None = (
                iter_state.session_id
                if iter_state.session_id is not None
                else initial_session_id
            )
            new_last_model: str | None = (
                iter_state.model
                if iter_state.model is not None
                else initial_model
            )

            # Persist the running cumulative if we yielded this call. Otherwise
            # preserve the prior value — never overwrite with 0, which would
            # re-enable double-counting on the next resume.
            new_last_total_tokens: int | None = (
                running_total if yielded_count > 0 else prev_total_tokens
            )

            # Python's sqlite3 module starts an implicit transaction on the
            # first DML statement and commits on conn.commit(). We do NOT
            # call "BEGIN IMMEDIATE" ourselves — see sync_cache() for the
            # full rationale. DELETE + INSERTs + UPDATE happen atomically in
            # a single commit.
            try:
                if truncated:
                    conn.execute(
                        "DELETE FROM codex_session_entries WHERE source_path = ?",
                        (path_str,),
                    )
                    stats.files_reset_truncated += 1
                if rows:
                    # total_changes delta counts rows actually written;
                    # rows dropped by OR IGNORE do not add to it.
                    before = conn.total_changes
                    conn.executemany(
                        """INSERT OR IGNORE INTO codex_session_entries
                           (source_path, line_offset, timestamp_utc, session_id,
                            model, input_tokens, cached_input_tokens,
                            output_tokens, reasoning_output_tokens,
                            total_tokens)
                           VALUES (?,?,?,?,?,?,?,?,?,?)""",
                        rows,
                    )
                    stats.rows_inserted += conn.total_changes - before
                conn.execute(
                    """INSERT OR REPLACE INTO codex_session_files
                       (path, size_bytes, mtime_ns, last_byte_offset,
                        last_ingested_at, last_session_id, last_model,
                        last_total_tokens)
                       VALUES (?,?,?,?,?,?,?,?)""",
                    (
                        path_str, size, mtime_ns, final_offset,
                        dt.datetime.now(dt.timezone.utc).isoformat(),
                        new_last_session_id, new_last_model,
                        new_last_total_tokens,
                    ),
                )
                conn.commit()
                stats.files_processed += 1
            except sqlite3.DatabaseError as exc:
                eprint(f"[codex-cache] db error on {jp}: {exc}")
                # Drop this file's partial writes; earlier files stay committed.
                conn.rollback()
                continue

            if progress is not None:
                progress(stats)

        if progress is not None:
            progress(stats)
        return stats
    finally:
        # Runs on every exit path, including the contended early return.
        try:
            fcntl.flock(lock_fh, fcntl.LOCK_UN)
        except OSError:
            pass
        lock_fh.close()
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
def iter_codex_entries(
    conn: sqlite3.Connection,
    range_start: dt.datetime,
    range_end: dt.datetime,
) -> list[CodexEntry]:
    """Load cached Codex rows whose timestamp lies in [range_start, range_end].

    Both bounds are inclusive. They are converted to UTC ISO-8601 strings
    before being compared with the stored `timestamp_utc` column, and the
    result is ordered by that column ascending.
    """
    utc = dt.timezone.utc
    bounds = (
        range_start.astimezone(utc).isoformat(),
        range_end.astimezone(utc).isoformat(),
    )
    cursor = conn.execute(
        "SELECT timestamp_utc, session_id, model, "
        "input_tokens, cached_input_tokens, output_tokens, "
        "reasoning_output_tokens, total_tokens, source_path "
        "FROM codex_session_entries "
        "WHERE timestamp_utc >= ? AND timestamp_utc <= ? "
        "ORDER BY timestamp_utc ASC",
        bounds,
    )
    return [
        CodexEntry(
            timestamp=dt.datetime.fromisoformat(ts_iso),
            session_id=sid,
            model=model,
            input_tokens=tok_in,
            cached_input_tokens=tok_cached,
            output_tokens=tok_out,
            reasoning_output_tokens=tok_reasoning,
            total_tokens=tok_total,
            source_path=src,
        )
        for (
            ts_iso, sid, model, tok_in, tok_cached,
            tok_out, tok_reasoning, tok_total, src,
        ) in cursor
    ]
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
def _collect_codex_entries_direct(
    range_start: dt.datetime,
    range_end: dt.datetime,
) -> list[CodexEntry]:
    """Parse Codex entries straight from the JSONL files, bypassing cache.db.

    Legacy fallback. Every file returned by `_discover_codex_session_files`
    is parsed from the beginning and only entries whose timestamp lies in
    the inclusive range [range_start, range_end] are kept. A file that
    cannot be read is reported via `eprint`; entries gathered before the
    failure are kept and the remaining files are still processed.
    """
    collected: list[CodexEntry] = []
    for session_file in _discover_codex_session_files(range_start):
        try:
            with open(session_file, "r", encoding="utf-8", errors="replace") as handle:
                parsed = _iter_codex_jsonl_entries_with_offsets(
                    handle, str(session_file)
                )
                for _, candidate in parsed:
                    if range_start <= candidate.timestamp <= range_end:
                        collected.append(candidate)
        except OSError as exc:
            eprint(f"[codex] could not read {session_file}: {exc}")
    return collected
|
|
1182
|
+
|
|
1183
|
+
|
|
1184
|
+
def get_codex_entries(
    range_start: dt.datetime,
    range_end: dt.datetime,
) -> list[CodexEntry]:
    """Fetch Codex entries cache-first, falling back to a direct parse.

    Every Codex-reading command must go through this function rather than
    touching open_cache_db directly.

    The direct JSONL parse is used when the cache DB cannot be opened, and
    also whenever `sync_codex_cache` reports lock contention.
    """
    try:
        cache = open_cache_db()
    except (sqlite3.DatabaseError, OSError) as exc:
        eprint(f"[cache] unavailable ({exc}); falling back to direct JSONL parse")
        return _collect_codex_entries_direct(range_start, range_end)

    if not sync_codex_cache(cache).lock_contended:
        return iter_codex_entries(cache, range_start, range_end)

    # Sync commits file by file, so losing the ingest lock (for instance to
    # a concurrent --rebuild, or to a first-run sync that is still running)
    # can leave the cache PARTIALLY populated: some files ingested, others
    # pending. An "is the table empty?" check would pass in that window and
    # rows in the caller's range would silently go missing. So contention
    # always means a direct JSONL parse -- correctness over speed for the
    # rare-but-real window where cache state does not match disk.
    eprint(
        "[cache] concurrent codex ingest in progress; "
        "falling back to direct JSONL parse for correctness"
    )
    return _collect_codex_entries_direct(range_start, range_end)
|
|
1214
|
+
|
|
1215
|
+
|
|
1216
|
+
def get_entries(
    range_start: dt.datetime,
    range_end: dt.datetime,
    *,
    project: str | None = None,
    skip_sync: bool = False,
) -> list[UsageEntry]:
    """Fetch usage entries cache-first, falling back to a direct parse.

    Every JSONL-consuming command should call this instead of talking to
    open_cache_db directly.

    With `skip_sync=True` the JSONL ingest is bypassed and whatever is
    already cached is served. The cache-open fallback still applies when
    the cache DB is unusable, but the ingest and the lock-contention
    fallback are both skipped.
    """
    try:
        cache = open_cache_db()
    except (sqlite3.DatabaseError, OSError) as exc:
        eprint(f"[cache] unavailable ({exc}); falling back to direct JSONL parse")
        return _collect_entries_direct(range_start, range_end, project=project)

    # Sync commits file by file, so losing the ingest lock (for instance to
    # a concurrent --rebuild, or to a first-run sync that is still running)
    # can leave the cache PARTIALLY populated: some files ingested, others
    # pending. An "is the table empty?" check would pass in that window and
    # rows in the caller's range would silently go missing. So contention
    # always means a direct JSONL parse -- correctness over speed for the
    # rare-but-real window where cache state does not match disk.
    if not skip_sync and sync_cache(cache).lock_contended:
        eprint(
            "[cache] concurrent ingest in progress; "
            "falling back to direct JSONL parse for correctness"
        )
        return _collect_entries_direct(range_start, range_end, project=project)

    return iter_entries(cache, range_start, range_end, project=project)
|
|
1252
|
+
|
|
1253
|
+
|
|
1254
|
+
# === Region 6: open_cache_db (was bin/cctally:9040-9155) ===
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
def open_cache_db() -> sqlite3.Connection:
    """Open (or create) the session-entry cache DB.

    Enables WAL mode so queries can run concurrently with an in-progress
    ingest. When the file turns out not to be a usable SQLite database
    (corruption) it is unlinked and recreated — the cache is fully
    re-derivable from JSONL, so this is safe.

    After opening, the schema is created if missing, the inline column
    migrations are applied, and the cache.db migration dispatcher runs.

    Returns:
        An open connection with the schema in place.

    Raises:
        sqlite3.DatabaseError: If the database still cannot be opened or
            initialized (includes lock / I/O errors, which are NOT treated
            as corruption).
        OSError: If the app directory or the DB file cannot be managed.
    """
    c = _cctally()
    c.APP_DIR.mkdir(parents=True, exist_ok=True)
    conn: sqlite3.Connection | None = None
    try:
        conn = sqlite3.connect(c.CACHE_DB_PATH)
        # The probe has to read the database header. A bare `SELECT 1`
        # references no table and never touches the file, so a corrupt or
        # non-SQLite file would pass it and only blow up at the WAL pragma
        # below, outside this recovery path.
        conn.execute("PRAGMA schema_version").fetchone()
    except sqlite3.DatabaseError as exc:
        if conn is not None:
            # Release the handle on the bad file before replacing it.
            conn.close()
            if isinstance(exc, sqlite3.OperationalError):
                # Lock contention or I/O trouble on a file that did open
                # is not corruption; do not delete a possibly valid cache.
                raise
        eprint(f"[cache] corrupt cache DB ({exc}); recreating")
        try:
            c.CACHE_DB_PATH.unlink()
        except FileNotFoundError:
            pass
        conn = sqlite3.connect(c.CACHE_DB_PATH)

    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA busy_timeout=5000")

    conn.executescript(
        """
        CREATE TABLE IF NOT EXISTS session_files (
            path             TEXT PRIMARY KEY,
            size_bytes       INTEGER NOT NULL,
            mtime_ns         INTEGER NOT NULL,
            last_byte_offset INTEGER NOT NULL,
            last_ingested_at TEXT NOT NULL
        );
        CREATE TABLE IF NOT EXISTS session_entries (
            id                  INTEGER PRIMARY KEY AUTOINCREMENT,
            source_path         TEXT NOT NULL,
            line_offset         INTEGER NOT NULL,
            timestamp_utc       TEXT NOT NULL,
            model               TEXT NOT NULL,
            msg_id              TEXT,
            req_id              TEXT,
            input_tokens        INTEGER NOT NULL DEFAULT 0,
            output_tokens       INTEGER NOT NULL DEFAULT 0,
            cache_create_tokens INTEGER NOT NULL DEFAULT 0,
            cache_read_tokens   INTEGER NOT NULL DEFAULT 0,
            usage_extra_json    TEXT,
            cost_usd_raw        REAL
        );
        CREATE INDEX IF NOT EXISTS idx_entries_timestamp
            ON session_entries(timestamp_utc);
        CREATE INDEX IF NOT EXISTS idx_entries_source
            ON session_entries(source_path);
        CREATE UNIQUE INDEX IF NOT EXISTS idx_entries_dedup
            ON session_entries(msg_id, req_id)
            WHERE msg_id IS NOT NULL AND req_id IS NOT NULL;

        CREATE TABLE IF NOT EXISTS codex_session_files (
            path             TEXT PRIMARY KEY,
            size_bytes       INTEGER NOT NULL,
            mtime_ns         INTEGER NOT NULL,
            last_byte_offset INTEGER NOT NULL,
            last_ingested_at TEXT NOT NULL,
            last_session_id  TEXT,
            last_model       TEXT
        );
        CREATE TABLE IF NOT EXISTS codex_session_entries (
            id                      INTEGER PRIMARY KEY AUTOINCREMENT,
            source_path             TEXT NOT NULL,
            line_offset             INTEGER NOT NULL,
            timestamp_utc           TEXT NOT NULL,
            session_id              TEXT NOT NULL,
            model                   TEXT NOT NULL,
            input_tokens            INTEGER NOT NULL DEFAULT 0,
            cached_input_tokens     INTEGER NOT NULL DEFAULT 0,
            output_tokens           INTEGER NOT NULL DEFAULT 0,
            reasoning_output_tokens INTEGER NOT NULL DEFAULT 0,
            total_tokens            INTEGER NOT NULL DEFAULT 0,
            UNIQUE(source_path, line_offset)
        );
        CREATE INDEX IF NOT EXISTS idx_codex_entries_timestamp
            ON codex_session_entries(timestamp_utc);
        CREATE INDEX IF NOT EXISTS idx_codex_entries_session
            ON codex_session_entries(session_id);
        CREATE INDEX IF NOT EXISTS idx_codex_entries_source
            ON codex_session_entries(source_path);
        """
    )

    # Inline migration: add session_id / project_path columns to session_files
    # if they're missing. These were added for A2 `session` subcommand metadata;
    # populated lazily in sync_cache() / _ensure_session_files_row().
    add_column_if_missing(conn, "session_files", "session_id", "TEXT")
    add_column_if_missing(conn, "session_files", "project_path", "TEXT")
    conn.execute(
        "CREATE INDEX IF NOT EXISTS idx_session_files_session_id "
        "ON session_files(session_id)"
    )

    # Migration: add last_total_tokens to codex_session_files. When the column
    # is newly added (i.e. this is the first run after upgrade), purge the
    # Codex cache so the duplicate-counted rows produced by the previous
    # iterator are reingested cleanly by sync_codex_cache(). The cache is
    # fully re-derivable from ~/.codex/sessions/*.jsonl so this is safe.
    if add_column_if_missing(conn, "codex_session_files", "last_total_tokens", "INTEGER"):
        conn.execute("DELETE FROM codex_session_entries")
        conn.execute("DELETE FROM codex_session_files")
        conn.commit()
        eprint("[cache] migrated codex cache — re-ingesting")

    # Migration framework dispatcher for cache.db. The registry is empty in
    # v1 — this is preparatory wiring that activates when the next cache.db
    # migration ships. With an empty registry the dispatcher hits the
    # fast-path or fresh-install branch and returns immediately. See spec
    # §2.5, §3.3 + the @cache_migration decorator further down in this file.
    _run_pending_migrations(
        conn, registry=_CACHE_MIGRATIONS, db_label="cache.db",
    )
    return conn
|
|
1374
|
+
|
|
1375
|
+
|
|
1376
|
+
# === Region 7: cmd_cache_sync (was bin/cctally:11563-11616) ===
|
|
1377
|
+
|
|
1378
|
+
|
|
1379
|
+
def cmd_cache_sync(args: argparse.Namespace) -> int:
    """Run an explicit sync (or full rebuild) of the session-entry cache.

    Every JSONL-reading command already syncs the cache transparently;
    this subcommand is for priming it ahead of time (e.g. via launchd)
    and for forcing a full rebuild after pricing-dict changes or cache
    corruption.

    ``args.source`` (one of ``claude`` / ``codex`` / ``all``; treated as
    ``all`` when the attribute is absent) picks which half(s) are
    synced. ``args.rebuild`` is forwarded to the per-half sync routine.

    Returns 0 unconditionally.
    """
    selected = getattr(args, "source", "all")
    db = open_cache_db()

    # The --rebuild DELETE is intentionally NOT issued here. It is left
    # to sync_cache / sync_codex_cache, which run it only AFTER taking
    # the flock. Deleting up front would empty the cache even when the
    # following sync loses the lock race and bails, leaving the user
    # with no cached state. See the sync_cache() / sync_codex_cache()
    # docstrings.
    halves = (
        ("claude", sync_cache, _progress_stderr),
        ("codex", sync_codex_cache, _progress_codex_stderr),
    )

    for label, sync_fn, report_progress in halves:
        if selected not in (label, "all"):
            continue

        result = sync_fn(db, progress=report_progress, rebuild=args.rebuild)
        # Force one last progress line regardless of throttling.
        report_progress(result, force=True)

        if not result.lock_contended:
            eprint(
                f"[cache-sync] {label} done: {result.files_processed} processed, "
                f"{result.files_skipped_unchanged} skipped, "
                f"{result.files_reset_truncated} reset, "
                f"{result.rows_inserted} rows inserted"
            )
        elif args.rebuild:
            # Lock was held elsewhere, so the requested rebuild did not run.
            eprint(
                f"[cache-sync] rebuild skipped ({label}): "
                "another process holds the lock"
            )

    return 0
|