cctally 1.7.0 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,403 @@
1
+ """JSONL entry parsing for Claude + Codex session files.
2
+
3
+ Pure-fn layer (no I/O at import time): holds the two streaming readers
4
+ that delta-resume Claude `~/.claude/projects/**/*.jsonl` and Codex
5
+ `~/.codex/sessions/**/*.jsonl` files, the bulk parser that does
6
+ range-filtered + msg-id/req-id-dedup'd reads (the legacy entry point
7
+ preserved for paths that don't go through `cache.db`), and the
8
+ dataclasses they produce (`UsageEntry`, `CodexEntry`) + the mutable
9
+ cross-call tracker (`_CodexIterState`).
10
+
11
+ `bin/cctally` re-exports every public symbol below so the ~50 internal
12
+ call sites + SourceFileLoader-based tests
13
+ (`tests/test_dashboard_api_block`, `tests/test_blocks_recorded_anchor`,
14
+ `bin/build-codex-fixtures.py`) resolve unchanged. Zero call-time
15
+ back-references to `bin/cctally`: this module is a pure leaf in the
16
+ sibling graph. The only cross-module helper used (`eprint`) is
17
+ duplicated as a private `_eprint` per the split design's §5.3 contract.
18
+
19
+ Spec: docs/superpowers/specs/2026-05-13-bin-cctally-split-design.md
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import datetime as dt
24
+ import json
25
+ import pathlib
26
+ import re
27
+ import sys
28
+ from dataclasses import dataclass
29
+ from typing import Any
30
+
31
+
32
def _eprint(*args: Any) -> None:
    """Write *args* to standard error using ``print`` formatting.

    Private duplicate of the cross-module ``eprint`` helper so this module
    keeps no call-time dependency on ``bin/cctally``.
    """
    err_stream = sys.stderr
    print(*args, file=err_stream)
34
+
35
+
36
@dataclass
class UsageEntry:
    """One parsed Claude assistant usage record.

    Produced by the Claude JSONL readers in this module for every
    ``type == "assistant"`` line that carries a usable timestamp, model
    and ``usage`` dict.
    """

    # Timezone-aware timestamp of the entry (naive inputs are tagged UTC
    # by the readers before construction).
    timestamp: dt.datetime
    # Model identifier with surrounding whitespace stripped.
    model: str
    # The raw ``usage`` mapping exactly as it appeared in the JSONL line.
    usage: dict[str, Any]
    # Value of the line's ``costUSD`` field, or None when it was absent.
    cost_usd: float | None
42
+
43
+
44
@dataclass
class CodexEntry:
    """A single emitted Codex `token_count` event row.

    Field layout mirrors the columns of codex_session_entries. The token
    counts come from the event's `last_token_usage` (per-turn deltas), not
    from the cumulative totals.
    """

    # Timezone-aware event timestamp.
    timestamp: dt.datetime
    # Session identifier (from `session_meta`, or the filename UUID fallback).
    session_id: str
    # Model name from the latest `turn_context`, or "unknown" if none was seen.
    model: str
    # Per-turn token counters taken from `last_token_usage`.
    input_tokens: int
    cached_input_tokens: int
    output_tokens: int
    reasoning_output_tokens: int
    total_tokens: int
    # Path string of the rollout file the row was read from.
    source_path: str
60
+
61
+
62
def _parse_usage_entries(
    jsonl_path: pathlib.Path,
    range_start: dt.datetime,
    range_end: dt.datetime,
    seen_hashes: set[str] | None = None,
) -> list[UsageEntry]:
    """Parse assistant entries from a JSONL file within the given time range.

    Args:
        jsonl_path: Path of the JSONL session file to read.
        range_start: Inclusive lower bound; entries with an earlier
            timestamp are dropped. Must be timezone-aware, because parsed
            timestamps are always made aware before comparison.
        range_end: Inclusive upper bound, same requirements as
            ``range_start``.
        seen_hashes: Optional shared set of ``"<message.id>:<requestId>"``
            keys. When supplied, entries whose key is already present are
            skipped and new keys are added in place. When ``None`` no
            de-duplication is performed.

    Returns:
        The matching entries in file order. Lines that are blank, are not
        valid JSON, are not JSON objects, are not ``type == "assistant"``,
        or lack a usable timestamp / ``usage`` dict / model are skipped.
        An unreadable file is reported on stderr and whatever was parsed
        before the error is returned.
    """
    entries: list[UsageEntry] = []
    try:
        with open(jsonl_path, "r", encoding="utf-8", errors="replace") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue

                # json.loads happily returns lists/strings/numbers/None for
                # valid non-object lines; those have no `.get` and used to
                # abort the whole file with AttributeError.
                if not isinstance(obj, dict):
                    continue

                if obj.get("type") != "assistant":
                    continue

                ts_raw = obj.get("timestamp")
                if not isinstance(ts_raw, str) or not ts_raw.strip():
                    continue

                # Usage/model normally live under "message"; fall back to the
                # top-level object when that wrapper is absent.
                msg = obj.get("message")
                if not isinstance(msg, dict):
                    msg = obj

                usage = msg.get("usage")
                if not isinstance(usage, dict):
                    continue

                model = msg.get("model") or obj.get("model")
                if not isinstance(model, str) or not model.strip():
                    continue

                try:
                    ts = dt.datetime.fromisoformat(
                        ts_raw.strip().replace("Z", "+00:00")
                    )
                except ValueError:
                    continue
                if ts.tzinfo is None:
                    ts = ts.replace(tzinfo=dt.timezone.utc)

                if ts < range_start or ts > range_end:
                    continue

                # Deduplicate by message.id + requestId (same as ccusage)
                msg_id = msg.get("id")
                req_id = obj.get("requestId")
                if (
                    seen_hashes is not None
                    and msg_id is not None
                    and req_id is not None
                ):
                    entry_hash = f"{msg_id}:{req_id}"
                    if entry_hash in seen_hashes:
                        continue
                    seen_hashes.add(entry_hash)

                # A malformed costUSD is treated as "no recorded cost"
                # instead of aborting the parse of the whole file.
                cost_usd_raw = obj.get("costUSD")
                try:
                    cost_usd = (
                        float(cost_usd_raw)
                        if cost_usd_raw is not None
                        else None
                    )
                except (TypeError, ValueError):
                    cost_usd = None

                entries.append(UsageEntry(
                    timestamp=ts,
                    model=model.strip(),
                    usage=usage,
                    cost_usd=cost_usd,
                ))
    except OSError as exc:
        _eprint(f"[cost] could not read {jsonl_path}: {exc}")

    return entries
139
+
140
+
141
def _iter_jsonl_entries_with_offsets(fh):
    """Yield (byte_offset, UsageEntry, msg_id, req_id) for each assistant
    entry starting from fh's current position.

    Uses readline()+tell() rather than `for line in fh` so byte offsets are
    accurate for resume-from-offset after partial ingests. Malformed JSON,
    JSON values that are not objects, and non-assistant lines are skipped,
    but the offset still advances past them so they are never re-read.
    Range filtering is intentionally NOT done here — filters are applied at
    query time by iter_entries().

    `byte_offset` is `fh.tell()` taken immediately before the yielded line
    was read. `msg_id` / `req_id` are the raw `message.id` / `requestId`
    values (possibly None); de-duplication is left to the caller. A
    malformed `costUSD` yields `cost_usd=None` rather than raising.
    """
    while True:
        offset = fh.tell()
        line = fh.readline()
        if not line:
            return
        if not line.endswith("\n"):
            # Partial tail line — writer is mid-flight. Rewind so the
            # next sync re-reads this line once the newline is in place.
            # Without this, sync_cache would store fh.tell() (past the
            # partial) as last_byte_offset and permanently skip the entry.
            fh.seek(offset)
            return
        stripped = line.strip()
        if not stripped:
            continue
        try:
            obj = json.loads(stripped)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list/str/number/null) has no
        # `.get`; skip it instead of letting AttributeError kill the sync.
        if not isinstance(obj, dict):
            continue
        if obj.get("type") != "assistant":
            continue

        ts_raw = obj.get("timestamp")
        if not isinstance(ts_raw, str) or not ts_raw.strip():
            continue

        # Usage/model normally live under "message"; fall back to the
        # top-level object when that wrapper is absent.
        msg = obj.get("message")
        if not isinstance(msg, dict):
            msg = obj

        usage = msg.get("usage")
        if not isinstance(usage, dict):
            continue

        model = msg.get("model") or obj.get("model")
        if not isinstance(model, str) or not model.strip():
            continue

        try:
            ts = dt.datetime.fromisoformat(ts_raw.strip().replace("Z", "+00:00"))
        except ValueError:
            continue
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=dt.timezone.utc)

        msg_id = msg.get("id")
        req_id = obj.get("requestId")

        # Treat an unparseable costUSD as "no recorded cost" so one bad
        # field cannot abort the ingest of the rest of the file.
        cost_usd_raw = obj.get("costUSD")
        try:
            cost_usd = float(cost_usd_raw) if cost_usd_raw is not None else None
        except (TypeError, ValueError):
            cost_usd = None

        yield (
            offset,
            UsageEntry(
                timestamp=ts,
                model=model.strip(),
                usage=usage,
                cost_usd=cost_usd,
            ),
            msg_id,
            req_id,
        )
212
+
213
+
214
# Matches a Codex rollout filename of the form
# `rollout-YYYY-MM-DDTHH-MM-SS-<36-char uuid>.jsonl` at the end of a path and
# captures the 36-character hex/dash identifier as group 1.
_CODEX_FILENAME_UUID_RE = re.compile(
    pattern=r"rollout-\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-([0-9a-fA-F-]{36})\.jsonl$",
)
217
+
218
+
219
@dataclass
class _CodexIterState:
    """Mutable per-file tracker shared with callers of
    `_iter_codex_jsonl_entries_with_offsets`.

    The iterator writes the latest session_id/model into this object as it
    streams, so the terminal values stay observable even when the delta
    window ends on a `session_meta` or `turn_context` event that is not
    followed by a yielded `token_count`. Callers seed it with
    previously-persisted values and read it back once the iterator has
    drained.
    """

    # Most recent session id observed (or seeded); None until known.
    session_id: str | None = None
    # Most recent model observed (or seeded); None until known.
    model: str | None = None
    # NOTE(review): the iterator in this module neither reads nor updates
    # this field — presumably maintained by the caller; confirm.
    total_tokens: int = 0
231
+
232
+
233
def _iter_codex_jsonl_entries_with_offsets(
    fh,
    path_str: str,
    *,
    initial_session_id: str | None = None,
    initial_model: str | None = None,
    initial_total_tokens: int = 0,
    state: _CodexIterState | None = None,
):
    """Yield (line_offset, CodexEntry) for each billable `token_count` event.

    Maintains per-file state (session_id, model) as records are streamed.
    Callers performing a delta resume from non-zero byte offset should pass
    the previously-observed session_id/model as initial_session_id and
    initial_model so attribution stays correct even if the new byte range
    contains no fresh session_meta / turn_context record.

    If `state` is supplied it is updated in-place on every `session_meta`
    / `turn_context` record regardless of whether any subsequent
    `token_count` actually yields. This lets callers observe the iterator's
    terminal state even when the delta window ends on a metadata record —
    otherwise `last_model` would silently persist a stale value and the
    next resume would mis-attribute the first post-resume token_count.

    Skips token_count events with payload.info == None (rate-limit-only
    events). Falls back to filename-derived session_id with a one-shot warning
    if session_meta is never observed. Lines that are blank, are not valid
    JSON, or decode to something other than a JSON object are skipped.

    Codex CLI emits multiple `token_count` events per completed turn (UI/
    turn_context updates re-emit the same `last_token_usage` while the
    cumulative `info.total_token_usage.total_tokens` stays flat). To avoid
    double-counting, we track the cumulative total across yields and skip
    any event whose cumulative total is not strictly greater than the
    previously-seen cumulative. Callers doing delta resumes should pass the
    last persisted cumulative as `initial_total_tokens`. If `total_token_usage`
    is missing or non-dict (older Codex builds), we fall back to yielding
    unconditionally — preserving legacy behavior on those rollouts.

    Readline()+tell() is used rather than `for line in fh` so byte offsets
    are accurate for resume-from-offset after partial ingests. Partial-tail
    lines (no trailing \\n) trigger a seek-back so the next sync re-reads
    the line once the newline is flushed.

    NOTE(review): `state.total_tokens` is neither read nor written here;
    the cumulative watermark lives only in a local seeded from
    `initial_total_tokens`. Confirm callers persist it themselves.
    """
    if state is None:
        state = _CodexIterState()
    # Seed the tracker from the kwargs. Kwargs take priority only when the
    # caller-supplied state has no value yet — this preserves the existing
    # contract for callers that pass kwargs without a state object, while
    # letting callers who DO pass a pre-populated state see it honored.
    if state.session_id is None and initial_session_id is not None:
        state.session_id = initial_session_id
    if state.model is None and initial_model is not None:
        state.model = initial_model
    last_total_tokens: int = int(initial_total_tokens or 0)
    # Suppress the filename-UUID fallback warning when we already have a
    # seeded session_id (delta resume path). Without this, every resume
    # into a slice of the file that doesn't re-observe session_meta would
    # noisily warn even though attribution is correct.
    filename_session_id_warned = state.session_id is not None
    filename_uuid_match = _CODEX_FILENAME_UUID_RE.search(path_str)
    filename_uuid = filename_uuid_match.group(1) if filename_uuid_match else None

    def _int(usage: dict, key: str) -> int:
        # Lenient counter read: missing, null, or unparseable values count
        # as 0. Defined once here instead of once per loop iteration.
        try:
            return int(usage.get(key) or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    while True:
        offset = fh.tell()
        line = fh.readline()
        if not line:
            return
        if not line.endswith("\n"):
            # Partial tail line: rewind so the next sync re-reads it.
            fh.seek(offset)
            return
        stripped = line.strip()
        if not stripped:
            continue
        try:
            obj = json.loads(stripped)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list/str/number/null) has no
        # `.get`; skip it instead of letting AttributeError kill the sync.
        if not isinstance(obj, dict):
            continue

        rtype = obj.get("type")
        payload = obj.get("payload")
        if not isinstance(payload, dict):
            payload = {}

        if rtype == "session_meta":
            sid = payload.get("id")
            if isinstance(sid, str) and sid:
                state.session_id = sid
            continue

        if rtype == "turn_context":
            m = payload.get("model")
            if isinstance(m, str) and m.strip():
                state.model = m.strip()
            continue

        if rtype != "event_msg":
            continue

        if payload.get("type") != "token_count":
            continue
        info = payload.get("info")
        if not isinstance(info, dict):
            continue
        ltu = info.get("last_token_usage")
        if not isinstance(ltu, dict):
            continue

        # Dedupe re-emitted token_count events. Codex re-emits `last_token_usage`
        # on UI/turn_context updates with a flat `total_token_usage.total_tokens`;
        # only yield once per actual turn by requiring the cumulative to strictly
        # advance. If `total_token_usage` is missing or non-dict (older Codex
        # builds), skip the guard and yield — preserving legacy behavior.
        ttu = info.get("total_token_usage")
        cumulative: int | None = None
        if isinstance(ttu, dict):
            cumulative = _int(ttu, "total_tokens")
            if cumulative <= last_total_tokens:
                continue

        ts_raw = obj.get("timestamp")
        if not isinstance(ts_raw, str) or not ts_raw.strip():
            continue
        try:
            ts = dt.datetime.fromisoformat(ts_raw.strip().replace("Z", "+00:00"))
        except ValueError:
            continue
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=dt.timezone.utc)

        session_id = state.session_id
        if session_id is None:
            session_id = filename_uuid
            if session_id is not None and not filename_session_id_warned:
                _eprint(
                    f"[codex] session_meta not seen in {path_str}; "
                    f"falling back to filename UUID {session_id}"
                )
                filename_session_id_warned = True
        if session_id is None:
            # No session_meta and no parseable filename UUID — skip row.
            continue

        model = state.model or "unknown"

        yield (
            offset,
            CodexEntry(
                timestamp=ts,
                session_id=session_id,
                model=model,
                input_tokens=_int(ltu, "input_tokens"),
                cached_input_tokens=_int(ltu, "cached_input_tokens"),
                output_tokens=_int(ltu, "output_tokens"),
                reasoning_output_tokens=_int(ltu, "reasoning_output_tokens"),
                total_tokens=_int(ltu, "total_tokens"),
                source_path=path_str,
            ),
        )
        # Advance the cumulative watermark only after a successful yield so
        # resume-from-offset continues to dedupe against the last counted turn.
        if cumulative is not None:
            last_total_tokens = cumulative