cctally 1.28.0 → 1.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/bin/_cctally_cache.py +147 -59
- package/bin/_cctally_core.py +22 -49
- package/bin/_cctally_dashboard.py +239 -152
- package/bin/_cctally_db.py +211 -31
- package/bin/_cctally_milestones.py +126 -166
- package/bin/_cctally_record.py +161 -192
- package/bin/_lib_alert_axes.py +7 -4
- package/bin/_lib_conversation.py +59 -8
- package/bin/_lib_conversation_query.py +306 -52
- package/bin/_lib_jsonl.py +69 -50
- package/bin/cctally +5 -5
- package/dashboard/static/assets/index-4OxMhN7N.js +53 -0
- package/dashboard/static/assets/index-DEDO-eqP.css +1 -0
- package/dashboard/static/assets/newsreader-latin-400-italic-CEihAR-f.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-400-italic-CNZoH1hn.woff +0 -0
- package/dashboard/static/assets/newsreader-latin-400-normal-BFBkh4jY.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-400-normal-gRTjlS2D.woff +0 -0
- package/dashboard/static/assets/newsreader-latin-500-normal-B66TYsaK.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-500-normal-DFwuUcdu.woff +0 -0
- package/dashboard/static/assets/newsreader-latin-600-normal-30OJ_TG_.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-600-normal-DUnT2r2g.woff +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-400-italic-BMTE_bNQ.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-400-italic-qdgKLcPG.woff +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-400-normal-DYA1XoQK.woff +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-400-normal-svq1FPys.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-500-normal-BNHmvKvI.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-500-normal-CZruMFou.woff +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-600-normal-BXv5iMHi.woff2 +0 -0
- package/dashboard/static/assets/newsreader-latin-ext-600-normal-BrbfzHZ5.woff +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-400-italic-QbB8kb5s.woff +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-400-italic-bZegYFuM.woff2 +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-400-normal-BekUZro8.woff +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-400-normal-DdKr49mV.woff2 +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-500-normal-BEAbKU8A.woff +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-500-normal-CL6a8tp2.woff2 +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-600-normal-CVAR0otO.woff +0 -0
- package/dashboard/static/assets/newsreader-vietnamese-600-normal-CaH84vfx.woff2 +0 -0
- package/dashboard/static/dashboard.html +2 -2
- package/package.json +1 -1
- package/dashboard/static/assets/index-Bj5ckRUE.css +0 -1
- package/dashboard/static/assets/index-Dw4G5FD9.js +0 -18
|
@@ -13,6 +13,7 @@ deduped session_entries row (idx_entries_dedup), via the shared pricing helper
|
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
import json as _json
|
|
15
15
|
import os
|
|
16
|
+
import re
|
|
16
17
|
import sqlite3
|
|
17
18
|
|
|
18
19
|
# Public surface (Plan 2): shipped in the npm tarball + brew formula + public
|
|
@@ -21,6 +22,75 @@ import sqlite3
|
|
|
21
22
|
from _lib_pricing import _calculate_entry_cost
|
|
22
23
|
|
|
23
24
|
|
|
25
|
+
# Mirror of dashboard/web/src/conversations/systemMarkers.ts::MARKER_RE — anchored
|
|
26
|
+
# whole-string (fullmatch), unrolled-lazy body for linear time (no ReDoS), \1
|
|
27
|
+
# backref forces each close tag to match its open tag. Used to SKIP slash-command
|
|
28
|
+
# plumbing when deriving a conversation title (#165 Q2). MUST stay equivalent to
|
|
29
|
+
# the TS predicate over ASCII whitespace (parity-tested); exotic Unicode/control
|
|
30
|
+
# whitespace is an explicit non-goal. See docs/dashboard-gotchas.md.
|
|
31
|
+
_MARKER_TAGS = ("command-name", "command-message", "command-args", "local-command-caveat")
# One or more <tag>…</tag> wrappers separated by optional whitespace. The \1
# backreference forces every closing tag to match its own opening tag, and the
# tempered body `(?:(?!</\1>)[\s\S])*` stops at the first matching close tag.
_MARKER_RE = re.compile(
    r"\s*(?:<(" + "|".join(_MARKER_TAGS) + r")>(?:(?!</\1>)[\s\S])*</\1>\s*)+"
)


def _is_system_marker(text) -> bool:
    """Return True iff `text` consists ONLY of concatenated command-marker wrappers.

    This is the skip predicate used when deriving a conversation title: rows
    that are pure slash-command plumbing must not become the title.

    The whole string must match (`fullmatch`), so any prose before, between or
    after the wrappers makes the result False. Empty, None and non-`str`
    values (for example `bytes` read back from a BLOB-typed column) are never
    markers and return False instead of raising `TypeError`.
    """
    # Guard first: a str pattern cannot be applied to bytes or other types.
    if not isinstance(text, str) or not text:
        return False
    return _MARKER_RE.fullmatch(text) is not None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
_TITLE_MAX = 120


def _title_from_text(text, *, max_len: int = _TITLE_MAX) -> str:
    """Return the first non-blank LINE of `text` as a display title.

    Lines are split on "\\n" only and trimmed with `str.strip()`. A line longer
    than `max_len` characters is cut to `max_len`, right-stripped, and suffixed
    with a single '…'; shorter lines are returned unchanged (no ellipsis).

    Args:
        text: Message body. None, empty and non-`str` values yield ''.
        max_len: Maximum title length before truncation; defaults to
            `_TITLE_MAX`, which keeps the behaviour identical for existing
            callers.

    Returns:
        The derived title, or '' when there is no non-blank line.
    """
    # Non-str input (None, bytes from a BLOB column, ...) has no usable line.
    if not isinstance(text, str):
        return ""
    for raw_line in text.split("\n"):
        candidate = raw_line.strip()
        if not candidate:
            continue
        if len(candidate) > max_len:
            # rstrip BEFORE the ellipsis so a cut that lands on a space never
            # produces "word …".
            return candidate[:max_len].rstrip() + "…"
        return candidate
    return ""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _session_titles_map(conn, session_ids):
    """Map each session id to its derived title: ``{sid: title}``.

    The title is the first non-marker, non-blank MAIN-session human line of the
    session (computed at read time). The SQL window keeps only the earliest 12
    human rows per session (ordered by ``timestamp_utc, id``); Python then skips
    rows that are pure system markers. A session whose first 12 human rows are
    all markers/blank is simply absent from the result, so the caller must
    supply its own fallback.

    Args:
        conn: Open SQLite connection exposing ``conversation_messages``.
        session_ids: Any iterable of session ids (list, set, generator, ...).
            It is materialized and de-duplicated once, so one-shot iterables
            and repeated ids are safe. None/empty yields ``{}``.

    Returns:
        dict mapping session id -> title for the sessions that have one.
    """
    # Materialize exactly once: the ids are needed twice (placeholder count and
    # bound parameters), which a generator cannot provide. dict.fromkeys keeps
    # first-seen order while dropping duplicates.
    ids = list(dict.fromkeys(session_ids or ()))
    if not ids:
        return {}
    placeholders = ",".join("?" * len(ids))
    rows = conn.execute(
        "SELECT session_id, text FROM ("
        " SELECT session_id, text, "
        " ROW_NUMBER() OVER (PARTITION BY session_id "
        " ORDER BY timestamp_utc, id) AS rn "
        " FROM conversation_messages "
        f" WHERE session_id IN ({placeholders}) AND entry_type='human' "
        " AND is_sidechain=0 AND COALESCE(text,'') <> ''"
        ") WHERE rn <= 12 ORDER BY session_id, rn",
        ids,
    ).fetchall()
    titles = {}
    for sid, text in rows:
        # Rows arrive ordered by (session_id, rn): the first acceptable row
        # wins, later rows of an already-resolved session are ignored.
        if sid in titles or _is_system_marker(text):
            continue
        title = _title_from_text(text)
        if title:
            titles[sid] = title
    return titles
|
|
92
|
+
|
|
93
|
+
|
|
24
94
|
def _project_label(cwd) -> str:
|
|
25
95
|
"""Basename of the project cwd (dashboard label posture — no reveal). Falls
|
|
26
96
|
back to the raw path for root-ish cwds, '' when absent."""
|
|
@@ -29,6 +99,25 @@ def _project_label(cwd) -> str:
|
|
|
29
99
|
return os.path.basename(cwd.rstrip("/")) or cwd
|
|
30
100
|
|
|
31
101
|
|
|
102
|
+
def _subagent_key(source_path):
    """Derive the privacy-safe subagent-thread key from a transcript path.

    Only the file's basename is inspected. A name of the form
    ``agent-<key>.jsonl`` yields ``<key>`` (anything after the ``agent-``
    prefix is kept verbatim, e.g. an ``acompact-`` middle); the ``.jsonl``
    suffix is removed when present. Any other basename — including the main
    session file — and a missing path yield ``None``.

    The raw absolute ``source_path`` is never returned: only this derived key
    is meant to be exposed to the reader.
    """
    prefix, suffix = "agent-", ".jsonl"
    filename = os.path.basename(source_path) if source_path else ""
    if not filename.startswith(prefix):
        return None
    key = filename[len(prefix):]
    if key.endswith(suffix):
        key = key[: len(key) - len(suffix)]
    # An empty remainder (e.g. "agent-.jsonl") carries no identity.
    return key if key else None
|
|
119
|
+
|
|
120
|
+
|
|
32
121
|
def _entry_cost(model, inp, out, cc, cr, cost_usd_raw) -> float:
|
|
33
122
|
"""Cost for one session_entries row via the shared pricing helper. Tokens →
|
|
34
123
|
the helper's usage dict. cost_usd_raw is passed as the optional override the
|
|
@@ -136,9 +225,11 @@ def list_conversations(conn, *, sort="recent", limit=50, offset=0) -> dict:
|
|
|
136
225
|
models = _session_models_map(conn, session_ids)
|
|
137
226
|
# cwd/git_branch as the latest non-null (reader posture), NOT a lexical MAX().
|
|
138
227
|
meta = _session_latest_meta_map(conn, session_ids)
|
|
228
|
+
titles = _session_titles_map(conn, session_ids)
|
|
139
229
|
conversations = [
|
|
140
230
|
{
|
|
141
231
|
"session_id": sid,
|
|
232
|
+
"title": titles.get(sid) or _project_label(meta.get(sid, (None, None))[0]) or sid,
|
|
142
233
|
"project_label": _project_label(meta.get(sid, (None, None))[0]),
|
|
143
234
|
"git_branch": meta.get(sid, (None, None))[1],
|
|
144
235
|
"started_utc": started,
|
|
@@ -198,7 +289,7 @@ def get_conversation(conn, session_id, *, after=None, limit=500):
|
|
|
198
289
|
# uuid, so the first occurrence in ascending order is canonical.
|
|
199
290
|
raw = conn.execute(
|
|
200
291
|
"SELECT id, uuid, timestamp_utc, entry_type, text, blocks_json, model, "
|
|
201
|
-
" msg_id, req_id, is_sidechain, cwd, git_branch "
|
|
292
|
+
" msg_id, req_id, is_sidechain, cwd, git_branch, source_path, parent_uuid "
|
|
202
293
|
"FROM conversation_messages WHERE session_id=? "
|
|
203
294
|
"ORDER BY timestamp_utc, id", (session_id,)).fetchall()
|
|
204
295
|
|
|
@@ -220,21 +311,92 @@ def get_conversation(conn, session_id, *, after=None, limit=500):
|
|
|
220
311
|
# item. A turn → exactly ONE item → cost counted exactly once. Humans,
|
|
221
312
|
# tool_results, and assistant rows with a null msg_id emit as simple items at
|
|
222
313
|
# their own position.
|
|
314
|
+
# ---- Phase 1: build items + index every assistant item's tool_use ids ----
|
|
315
|
+
# A tool_result is NOT guaranteed to sort after its tool_use (a grounded
|
|
316
|
+
# transcript scan found a matched result ordered BEFORE its use, plus orphan
|
|
317
|
+
# results with no in-session use), so this is a build-and-index-ALL pass
|
|
318
|
+
# FOLLOWED by a fold pass — never a single forward pass. None ids are never
|
|
319
|
+
# indexed (the id-less degradation guard).
|
|
223
320
|
items = []
|
|
224
|
-
turn_index = {}
|
|
321
|
+
turn_index = {} # (msg_id, req_id) -> index into items
|
|
322
|
+
tooluse_index = {} # tool_use id -> (item, block_dict)
|
|
323
|
+
tool_result_items = [] # placeholder items deferred to Phase 2
|
|
324
|
+
|
|
325
|
+
def _index_tool_uses(item):
|
|
326
|
+
# Index every tool_use id -> its (item, block). Idempotent: re-scanning
|
|
327
|
+
# a turn's blocks re-maps the same id to the same (item, block). Anthropic
|
|
328
|
+
# tool_use ids are unique within a session; a collision would be
|
|
329
|
+
# last-writer-wins (a result then folds to one deterministic owner).
|
|
330
|
+
for b in item["blocks"]:
|
|
331
|
+
if b.get("kind") == "tool_use" and b.get("id") is not None:
|
|
332
|
+
tooluse_index[b["id"]] = (item, b)
|
|
333
|
+
|
|
225
334
|
for row in logical:
|
|
226
335
|
(rid, u, ts, etype, text, blocks, model, msg_id, req_id,
|
|
227
|
-
is_sc, cwd, branch) = row
|
|
336
|
+
is_sc, cwd, branch, source_path, parent_uuid) = row
|
|
228
337
|
if etype == "assistant" and msg_id is not None:
|
|
229
338
|
key = (msg_id, req_id)
|
|
230
339
|
idx = turn_index.get(key)
|
|
231
340
|
if idx is None:
|
|
232
341
|
turn_index[key] = len(items)
|
|
233
|
-
|
|
342
|
+
it = _build_turn([row])
|
|
343
|
+
items.append(it)
|
|
344
|
+
_index_tool_uses(it)
|
|
234
345
|
else:
|
|
235
346
|
_extend_turn(items[idx], row)
|
|
347
|
+
_index_tool_uses(items[idx]) # re-index the turn (idempotent; new fragment may add ids)
|
|
348
|
+
elif etype == "tool_result":
|
|
349
|
+
it = _build_simple(row)
|
|
350
|
+
items.append(it)
|
|
351
|
+
tool_result_items.append(it)
|
|
236
352
|
else:
|
|
237
|
-
|
|
353
|
+
it = _build_simple(row)
|
|
354
|
+
items.append(it)
|
|
355
|
+
if etype == "assistant": # null-msg_id assistant: index its uses too
|
|
356
|
+
_index_tool_uses(it)
|
|
357
|
+
|
|
358
|
+
# ---- Phase 2: fold each tool_result item into its owning assistant item ----
|
|
359
|
+
drop = set() # id() of folded placeholder items
|
|
360
|
+
for tr in tool_result_items:
|
|
361
|
+
tr_blocks = [b for b in tr["blocks"] if b.get("kind") == "tool_result"]
|
|
362
|
+
non_result = [b for b in tr["blocks"] if b.get("kind") != "tool_result"]
|
|
363
|
+
owners = []
|
|
364
|
+
resolved = []
|
|
365
|
+
for b in tr_blocks:
|
|
366
|
+
tid = b.get("tool_use_id")
|
|
367
|
+
hit = tooluse_index.get(tid) if tid is not None else None
|
|
368
|
+
if hit is None:
|
|
369
|
+
owners = None # an unresolved block -> keep standalone
|
|
370
|
+
break
|
|
371
|
+
owners.append(hit[0])
|
|
372
|
+
resolved.append((hit[1], b))
|
|
373
|
+
# fold iff every result block resolved to exactly ONE owning item, no leftovers
|
|
374
|
+
owner_ids = {id(o) for o in owners} if owners is not None else set()
|
|
375
|
+
if owners and not non_result and len(owner_ids) == 1:
|
|
376
|
+
owner = owners[0]
|
|
377
|
+
for use_block, res_block in resolved:
|
|
378
|
+
use_block["result"] = {"text": res_block.get("text", ""),
|
|
379
|
+
"truncated": bool(res_block.get("truncated")),
|
|
380
|
+
"is_error": bool(res_block.get("is_error"))}
|
|
381
|
+
owner["member_uuids"].append(tr["anchor"]["uuid"])
|
|
382
|
+
drop.add(id(tr))
|
|
383
|
+
# else: leave tr standalone (orphan / multi-owner / mixed) — a folded
|
|
384
|
+
# row's uuid then joins EXACTLY ONE item's member_uuids (the #160 anchor).
|
|
385
|
+
|
|
386
|
+
if drop:
|
|
387
|
+
items = [it for it in items if id(it) not in drop]
|
|
388
|
+
|
|
389
|
+
# ---- Phase 3: sweep every assistant item's tool_use -> tool_call ----
|
|
390
|
+
# Covers turn items AND _build_simple null-msg_id assistant items. Matched
|
|
391
|
+
# requests already carry `result`; unmatched get `result: None`
|
|
392
|
+
# (request-only). Post-migration the client never receives a bare tool_use.
|
|
393
|
+
for it in items:
|
|
394
|
+
if it["kind"] == "assistant":
|
|
395
|
+
for b in it["blocks"]:
|
|
396
|
+
if b.get("kind") == "tool_use":
|
|
397
|
+
b["kind"] = "tool_call"
|
|
398
|
+
b["tool_use_id"] = b.pop("id", None)
|
|
399
|
+
b.setdefault("result", None)
|
|
238
400
|
|
|
239
401
|
costs = _turn_cost_map(conn, list(turn_index))
|
|
240
402
|
# Stamp per-item cost first, then derive the header from the SUM of the
|
|
@@ -327,6 +489,12 @@ def _build_turn(members):
|
|
|
327
489
|
"blocks": [],
|
|
328
490
|
"model": first[6],
|
|
329
491
|
"is_sidechain": bool(first[9]),
|
|
492
|
+
# subagent_key / parent_uuid are SEED-sourced (the first fragment, the
|
|
493
|
+
# turn's entry point) and NOT re-promoted in _fold_fragment — the prose
|
|
494
|
+
# anchor's parent_uuid is an intra-turn link, not the entry point (Codex
|
|
495
|
+
# P1). subagent_key is uniform across a turn's fragments (one file).
|
|
496
|
+
"subagent_key": _subagent_key(first[12]),
|
|
497
|
+
"parent_uuid": first[13],
|
|
330
498
|
"_msg_id": first[7],
|
|
331
499
|
"_req_id": first[8],
|
|
332
500
|
"_has_prose": False,
|
|
@@ -374,7 +542,8 @@ def _build_simple(row):
|
|
|
374
542
|
key → no session_entries join); it carries an explicit cost_usd of 0.0 and NO
|
|
375
543
|
internal _msg_id/_req_id keys, so the cost loop's KeyError path can never fire
|
|
376
544
|
(I2). The model is preserved for assistant rows."""
|
|
377
|
-
(rid, u, ts, etype, text, blocks, model, msg_id, req_id, is_sc, cwd, branch
|
|
545
|
+
(rid, u, ts, etype, text, blocks, model, msg_id, req_id, is_sc, cwd, branch,
|
|
546
|
+
source_path, parent_uuid) = row
|
|
378
547
|
try:
|
|
379
548
|
parsed = _json.loads(blocks or "[]")
|
|
380
549
|
except (ValueError, TypeError):
|
|
@@ -387,6 +556,8 @@ def _build_simple(row):
|
|
|
387
556
|
"text": text,
|
|
388
557
|
"blocks": parsed,
|
|
389
558
|
"is_sidechain": bool(is_sc),
|
|
559
|
+
"subagent_key": _subagent_key(source_path),
|
|
560
|
+
"parent_uuid": parent_uuid,
|
|
390
561
|
}
|
|
391
562
|
if etype == "assistant":
|
|
392
563
|
item["model"] = model
|
|
@@ -440,19 +611,6 @@ def _row_to_hit(uuid_, sid, ts, cwd, snippet, msg_id, req_id):
|
|
|
440
611
|
}
|
|
441
612
|
|
|
442
613
|
|
|
443
|
-
def _dedup_hits(hits, limit, offset):
|
|
444
|
-
seen = set()
|
|
445
|
-
out = []
|
|
446
|
-
for h in hits:
|
|
447
|
-
key = (h["session_id"], h["uuid"])
|
|
448
|
-
if key in seen:
|
|
449
|
-
continue
|
|
450
|
-
seen.add(key)
|
|
451
|
-
out.append(h)
|
|
452
|
-
total = len(out)
|
|
453
|
-
return out[offset:offset + limit], total
|
|
454
|
-
|
|
455
|
-
|
|
456
614
|
def _attach_costs(conn, page):
|
|
457
615
|
"""Compute turn cost for the FINAL page's hits in ONE _turn_cost_map call,
|
|
458
616
|
then map it onto each hit and drop the private `_turn_key`. Off-page and
|
|
@@ -465,45 +623,141 @@ def _attach_costs(conn, page):
|
|
|
465
623
|
return page
|
|
466
624
|
|
|
467
625
|
|
|
468
|
-
def
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
626
|
+
def _attach_titles(conn, page):
    """Stamp every hit on the final page with a ``title`` and return the page.

    Titles come from ONE batched ``_session_titles_map`` call over the distinct
    session ids present on the page. When a session has no derived title the
    hit falls back to its ``project_label`` and finally to the session id.
    The hit dicts are mutated in place; the same ``page`` object is returned.
    """
    distinct_sids = list({hit["session_id"] for hit in page})
    title_by_sid = _session_titles_map(conn, distinct_sids)
    for hit in page:
        session_id = hit["session_id"]
        derived = title_by_sid.get(session_id)
        hit["title"] = derived or hit.get("project_label") or session_id
    return page
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def _like_pattern(q):
    """Return the substring LIKE pattern for `q`, wrapped in '%' wildcards.

    The backslash (the ESCAPE character used by the paired queries), '%' and
    '_' are each prefixed with a backslash so they match literally. Every input
    character is mapped exactly once, so a backslash in the query — including a
    trailing one — can never end up escaping the appended '%'.
    """
    escapes = {
        ord("\\"): "\\\\",
        ord("%"): r"\%",
        ord("_"): r"\_",
    }
    escaped = q.translate(escapes)
    return "%" + escaped + "%"
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def _fts_snippets(conn, fts_q, ids):
    """Return ``{rowid: snippet}`` for the given page rowids ONLY.

    ``snippet()`` needs an active MATCH, so it cannot be deferred to an outer
    query over the page; instead a second MATCH restricted to the page rowids
    is run, generating snippets for at most one page of hits. An empty `ids`
    short-circuits to ``{}`` without touching the database.
    """
    if not ids:
        return {}
    marks = ["?" for _ in ids]
    sql = (
        "SELECT cm.id, snippet(conversation_fts, 0, '[', ']', ' … ', 12) "
        "FROM conversation_fts "
        "JOIN conversation_messages cm ON cm.id = conversation_fts.rowid "
        f"WHERE conversation_fts MATCH ? AND cm.id IN ({','.join(marks)})"
    )
    params = (fts_q, *ids)
    snippet_by_rowid = {}
    for found in conn.execute(sql, params).fetchall():
        snippet_by_rowid[found[0]] = found[1]
    return snippet_by_rowid
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def _texts_for_ids(conn, ids):
    """Return ``{rowid: text}`` for the given page rowids ONLY.

    The LIKE page query deliberately omits ``text`` so matched bodies are not
    pulled into Python wholesale; this fetches the bodies for just the page.
    Rowids with no matching row are absent from the result, and an empty `ids`
    short-circuits to ``{}`` without querying.
    """
    if not ids:
        return {}
    marks = ["?" for _ in ids]
    sql = f"SELECT id, text FROM conversation_messages WHERE id IN ({','.join(marks)})"
    text_by_rowid = {}
    for found in conn.execute(sql, tuple(ids)).fetchall():
        text_by_rowid[found[0]] = found[1]
    return text_by_rowid
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
def _search_fts(conn, q, limit, offset):
    """FTS5 search: one page of de-duplicated hits plus the exact logical total.

    Dedup, paging and the total all run in SQL, so Python never holds more
    than one page of hits/snippets regardless of how many rows match.

    Returns a dict with keys ``query``, ``mode`` ("fts"), ``hits`` and
    ``total``.
    """
    match_expr = _fts_query(q)

    # Exact post-dedup total of logical (session_id, uuid) matches; no snippet
    # generation and no per-row materialization in Python.
    count_sql = (
        "SELECT COUNT(*) FROM ("
        " SELECT DISTINCT cm.session_id, cm.uuid "
        " FROM conversation_fts "
        " JOIN conversation_messages cm ON cm.id = conversation_fts.rowid "
        " WHERE conversation_fts MATCH ?)"
    )
    total = conn.execute(count_sql, (match_expr,)).fetchone()[0]

    # One row per logical (session_id, uuid): rn = 1 keeps the best-ranked row
    # (order: bm25, ts DESC, id DESC — the id is the final deterministic
    # tiebreaker). bm25 is first materialized as a plain `rank` column in the
    # `matched` CTE because FTS5 auxiliary functions may only be used directly
    # against the MATCH query, not inside a window ORDER BY.
    page_sql = (
        "WITH matched AS ("
        " SELECT cm.id AS rid, cm.session_id AS sid, cm.uuid AS uuid, "
        " cm.timestamp_utc AS ts, cm.cwd AS cwd, "
        " cm.msg_id AS mid, cm.req_id AS rqd, "
        " bm25(conversation_fts) AS rank "
        " FROM conversation_fts "
        " JOIN conversation_messages cm ON cm.id = conversation_fts.rowid "
        " WHERE conversation_fts MATCH ?), "
        "ranked AS ("
        " SELECT *, ROW_NUMBER() OVER ("
        " PARTITION BY sid, uuid ORDER BY rank, ts DESC, rid DESC"
        " ) AS rn "
        " FROM matched) "
        "SELECT rid, sid, uuid, ts, cwd, mid, rqd FROM ranked WHERE rn = 1 "
        "ORDER BY rank, ts DESC, rid DESC LIMIT ? OFFSET ?"
    )
    page_rows = conn.execute(page_sql, (match_expr, limit, offset)).fetchall()

    # Snippets are generated for the page rowids only.
    snippet_for = _fts_snippets(conn, match_expr, [found[0] for found in page_rows])
    hits = []
    for rid, sid, msg_uuid, ts, cwd, mid, rqd in page_rows:
        hits.append(
            _row_to_hit(msg_uuid, sid, ts, cwd, snippet_for.get(rid, ""), mid, rqd)
        )
    # Costs first, then titles — both operate on the final page only.
    hits = _attach_titles(conn, _attach_costs(conn, hits))
    return {"query": q, "mode": "fts", "hits": hits, "total": total}
|
|
487
728
|
|
|
488
729
|
|
|
489
730
|
def _search_like(conn, q, limit, offset):
    """Substring (LIKE) search: the SQL-bounded fallback used without FTS5.

    Mirrors ``_search_fts``: the COUNT and the page query each scan the table
    once, dedup/paging happen in SQL, and message bodies are fetched only for
    the rows on the returned page.

    Returns a dict with keys ``query``, ``mode`` ("like"), ``hits`` and
    ``total``.
    """
    pattern = _like_pattern(q)

    # Exact total of distinct logical (session_id, uuid) matches.
    count_sql = (
        "SELECT COUNT(*) FROM ("
        " SELECT DISTINCT session_id, uuid FROM conversation_messages "
        " WHERE text LIKE ? ESCAPE '\\' AND text != '')"
    )
    total = conn.execute(count_sql, (pattern,)).fetchone()[0]

    # rn = 1 keeps the newest row per logical (session_id, uuid); the page is
    # ordered newest-first with the rowid as the deterministic tiebreaker.
    page_sql = (
        "WITH ranked AS ("
        " SELECT id AS rid, session_id AS sid, uuid AS uuid, "
        " timestamp_utc AS ts, cwd AS cwd, msg_id AS mid, req_id AS rqd, "
        " ROW_NUMBER() OVER ("
        " PARTITION BY session_id, uuid "
        " ORDER BY timestamp_utc DESC, id DESC"
        " ) AS rn "
        " FROM conversation_messages "
        " WHERE text LIKE ? ESCAPE '\\' AND text != '') "
        "SELECT rid, sid, uuid, ts, cwd, mid, rqd FROM ranked WHERE rn = 1 "
        "ORDER BY ts DESC, rid DESC LIMIT ? OFFSET ?"
    )
    page_rows = conn.execute(page_sql, (pattern, limit, offset)).fetchall()

    # Bodies are loaded for the page only, so the manual snippet runs at most
    # once per returned hit.
    body_for = _texts_for_ids(conn, [found[0] for found in page_rows])
    hits = []
    for rid, sid, msg_uuid, ts, cwd, mid, rqd in page_rows:
        snippet = _manual_snippet(body_for.get(rid, ""), q)
        hits.append(_row_to_hit(msg_uuid, sid, ts, cwd, snippet, mid, rqd))
    # Costs first, then titles — both operate on the final page only.
    hits = _attach_titles(conn, _attach_costs(conn, hits))
    return {"query": q, "mode": "like", "hits": hits, "total": total}
|
|
508
762
|
|
|
509
763
|
|
package/bin/_lib_jsonl.py
CHANGED
|
@@ -201,6 +201,68 @@ def _parse_usage_entries(
|
|
|
201
201
|
return no_key_entries
|
|
202
202
|
|
|
203
203
|
|
|
204
|
+
def parse_cost_entry(obj, path_str: str):
    """Classify one parsed JSONL object as a billable assistant entry.

    Pure per-line parser: no I/O and no byte offsets — the caller owns the
    readline()/tell() loop.

    Args:
        obj: The value produced by ``json.loads`` for one line. Anything that
            is not a JSON object (list, string, number, null) is rejected
            rather than raising ``AttributeError``.
        path_str: Source file path recorded on the resulting entry.

    Returns:
        ``(UsageEntry, msg_id, req_id)`` for a billable assistant entry, or
        ``None`` when the line is not an assistant record, has a
        missing/invalid timestamp, usage or model, or uses the ``<synthetic>``
        placeholder model. A ``costUSD`` value that cannot be converted to
        float is treated as absent (``cost_usd=None``) instead of aborting.
    """
    # A syntactically valid JSONL line is not necessarily an object.
    if not isinstance(obj, dict) or obj.get("type") != "assistant":
        return None

    ts_raw = obj.get("timestamp")
    if not isinstance(ts_raw, str) or not ts_raw.strip():
        return None

    # Usage/model normally live under "message"; fall back to the top level
    # when that key is absent or not an object.
    msg = obj.get("message")
    if not isinstance(msg, dict):
        msg = obj

    usage = msg.get("usage")
    if not isinstance(usage, dict):
        return None

    model = msg.get("model") or obj.get("model")
    if not isinstance(model, str) or not model.strip():
        return None
    model = model.strip()
    if model == "<synthetic>":
        # Matches ccusage's claude_loader.rs:454. Filtered here so the cache
        # ingest path can't accidentally store these rows even if a downstream
        # loop forgets to double-check (see `sync_cache` in _cctally_cache.py).
        return None

    try:
        ts = dt.datetime.fromisoformat(ts_raw.strip().replace("Z", "+00:00"))
    except ValueError:
        return None
    if ts.tzinfo is None:
        # Naive timestamps are interpreted as UTC.
        ts = ts.replace(tzinfo=dt.timezone.utc)

    cost_usd_raw = obj.get("costUSD")
    try:
        cost_usd = float(cost_usd_raw) if cost_usd_raw is not None else None
    except (TypeError, ValueError):
        # Malformed override: keep the entry and record no explicit cost
        # rather than failing the whole ingest on one bad field.
        cost_usd = None

    entry = UsageEntry(
        timestamp=ts,
        model=model,
        usage=usage,
        cost_usd=cost_usd,
        source_path=path_str,
    )
    return (entry, msg.get("id"), obj.get("requestId"))
|
|
264
|
+
|
|
265
|
+
|
|
204
266
|
def _iter_jsonl_entries_with_offsets(fh, path_str: str):
|
|
205
267
|
"""Yield (byte_offset, UsageEntry, msg_id, req_id) for each assistant
|
|
206
268
|
entry starting from fh's current position.
|
|
@@ -209,7 +271,9 @@ def _iter_jsonl_entries_with_offsets(fh, path_str: str):
|
|
|
209
271
|
accurate for resume-from-offset after partial ingests. Malformed JSON
|
|
210
272
|
and non-assistant lines are skipped, but the offset still advances past
|
|
211
273
|
them so they are never re-read. Range filtering is intentionally NOT
|
|
212
|
-
done here — filters are applied at query time by iter_entries().
|
|
274
|
+
done here — filters are applied at query time by iter_entries(). The
|
|
275
|
+
per-line gating lives in ``parse_cost_entry`` (shared with the fused
|
|
276
|
+
single-pass sync walker, #138).
|
|
213
277
|
"""
|
|
214
278
|
while True:
|
|
215
279
|
offset = fh.tell()
|
|
@@ -230,56 +294,11 @@ def _iter_jsonl_entries_with_offsets(fh, path_str: str):
|
|
|
230
294
|
obj = json.loads(stripped)
|
|
231
295
|
except json.JSONDecodeError:
|
|
232
296
|
continue
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
ts_raw = obj.get("timestamp")
|
|
237
|
-
if not isinstance(ts_raw, str) or not ts_raw.strip():
|
|
238
|
-
continue
|
|
239
|
-
|
|
240
|
-
msg = obj.get("message")
|
|
241
|
-
if not isinstance(msg, dict):
|
|
242
|
-
msg = obj
|
|
243
|
-
|
|
244
|
-
usage = msg.get("usage")
|
|
245
|
-
if not isinstance(usage, dict):
|
|
246
|
-
continue
|
|
247
|
-
|
|
248
|
-
model = msg.get("model") or obj.get("model")
|
|
249
|
-
if not isinstance(model, str) or not model.strip():
|
|
297
|
+
parsed = parse_cost_entry(obj, path_str)
|
|
298
|
+
if parsed is None:
|
|
250
299
|
continue
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
# Matches ccusage's claude_loader.rs:454. Filtered at the
|
|
254
|
-
# iterator level so the cache ingest path can't accidentally
|
|
255
|
-
# store these rows even if a downstream loop forgets to
|
|
256
|
-
# double-check (see `sync_cache` in _cctally_cache.py).
|
|
257
|
-
continue
|
|
258
|
-
|
|
259
|
-
try:
|
|
260
|
-
ts = dt.datetime.fromisoformat(ts_raw.strip().replace("Z", "+00:00"))
|
|
261
|
-
if ts.tzinfo is None:
|
|
262
|
-
ts = ts.replace(tzinfo=dt.timezone.utc)
|
|
263
|
-
except ValueError:
|
|
264
|
-
continue
|
|
265
|
-
|
|
266
|
-
msg_id = msg.get("id")
|
|
267
|
-
req_id = obj.get("requestId")
|
|
268
|
-
cost_usd_raw = obj.get("costUSD")
|
|
269
|
-
cost_usd = float(cost_usd_raw) if cost_usd_raw is not None else None
|
|
270
|
-
|
|
271
|
-
yield (
|
|
272
|
-
offset,
|
|
273
|
-
UsageEntry(
|
|
274
|
-
timestamp=ts,
|
|
275
|
-
model=model,
|
|
276
|
-
usage=usage,
|
|
277
|
-
cost_usd=cost_usd,
|
|
278
|
-
source_path=path_str,
|
|
279
|
-
),
|
|
280
|
-
msg_id,
|
|
281
|
-
req_id,
|
|
282
|
-
)
|
|
300
|
+
entry, msg_id, req_id = parsed
|
|
301
|
+
yield (offset, entry, msg_id, req_id)
|
|
283
302
|
|
|
284
303
|
|
|
285
304
|
_CODEX_FILENAME_UUID_RE = re.compile(
|
package/bin/cctally
CHANGED
|
@@ -2100,18 +2100,18 @@ get_max_milestone_for_week = _cctally_milestones.get_max_milestone_for_
|
|
|
2100
2100
|
get_milestone_cost_for_week = _cctally_milestones.get_milestone_cost_for_week # record shim
|
|
2101
2101
|
get_milestones_for_week = _cctally_milestones.get_milestones_for_week # forecast c.; tui shim; percent-breakdown c.
|
|
2102
2102
|
insert_percent_milestone = _cctally_milestones.insert_percent_milestone # record shim; idempotency-test mod.
|
|
2103
|
-
insert_budget_milestone = _cctally_milestones.insert_budget_milestone # record shim
|
|
2103
|
+
insert_budget_milestone = _cctally_milestones.insert_budget_milestone # record shim; test_budget_alerts / test_project_budget_dashboard ns[] (+ test_codex_budget_alerts / test_projected_alerts post-#143 vendor-param unification)
|
|
2104
2104
|
insert_project_budget_milestone = _cctally_milestones.insert_project_budget_milestone # record shim; project-budget-config-test ns[]
|
|
2105
|
-
|
|
2106
|
-
_codex_budget_crossings = _cctally_milestones._codex_budget_crossings # record shim (shared INSERT-and-arm core for the codex_budget axis)
|
|
2105
|
+
_budget_crossings = _cctally_milestones._budget_crossings # record shim (shared INSERT-and-arm core for the budget axis, both vendors, #143)
|
|
2107
2106
|
_resolve_codex_budget_period_window = _cctally_milestones._resolve_codex_budget_period_window # record shim; milestones c. (codex period window)
|
|
2108
|
-
|
|
2107
|
+
_resolve_budget_window = _cctally_milestones._resolve_budget_window # record shim; milestones c. (per-vendor cheap budget window dispatcher, #143)
|
|
2108
|
+
_budget_spend_for_vendor = _cctally_milestones._budget_spend_for_vendor # record shim; milestones c. (per-vendor budget spend dispatcher, #143)
|
|
2109
2109
|
_reconcile_codex_budget_on_config_write = _cctally_milestones._reconcile_codex_budget_on_config_write # forecast/config c. (forward-only codex-budget reconcile)
|
|
2110
2110
|
_resolve_claude_budget_window = _cctally_milestones._resolve_claude_budget_window # record shim; milestones c. (period-aware Claude budget window)
|
|
2111
2111
|
_project_crossings = _cctally_milestones._project_crossings # record shim; milestones c. (#130 firing/reconcile shared crossing arithmetic)
|
|
2112
2112
|
insert_projected_milestone = _cctally_milestones.insert_projected_milestone # record shim
|
|
2113
2113
|
_projected_levels_already_latched = _cctally_milestones._projected_levels_already_latched # record shim
|
|
2114
|
-
_reconcile_budget_milestones_on_set = _cctally_milestones._reconcile_budget_milestones_on_set # test_budget_alerts ns[]
|
|
2114
|
+
_reconcile_budget_milestones_on_set = _cctally_milestones._reconcile_budget_milestones_on_set # test_budget_alerts / test_codex_budget_alerts ns[] (vendor-param, #143)
|
|
2115
2115
|
_reconcile_budget_on_config_write = _cctally_milestones._reconcile_budget_on_config_write # forecast/config/dashboard c.; test_forecast_ns_patch mod. patch
|
|
2116
2116
|
_reconcile_project_budget_milestones_on_write = _cctally_milestones._reconcile_project_budget_milestones_on_write # forecast/config/dashboard c. (forward-only project-budget reconcile)
|
|
2117
2117
|
|