zeno-cli 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. zeno_adapters/__init__.py +17 -0
  2. zeno_adapters/_common.py +38 -0
  3. zeno_adapters/anthropic.py +68 -0
  4. zeno_adapters/claude_code.py +101 -0
  5. zeno_adapters/crewai.py +92 -0
  6. zeno_adapters/langgraph.py +49 -0
  7. zeno_adapters/openai.py +108 -0
  8. zeno_cli/__init__.py +1 -0
  9. zeno_cli/_hooks/cc_bridge.py +1016 -0
  10. zeno_cli/doctor.py +535 -0
  11. zeno_cli/hook_install.py +269 -0
  12. zeno_cli/hud/__init__.py +1 -0
  13. zeno_cli/hud/hud_install.py +652 -0
  14. zeno_cli/hud/zeno_attention.py +288 -0
  15. zeno_cli/hud/zeno_cognition.py +457 -0
  16. zeno_cli/hud/zeno_hud.py +496 -0
  17. zeno_cli/interview_invites.py +342 -0
  18. zeno_cli/login.py +241 -0
  19. zeno_cli/main.py +2534 -0
  20. zeno_cli/onboard.py +206 -0
  21. zeno_cli/outreach.py +456 -0
  22. zeno_cli/version.py +67 -0
  23. zeno_cli-0.3.4.dist-info/METADATA +161 -0
  24. zeno_cli-0.3.4.dist-info/RECORD +69 -0
  25. zeno_cli-0.3.4.dist-info/WHEEL +4 -0
  26. zeno_cli-0.3.4.dist-info/entry_points.txt +4 -0
  27. zeno_core/__init__.py +67 -0
  28. zeno_core/analytics.py +193 -0
  29. zeno_core/rtlx_s.py +460 -0
  30. zeno_core/streak.py +178 -0
  31. zeno_core/tlx_s.py +192 -0
  32. zeno_sdk/__init__.py +6 -0
  33. zeno_sdk/_generated/__init__.py +6 -0
  34. zeno_sdk/_generated/client.py +819 -0
  35. zeno_sdk/_migrations/alembic/env.py +33 -0
  36. zeno_sdk/_migrations/alembic/script.py.mako +18 -0
  37. zeno_sdk/_migrations/alembic/versions/0001_initial.py +79 -0
  38. zeno_sdk/_migrations/alembic/versions/0002_cognition_samples.py +53 -0
  39. zeno_sdk/_migrations/alembic/versions/0003_cognition_drivers.py +41 -0
  40. zeno_sdk/_migrations/alembic/versions/0004_transcript_intelligence.py +248 -0
  41. zeno_sdk/_migrations/alembic.ini +35 -0
  42. zeno_sdk/_runtime.py +12 -0
  43. zeno_sdk/adapters/__init__.py +15 -0
  44. zeno_sdk/adapters/anthropic.py +5 -0
  45. zeno_sdk/adapters/claude_code.py +5 -0
  46. zeno_sdk/adapters/crewai.py +5 -0
  47. zeno_sdk/adapters/langgraph.py +5 -0
  48. zeno_sdk/adapters/openai.py +5 -0
  49. zeno_sdk/auth.py +25 -0
  50. zeno_sdk/client.py +87 -0
  51. zeno_sdk/config.py +61 -0
  52. zeno_sdk/daemon.py +72 -0
  53. zeno_sdk/privacy.py +46 -0
  54. zeno_sdk/session.py +179 -0
  55. zeno_sdk/storage.py +487 -0
  56. zeno_sdk/types/__init__.py +121 -0
  57. zeno_session_intel/__init__.py +19 -0
  58. zeno_session_intel/analytics.py +588 -0
  59. zeno_session_intel/compression.py +123 -0
  60. zeno_session_intel/ingest.py +376 -0
  61. zeno_session_intel/model.py +129 -0
  62. zeno_session_intel/parsers/__init__.py +31 -0
  63. zeno_session_intel/parsers/claude_code.py +169 -0
  64. zeno_session_intel/parsers/codex.py +265 -0
  65. zeno_session_intel/parsers/cursor.py +198 -0
  66. zeno_session_intel/prices.py +281 -0
  67. zeno_session_intel/schema.py +277 -0
  68. zeno_session_intel/signals.py +319 -0
  69. zeno_session_intel/taxonomy.py +71 -0
@@ -0,0 +1,123 @@
1
+ """Directional "tokens you could have saved" estimator over captured content.
2
+
3
+ Measures how much of your captured agent content is SAFELY compressible, segmented
4
+ by type. Code and tracebacks are never counted as compressible: lossy token-level
5
+ compression corrupts them (near-0% exact-match on repo-level code completion; it
6
+ drops a SWE-bench agent from 62% to 54%), so the savings come only from redundant
7
+ JSON / logs / listings. See the zeno-compression-decision memo.
8
+
9
+ This is a MEASURE, never an inline compressor: the agent request path is never
10
+ touched. Heuristic + directional, stdlib-only, py3.9-safe.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import functools
16
+ import re
17
+
18
# Rough chars-per-token for mixed Claude/GPT-family content. Directional only.
CHARS_PER_TOKEN = 4.0

# Safely-compressible fraction by content type. Conservative + directional.
# code / traceback = 0.0 ON PURPOSE: lossy compression destroys exact tokens
# (file paths, line numbers, identifiers, syntax). Never claim savings there.
# Every label the classifier can return must have a key here, because the
# estimator indexes this mapping directly.
COMPRESSIBLE_FRACTION: dict[str, float] = {
    "json": 0.55,
    "log": 0.45,
    "code": 0.0,
    "traceback": 0.0,
    "prose": 0.10,
}

# Text that opens (after optional whitespace) with "[" or "{".
_JSON_RE = re.compile(r"^\s*[\[{]")
# A Python traceback header, or an indented `File "...", line N` frame line.
_TRACEBACK_RE = re.compile(
    r"Traceback \(most recent call last\)|^\s+File \".*\", line \d+",
    re.MULTILINE,
)
# Unified / git diffs are exact-token (paths, hunk headers, line numbers); never compress.
_DIFF_RE = re.compile(r"^(diff --git |@@ -|\+\+\+ |--- |[+-]\S)", re.MULTILINE)
# Log-shaped line starts: an ISO-like timestamp, a "[HH:MM" clock, or a level name.
# WARN(?:ING)? accepts both spellings: with the trailing \b, a bare "WARN" would
# never match the very common "WARNING" level.
_LOG_RE = re.compile(
    r"^\s*(\d{4}-\d\d-\d\d[ T]\d\d:\d\d|\[\d{2}:\d{2}|DEBUG|INFO|WARN(?:ING)?|ERROR|TRACE)\b",
    re.MULTILINE,
)
# Broad, multi-language/markup code shape: anything code-like must land in the
# 0%-savable bucket (the cardinal sin is crediting code as compressible). Covers
# keywords across Python/JS/Go/Rust/Java/C-likes, arrows, line-end semicolons,
# HTML/XML tags, CSS selector blocks, and unquoted YAML/key: lines.
_CODE_RE = re.compile(
    r"\b(def|class|import|from|func|fn|function|const|let|var|return|public|private|package|struct|impl|fun)\b"
    r"|=>|->|;\s*$|</?[a-zA-Z][\w-]*[\s/>]|^[.#]?[\w-]+\s*\{|^\s*[\w.-]+:\s",
    re.MULTILINE,
)
52
+
53
+
54
@functools.lru_cache(maxsize=4096)
def _classify_cached(text: str) -> str:
    """Return the content-type label for ``text``, memoized on the exact string.

    The same blobs (tool output, listings, repeated JSON) are seen over and over
    during a scan, so keying the memo on the content means the regex sweep runs
    once per distinct string rather than once per row. The label is a pure
    function of ``text``, so a cached answer is identical to a recomputed one.

    The cache holds at most 4096 entries. Once it is full, the least recently
    used entry is evicted to make room, so memory stays bounded on
    high-cardinality scans; an evicted string is simply reclassified the next
    time it shows up.
    """
    label = _classify_impl(text)
    return label
66
+
67
+
68
def classify(text: str) -> str:
    """Return one best-effort content-type label for a chunk of captured content.

    Possible labels are ``traceback``, ``code``, ``json``, ``log`` and ``prose``.
    The checks run in that priority (diffs count as code, and code is tested
    before JSON so a brace-opening block is not credited as JSON). Large blobs
    are judged on a head + middle + tail sample, so a code or traceback tail
    behind a harmless head is still caught. Whatever is not confidently
    redundant falls into ``code`` or ``prose``; code is never over-credited.

    Results are memoized on the exact text, which cannot change the answer
    because the label depends on nothing but ``text``.
    """
    # Blank input is the cheap, common case: answer directly and keep it out of
    # the memo so it never occupies a cache slot.
    if text and not text.isspace():
        return _classify_cached(text)
    return "prose"
85
+
86
+
87
def _classify_impl(text: str) -> str:
    """Uncached classifier; ``classify`` is the memoized front door to this.

    Kept as its own function so the memo wraps exactly one pure function.
    """
    if not (text and text.strip()):
        return "prose"

    # Above 6000 chars, inspect 2000 from the head, 2000 around the midpoint and
    # 2000 from the tail instead of sweeping the whole blob.
    size = len(text)
    if size <= 6000:
        sample = text
    else:
        mid = size // 2
        sample = "\n".join((text[:2000], text[mid - 1000 : mid + 1000], text[-2000:]))

    # Exact-token content first: it must never be counted as compressible.
    if _TRACEBACK_RE.search(sample) is not None:
        return "traceback"
    if _DIFF_RE.search(sample) is not None:
        return "code"
    code_hits = _CODE_RE.findall(sample)
    if len(code_hits) >= 2:
        return "code"

    # JSON: the full text must open with a bracket/brace and the sample must
    # contain at least two opening brackets/braces.
    if _JSON_RE.match(text) is not None:
        openers = sample.count("{") + sample.count("[")
        if openers >= 2:
            return "json"

    log_hits = _LOG_RE.findall(sample)
    if len(log_hits) >= 3:
        return "log"
    return "prose"
108
+
109
+
110
def est_tokens(text: str) -> int:
    """Estimate the token count of ``text`` from its length in characters.

    Divides by ``CHARS_PER_TOKEN`` and truncates; a directional figure only.
    """
    char_count = len(text)
    return int(char_count / CHARS_PER_TOKEN)
113
+
114
+
115
def estimate(text: str) -> dict:
    """Label a chunk and estimate how many of its tokens are safely savable.

    Returns a dict with ``type`` (the content label), ``tokens`` (estimated
    total) and ``savable_tokens`` (total scaled by the label's compressible
    fraction, truncated to an int).
    """
    label = classify(text)
    total = est_tokens(text)
    savable = int(total * COMPRESSIBLE_FRACTION[label])
    return {"type": label, "tokens": total, "savable_tokens": savable}
@@ -0,0 +1,376 @@
1
+ """Multi-tool session ingester: discover -> parse -> extract signals -> idempotent write.
2
+
3
+ Passively reads coding-agent session files (Claude Code, Cursor, Codex) READ-ONLY and
4
+ writes the agentsview-shaped session-intelligence tables into zeno's local SQLite.
5
+ Idempotent at three layers (session content-hash skip, message ``(session_id, ordinal)``
6
+ replace, usage ``dedup_key`` partial-unique). Stdlib-only; never writes to source files.
7
+
8
+ Tool status (this build is Claude-Code-first - see docs/TOOLS_STATUS.md):
9
+ - PRIMARY: ``claude`` (Claude Code) - the first-class capture path, validated on real data.
10
+ - CAPTURE-ONLY: ``cursor`` - validated against real local data, no SDK adapter; LATER.
11
+ - EXPERIMENTAL: ``codex`` - fixture-verified only (no real Codex install was available),
12
+ so it is an explicit opt-in ``--tools`` value, never advertised as a default - pointing
13
+ it at a divergently shaped real rollout can drop a whole session silently. Promote it
14
+ once a real ``~/.codex/.../rollout-*.jsonl`` confirms the schema.
15
+
16
+ CLI::
17
+
18
+ python -m zeno_session_intel.ingest --db /path/to/dev.db --tools claude,cursor
19
+ python -m zeno_session_intel.ingest --db dev.db --tools claude --root claude=~/.claude/projects
20
+
21
+ The default DB is the SDK capture path (``ZENO_DB_PATH`` / ``ZENO_HOME`` / ~/.zeno/zeno.db).
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import hashlib
28
+ import json
29
+ import os
30
+ import sqlite3
31
+ import sys
32
+ from pathlib import Path
33
+
34
+ from . import prices, schema
35
+ from .model import ParsedSession
36
+ from .parsers import REGISTRY
37
+ from .signals import extract_signals
38
+ from .taxonomy import normalize_tool_category
39
+
40
# Column names of transcript_sessions: the first item of every pair in
# schema.TRANSCRIPT_SESSIONS_COLUMNS, in the order that sequence lists them.
# ingest_session builds its INSERT column list from this tuple.
_SESSION_INSERT_COLS = tuple(name for name, _ in schema.TRANSCRIPT_SESSIONS_COLUMNS)
41
+
42
+
43
def default_db_path() -> str:
    """Resolve the default capture DB path from the environment.

    Precedence: a non-empty ``ZENO_DB_PATH`` wins outright; otherwise the file
    is ``zeno.db`` inside ``ZENO_HOME`` when that is set and non-empty, else
    inside ``~/.zeno``.
    """
    explicit = os.environ.get("ZENO_DB_PATH")
    if explicit:
        return explicit
    home = os.environ.get("ZENO_HOME")
    if not home:
        home = os.path.join(os.path.expanduser("~"), ".zeno")
    return os.path.join(home, "zeno.db")
46
+
47
+
48
def _content_hash(session: ParsedSession) -> str:
    """Return a SHA-256 hex digest fingerprinting a session's messages.

    Each message contributes, in order, the JSON encoding of: ordinal, role,
    content, the names of its tool uses, the ``tool_use_id`` of each tool
    result, the usage dedup key, context tokens and output tokens. The ingester
    compares this digest with the stored one to skip unchanged sessions.
    """
    digest = hashlib.sha256()
    for msg in session.messages:
        fingerprint = [
            msg.ordinal,
            msg.role,
            msg.content,
            [use.name for use in msg.tool_uses],
            [res.tool_use_id for res in msg.tool_results],
            msg.usage_dedup_key,
            msg.context_tokens,
            msg.output_tokens,
        ]
        encoded = json.dumps(fingerprint, sort_keys=True).encode()
        digest.update(encoded)
    return digest.hexdigest()
67
+
68
+
69
def _load_pricing(con: sqlite3.Connection) -> tuple[dict[str, dict], list[str]]:
    """Read the ``model_pricing`` table into a lookup keyed by model pattern.

    Returns ``(rate_by_pattern, patterns)``: a dict mapping each
    ``model_pattern`` to its per-mtok rates and context window, and the list of
    patterns in the order the rows were read. If a pattern occurs twice, the
    later row wins.
    """
    rate_fields = (
        "input_per_mtok",
        "output_per_mtok",
        "cache_creation_per_mtok",
        "cache_read_per_mtok",
        "context_window",
    )
    query = (
        "SELECT model_pattern, input_per_mtok, output_per_mtok, cache_creation_per_mtok, "
        "cache_read_per_mtok, context_window FROM model_pricing"
    )
    rate_by_pattern: dict[str, dict] = {}
    for record in con.execute(query).fetchall():
        # Column 0 is the pattern; columns 1..5 line up with rate_fields.
        rate_by_pattern[record[0]] = {
            field_name: record[position]
            for position, field_name in enumerate(rate_fields, start=1)
        }
    return rate_by_pattern, list(rate_by_pattern)
84
+
85
+
86
def _dominant_model(session: ParsedSession) -> str:
    """Return the model named on the most messages, or ``""`` if none name one.

    Messages with an empty model are ignored. On a tie, the model that appeared
    first in the session wins.
    """
    tally: dict[str, int] = {}
    for msg in session.messages:
        name = msg.model
        if not name:
            continue
        tally[name] = tally.get(name, 0) + 1
    if tally:
        # Keys iterate in first-seen order and max() keeps the first maximum.
        return max(tally, key=tally.get)
    return ""
94
+
95
+
96
def ingest_session(
    con: sqlite3.Connection,
    session: ParsedSession,
    rate_by_pattern: dict[str, dict],
    patterns: list[str],
) -> str:
    """Write one parsed session idempotently. Returns 'inserted' | 'updated' | 'skipped'.

    A session whose stored ``content_hash`` equals the freshly computed one is left
    untouched ('skipped'). A session that exists with a different hash has its rows in
    all four tables deleted and is written again ('updated'). Otherwise it is written
    for the first time ('inserted').

    ``rate_by_pattern`` and ``patterns`` are the pricing lookup and pattern list as
    returned by ``_load_pricing``.

    This function issues several statements and never commits; the caller owns the
    transaction boundary.
    """
    chash = _content_hash(session)
    existing = con.execute(
        "SELECT content_hash FROM transcript_sessions WHERE id=?", (session.id,)
    ).fetchone()
    status = "inserted"
    if existing is not None:
        if existing[0] == chash:
            return "skipped"
        status = "updated"
        # ReplaceSessionMessages: clear derived rows for this session, then re-insert.
        con.execute("DELETE FROM transcript_messages WHERE session_id=?", (session.id,))
        con.execute("DELETE FROM transcript_tool_calls WHERE session_id=?", (session.id,))
        con.execute("DELETE FROM token_usage_events WHERE session_id=?", (session.id,))
        con.execute("DELETE FROM transcript_sessions WHERE id=?", (session.id,))

    # tool_use_id -> is_error, so each tool call carries its result's failure flag
    result_error: dict[str, bool] = {}
    for m in session.messages:
        for tr in m.tool_results:
            if tr.tool_use_id:
                result_error[tr.tool_use_id] = tr.is_error

    # The most frequent model's pricing row supplies the context window handed to the
    # signal extractor; it stays None when no model resolves to a pricing pattern.
    dom_model = _dominant_model(session)
    resolved_dom = prices.resolve_model(dom_model, patterns) if dom_model else None
    window = None
    if resolved_dom:
        window = (rate_by_pattern.get(resolved_dom) or {}).get("context_window")
    sig = extract_signals(session, context_window=window)
    boundary_ordinals = set(sig.compaction_boundaries)

    # One value per transcript_sessions column: session metadata plus extracted signals.
    row = {
        "id": session.id,
        "zeno_session_id": None,
        "agent": session.agent,
        "project": session.project or "",
        "machine": session.machine or "local",
        "first_message": session.first_message,
        "display_name": session.display_name,
        "started_at": session.started_at,
        "ended_at": session.ended_at,
        "message_count": sig.message_count,
        "user_message_count": sig.user_message_count,
        "total_output_tokens": sig.total_output_tokens,
        "peak_context_tokens": sig.peak_context_tokens,
        "is_automated": int(sig.is_automated),
        "tool_failure_signal_count": sig.tool_failure_signal_count,
        "tool_retry_count": sig.tool_retry_count,
        "edit_churn_count": sig.edit_churn_count,
        "consecutive_failure_max": sig.consecutive_failure_max,
        "outcome": sig.outcome,
        "outcome_confidence": sig.outcome_confidence,
        "ended_with_role": sig.ended_with_role,
        "final_failure_streak": sig.final_failure_streak,
        "compaction_count": sig.compaction_count,
        "mid_task_compaction_count": sig.mid_task_compaction_count,
        "context_pressure_max": sig.context_pressure_max,
        "health_score": sig.health_score,
        "health_grade": sig.health_grade,
        "has_tool_calls": int(sig.has_tool_calls),
        "has_context_data": int(sig.has_context_data),
        "cwd": session.cwd or "",
        "git_branch": session.git_branch or "",
        "file_path": session.file_path,
        "file_mtime": session.file_mtime,
        "content_hash": chash,
    }
    # let the column default stamp created_at
    cols = [c for c in _SESSION_INSERT_COLS if c != "created_at"]
    placeholders = ",".join("?" for _ in cols)
    # Column names come from the schema constant, not from input; values are bound.
    con.execute(
        f"INSERT INTO transcript_sessions ({','.join(cols)}) VALUES ({placeholders})",
        [row[c] for c in cols],
    )

    for m in session.messages:
        # One transcript_messages row per message.
        con.execute(
            "INSERT INTO transcript_messages "
            "(session_id, ordinal, role, content, thinking_text, timestamp, has_thinking, "
            " has_tool_use, content_length, model, token_usage, context_tokens, output_tokens, "
            " source_uuid, source_parent_uuid, is_sidechain, is_compact_boundary) "
            "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
            (
                session.id,
                m.ordinal,
                m.role,
                m.content,
                m.thinking_text,
                m.timestamp,
                int(m.has_thinking),
                int(m.has_tool_use),
                len(m.content or ""),
                m.model,
                m.raw_usage,
                m.context_tokens,
                int(m.output_tokens or 0),
                m.source_uuid,
                m.source_parent_uuid,
                int(m.is_sidechain),
                # flagged when the signal extractor listed this ordinal as a compaction boundary
                int(m.ordinal in boundary_ordinals),
            ),
        )
        # One transcript_tool_calls row per tool use, numbered within its message.
        for k, tu in enumerate(m.tool_uses):
            con.execute(
                "INSERT INTO transcript_tool_calls "
                "(session_id, message_ordinal, ordinal_in_message, tool_name, category, "
                " is_error, source) VALUES (?,?,?,?,?,?,?)",
                (
                    session.id,
                    m.ordinal,
                    k,
                    tu.name,
                    normalize_tool_category(tu.name),
                    # a tool use with no recorded result counts as not-an-error
                    int(bool(result_error.get(tu.id, False))),
                    session.agent,
                ),
            )
        # Only messages that report token usage get a ledger row.
        if m.has_usage:
            resolved = prices.resolve_model(m.model, patterns) if m.model else None
            rate = rate_by_pattern.get(resolved) if resolved else None
            tokens = {
                "input_tokens": m.input_tokens,
                "output_tokens": m.output_tokens,
                "cache_creation_input_tokens": m.cache_creation_input_tokens,
                "cache_read_input_tokens": m.cache_read_input_tokens,
            }
            cost = prices.cost_for(tokens, rate)
            # a None cost is stored as-is with empty cost_status / cost_source
            priced = cost is not None
            con.execute(
                "INSERT INTO token_usage_events "
                "(session_id, message_ordinal, source, model, input_tokens, output_tokens, "
                " cache_creation_input_tokens, cache_read_input_tokens, reasoning_tokens, "
                " cost_usd, cost_status, cost_source, occurred_at, dedup_key) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?) "
                # the conflict target must repeat the partial index's WHERE predicate
                "ON CONFLICT(session_id, source, dedup_key) WHERE dedup_key != '' DO NOTHING",
                (
                    session.id,
                    m.ordinal,
                    session.agent,
                    # store the resolved pricing pattern when there is one, else the raw model
                    resolved or m.model,
                    m.input_tokens,
                    m.output_tokens,
                    m.cache_creation_input_tokens,
                    m.cache_read_input_tokens,
                    m.reasoning_tokens,
                    cost,
                    "computed" if priced else "",
                    "computed" if priced else "",
                    m.timestamp,
                    m.usage_dedup_key,
                ),
            )
    return status
256
+
257
+
258
def ingest_tool(
    con: sqlite3.Connection,
    tool: str,
    root: Path | None = None,
    limit: int | None = None,
) -> dict[str, int]:
    """Discover + parse + ingest every session for one tool. Returns a status tally.

    Args:
        con: open SQLite connection. Nothing is committed here; the caller commits.
        tool: a key of ``REGISTRY`` (an unknown key raises ``KeyError``).
        root: source root override; the parser's ``default_root()`` when ``None``.
        limit: ingest at most this many discovered files (``None`` = all).

    Returns:
        Counts keyed by ``files``, ``inserted``, ``updated``, ``skipped``, ``failed``
        and ``empty``.

    Each session is written inside its own SAVEPOINT. ``ingest_session`` issues many
    statements (and, on update, deletes the old rows first), so a session that raises
    part-way is rolled back as a unit. Without that, its partial rows would be committed
    by the caller, and because the session row already stores the new content hash, the
    next run would skip it and the partial rows would never be repaired.
    """
    mod = REGISTRY[tool]
    root = mod.default_root() if root is None else Path(root)
    rate_by_pattern, patterns = _load_pricing(con)
    # failed = the file/session RAISED; empty = parsed cleanly to zero sessions (a
    # header-only Cursor vscdb is the common, healthy empty case - not a failure).
    tally = {"files": 0, "inserted": 0, "updated": 0, "skipped": 0, "failed": 0, "empty": 0}
    files = mod.discover(root)
    if limit is not None:
        files = files[:limit]
    has_multi = hasattr(mod, "parse_sessions")

    # With implicit transaction handling, the first write would open a transaction
    # anyway. Open it up front so the per-session savepoints nest inside it; an
    # outermost savepoint would COMMIT on RELEASE, i.e. once per session.
    if con.isolation_level is not None and not con.in_transaction:
        con.execute("BEGIN")

    for path in files:
        tally["files"] += 1
        # file-level parse: a raise here loses only this file
        try:
            # one source file may hold many sessions (Cursor: a vscdb of composers)
            if has_multi:
                parsed = mod.parse_sessions(path)
            else:
                one = mod.parse_file(path)
                parsed = [one] if one is not None else []
        except Exception:
            tally["failed"] += 1
            continue
        live = [s for s in parsed if s is not None and s.messages]
        if not live:
            tally["empty"] += 1  # valid file with nothing to ingest (e.g. header-only)
            continue
        # per-session: one bad composer must NOT abandon the rest of the file
        for session in live:
            con.execute("SAVEPOINT zeno_ingest_session")
            try:
                status = ingest_session(con, session, rate_by_pattern, patterns)
                tally[status] += 1
            except Exception:
                # Undo everything this session wrote, then pop the savepoint.
                con.execute("ROLLBACK TO SAVEPOINT zeno_ingest_session")
                con.execute("RELEASE SAVEPOINT zeno_ingest_session")
                tally["failed"] += 1
            else:
                con.execute("RELEASE SAVEPOINT zeno_ingest_session")
    return tally
300
+
301
+
302
def run(
    db_path: str,
    tools: list[str],
    roots: dict[str, Path] | None = None,
    limit: int | None = None,
) -> dict[str, dict[str, int]]:
    """Ingest every requested tool into the DB at ``db_path``; return per-tool tallies.

    Opens the connection, switches it to WAL with a 5000 ms busy timeout, ensures the
    schema and seeds pricing, then ingests the tools in order with a commit after each
    one. A tool name that is not in ``REGISTRY`` is reported as ``{"error": 1}`` and
    skipped. The FTS index is rebuilt once at the end. The connection is always closed,
    also when something raises.
    """
    root_overrides = roots if roots else {}
    con = sqlite3.connect(db_path)
    try:
        for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA busy_timeout=5000"):
            con.execute(pragma)
        schema.ensure_schema(con)
        prices.seed_pricing(con)
        con.commit()

        results: dict[str, dict[str, int]] = {}
        for tool in tools:
            if tool in REGISTRY:
                results[tool] = ingest_tool(con, tool, root_overrides.get(tool), limit)
                con.commit()
            else:
                results[tool] = {"error": 1}

        schema.rebuild_fts(con)
        con.commit()
        return results
    finally:
        con.close()
329
+
330
+
331
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the ingester.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after a run (the per-tool tallies are printed to stdout as JSON), or ``2``
        when the target is the live capture DB and ``--write-live`` was not given.
        A malformed ``--root`` value ends the process through ``argparse``'s own error
        exit (status 2) instead of being ignored.
    """
    ap = argparse.ArgumentParser(prog="zeno-ingest", description="Multi-tool session ingester")
    ap.add_argument("--db", default=default_db_path(), help="target SQLite DB (zeno capture)")
    ap.add_argument("--tools", default="claude", help="comma-separated: " + ",".join(REGISTRY))
    ap.add_argument(
        "--root",
        action="append",
        default=[],
        metavar="TOOL=PATH",
        help="override a tool's source root, e.g. claude=/path",
    )
    ap.add_argument("--limit", type=int, default=None, help="max files per tool (debug)")
    ap.add_argument(
        "--write-live",
        action="store_true",
        help="confirm writing the live capture DB at ~/.zeno/zeno.db (refused otherwise)",
    )
    args = ap.parse_args(argv)

    # Guard the foot-gun: a bare invocation resolves --db to the live capture DB
    # (the file that also holds the frozen SCED). Refuse to write it without explicit
    # confirmation; schema there should come from the 0004 alembic migration, not the
    # ingester's ensure_schema, to keep alembic_version consistent.
    # The live DB is wherever default_db_path() points (ZENO_DB_PATH / ZENO_HOME can
    # relocate it) as well as the conventional ~/.zeno/zeno.db. realpath is used so a
    # symlink to either file cannot slip past the check.
    home_live_db = os.path.join(os.path.expanduser("~"), ".zeno", "zeno.db")
    live_targets = {os.path.realpath(p) for p in (home_live_db, default_db_path())}
    if os.path.realpath(args.db) in live_targets and not args.write_live:
        sys.stderr.write(
            f"zeno-ingest: refusing to write the live capture DB at {args.db}\n"
            " pass --write-live to confirm, or --db <path> for a throwaway DB.\n"
        )
        return 2

    tools = [t.strip() for t in args.tools.split(",") if t.strip()]
    roots: dict[str, Path] = {}
    for spec in args.root:
        tool, sep, path = spec.partition("=")
        tool, path = tool.strip(), path.strip()
        # Dropping a malformed override silently would ingest the tool's DEFAULT root,
        # which is exactly what the caller was trying to avoid; fail loudly instead.
        if not sep or not tool or not path:
            ap.error(f"--root expects TOOL=PATH, got {spec!r}")
        roots[tool] = Path(os.path.expanduser(path))

    results = run(args.db, tools, roots, args.limit)
    json.dump({"db": args.db, "results": results}, sys.stdout, indent=2)
    sys.stdout.write("\n")
    return 0
373
+
374
+
375
# Script entry point: run main() and exit the process with its return code.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,129 @@
1
+ """Normalized intermediate representation shared by every tool parser.
2
+
3
+ A parser's only job is to turn a tool's on-disk session format (Claude Code JSONL,
4
+ Codex rollout JSONL, Cursor vscdb) into a ``ParsedSession``. The signal extractor and
5
+ the writer then operate on this one shape, so adding a tool never touches them.
6
+
7
+ Stdlib-only dataclasses, written to import under Python 3.9 (see the note below).
8
+
9
+
10
+ Note: no ``slots=True`` and no 3.10+ runtime constructs - this package must import under
11
+ the system ``python3`` (3.9) the dashboard export scripts use, matching the cognition
12
+ stdlib doctrine. Modern annotations are fine (lazy via ``from __future__``).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass, field
18
+
19
_INT64_MAX = 2**63 - 1  # SQLite INTEGER ceiling


def to_int(v: object) -> int:
    """Coerce a token count to an int in ``[0, 2**63-1]`` without ever raising.

    Parsers promise to skip bad data rather than throw, so anything that cannot be
    read as a number (``None``, ``"NaN"``, an empty string, another type) becomes 0
    for that one field instead of costing the whole session. Strings may carry
    thousands separators (``"1,234"``) or a fractional part, which is truncated.

    Negative results clamp to 0 so they cannot net against real cost in a ledger
    SUM, and results above the INT64 ceiling clamp to it so the SQLite bind cannot
    overflow.
    """
    try:
        if isinstance(v, (int, float)):
            # bool is an int subclass, so True/False land here as 1/0.
            number = int(v)
        elif isinstance(v, str) and v.strip():
            number = int(float(v.replace(",", "")))
        else:
            return 0
    except (ValueError, TypeError, OverflowError):
        # NaN, infinity and unparseable text all end up here.
        return 0
    return min(max(number, 0), _INT64_MAX)
45
+
46
+
47
@dataclass
class ToolUse:
    """One tool invocation recorded on a message."""

    # tool name as it appears in the transcript
    name: str
    input_json: str = ""  # canonical JSON of the tool input (for retry detection)
    # identifier of this invocation; a ToolResult refers back to it via tool_use_id
    id: str = ""
52
+
53
+
54
@dataclass
class ToolResult:
    """The result recorded for one tool invocation."""

    # id of the ToolUse this result belongs to ("" when the source gave none)
    tool_use_id: str = ""
    # result payload as text
    content: str = ""
    # True when the result is flagged as a failure
    is_error: bool = False
59
+
60
+
61
@dataclass
class ParsedMessage:
    """One message of a session in the tool-independent shape every parser emits.

    Only ``ordinal`` and ``role`` are required; every other field defaults to an
    empty value. The derived properties tolerate counters that are ``None``.
    """

    ordinal: int
    role: str  # user | assistant | system | tool
    content: str = ""  # flattened text (FTS + content_length)
    thinking_text: str = ""
    timestamp: str | None = None
    model: str = ""
    # token usage (cache-aware); raw kept for provenance
    input_tokens: int = 0
    output_tokens: int = 0
    cache_creation_input_tokens: int = 0
    cache_read_input_tokens: int = 0
    reasoning_tokens: int = 0
    raw_usage: str = ""
    # source threading + provenance
    source_uuid: str = ""
    source_parent_uuid: str = ""
    is_sidechain: bool = False
    # tool activity
    tool_uses: list[ToolUse] = field(default_factory=list)
    tool_results: list[ToolResult] = field(default_factory=list)
    # idempotency: stable per-message id for the usage ledger dedup_key
    usage_dedup_key: str = ""

    @property
    def has_thinking(self) -> bool:
        """True when the message carries non-empty thinking text."""
        return bool(self.thinking_text)

    @property
    def has_tool_use(self) -> bool:
        """True when the message has at least one tool use."""
        return bool(self.tool_uses)

    @property
    def context_tokens(self) -> int:
        """Live context-window size sent for this turn (the prompt), cache-inclusive."""
        return (
            int(self.input_tokens or 0)
            + int(self.cache_read_input_tokens or 0)
            + int(self.cache_creation_input_tokens or 0)
        )

    @property
    def has_usage(self) -> bool:
        """True when any of the five token counters is positive.

        Every counter is checked on its own and normalized with ``or 0`` first, the
        same way ``context_tokens`` does. Chaining the raw values with ``or`` and
        comparing the result to 0 raises ``TypeError`` when they are all ``None``, and
        lets a negative counter hide a positive one that follows it.
        """
        counters = (
            self.input_tokens,
            self.output_tokens,
            self.cache_creation_input_tokens,
            self.cache_read_input_tokens,
            self.reasoning_tokens,
        )
        return any(int(count or 0) > 0 for count in counters)
112
+
113
+
114
@dataclass
class ParsedSession:
    """One session in the tool-independent shape every parser emits.

    Only ``id`` and ``agent`` are required; the remaining fields default to empty
    values (``machine`` defaults to ``"local"``).
    """

    # session identifier
    id: str
    agent: str  # claude (primary) | cursor (validated, capture-only) | codex (experimental)
    # the session's messages as ParsedMessage objects
    messages: list[ParsedMessage] = field(default_factory=list)
    project: str = ""
    machine: str = "local"
    cwd: str = ""
    git_branch: str = ""
    display_name: str | None = None
    first_message: str | None = None
    started_at: str | None = None
    ended_at: str | None = None
    # ingest identity (set by the loader, used for idempotency)
    file_path: str | None = None
    file_mtime: int | None = None
@@ -0,0 +1,31 @@
1
+ """Per-tool session parsers. Each turns one tool's on-disk format into a ParsedSession.
2
+
3
+ A parser module exposes:
4
+ - ``default_root() -> Path`` where this tool stores sessions locally
5
+ - ``discover(root) -> list[Path]`` session files under root (read-only)
6
+ - ``parse_file(path) -> ParsedSession | None``
7
+ - optionally ``parse_sessions(path) -> list[ParsedSession]`` when one source file holds
8
+ MANY sessions (Cursor: one vscdb == many composers). The ingester prefers it.
9
+
10
+ The registry keeps the ingester tool-agnostic: adding a tool is adding a module + a row.
11
+
12
+ Tool status (this build is Claude-Code-first - see docs/TOOLS_STATUS.md):
13
+ - ``claude`` (Claude Code): PRIMARY, the first-class capture path, validated on real data.
14
+ - ``cursor``: validated against real local data, capture-only (no SDK adapter); LATER.
15
+ - ``codex`` (OpenAI): EXPERIMENTAL, fixture-verified only (no real Codex install);
16
+ explicit ``--tools`` opt-in, never a default.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from . import claude_code, codex, cursor
22
+
23
# tool name -> parser module.
# claude = primary; cursor = validated (capture-only); codex = experimental (fixture-only).
# The keys are the tool names callers use to select a parser.
REGISTRY = {
    "claude": claude_code,
    "codex": codex,
    "cursor": cursor,
}

# Public names of this package: the registry plus the three parser modules.
__all__ = ["REGISTRY", "claude_code", "codex", "cursor"]