octarin-cli 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1113 @@
1
+ #!/usr/bin/env python3
2
+ # ruff: noqa: BLE001, S110, S112, S310
3
+ #
4
+ # This is a fail-open one-shot importer: every error path is intentionally a
5
+ # bare ``except Exception: continue`` (S112) / ``pass`` (S110) so a single
6
+ # malformed line, parse failure, or network blip never aborts the run. The
7
+ # ingest URL comes from ``OCTARIN_INGEST_URL`` env (controlled), not user
8
+ # input, so S310 (urlopen with permitted schemes) doesn't apply here.
9
+ """Octarin history backfill (https://octarin.ai) — team AI-usage analytics tool.
10
+
11
+ Octarin is a team AI-usage analytics tool for AI-assisted coding (the same
12
+ pattern as Langfuse / OpenTelemetry session logging). This one-shot importer
13
+ reads the AI-coding session logs your tools already keep on THIS machine and
14
+ POSTs them, as canonical Octarin ``IngestEvent``s, to your team's shared Octarin
15
+ workspace so your past usage + cost shows up alongside new sessions. It reads
16
+ only local agent-session transcripts you produced and sends them to the ingest
17
+ URL baked into your install.
18
+
19
+ What it scans (read-only):
20
+ * ~/.claude/projects/**/*.jsonl (Claude Code transcripts)
21
+ * ~/.codex/sessions/**/rollout-*.jsonl (Codex rollout logs)
22
+ * Cursor workspaceStorage (best-effort; state.vscdb chat history)
23
+
24
+ Excludes ``_*`` files and any ``subagents/`` directory (sidecar/internal logs).
25
+
26
+ Pure stdlib. Fail-open: a single bad file or failed POST never aborts the run.
27
+ Idempotent: each session maps to a deterministic trace id server-side (derived
28
+ from project + source + session id), so re-running dedupes instead of
29
+ double-counting. Use ``--since`` to limit how far back to import.
30
+
31
+ Usage:
32
+ OCTARIN_API_KEY=<key> python3 backfill.py [--since 7d] [--dry-run] [--verbose]
33
+
34
+ Env:
35
+ OCTARIN_API_KEY (required) the project key, same one the hooks use.
36
+ OCTARIN_INGEST_URL (optional) defaults to https://api.octarin.ai/v1/ingest.
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import argparse
42
+ import contextlib
43
+ import getpass
44
+ import json
45
+ import os
46
+ import re
47
+ import subprocess
48
+ import sys
49
+ import urllib.error
50
+ import urllib.request
51
+ from datetime import datetime, timedelta, timezone
52
+ from pathlib import Path
53
+
54
# Ingest endpoint documented in the module docstring as the default for
# ``OCTARIN_INGEST_URL``.
INGEST_URL_DEFAULT = "https://api.octarin.ai/v1/ingest"

# Character budgets used when slicing span fields. They mirror the live hooks so
# that backfilled spans look identical to streamed ones.
_INPUT_CAP = 8_000
_OUTPUT_CAP = 16_000
_NAME_CAP = 80
61
+
62
+
63
def _now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string (``+00:00`` offset)."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
65
+
66
+
67
def _eprint(*args: object) -> None:
    """Print ``args`` to stderr exactly as ``print`` would, flushing immediately."""
    stream = sys.stderr
    print(*args, file=stream, flush=True)
69
+
70
+
71
+ # ── argument / config plumbing ───────────────────────────────────────────────
72
+
73
+
74
def parse_since(spec: str | None) -> datetime | None:
    """Parse a ``--since`` value into a UTC cutoff datetime (or None for 'all').

    Accepted forms:

    * a relative span: an integer followed by ``s``, ``m``, ``h``, ``d`` or
      ``w`` (``30m``, ``12h``, ``7d``, ``2w``), measured back from "now";
    * an absolute ISO date/datetime (``2026-01-01``, ``2026-01-01T00:00:00``),
      optionally with a trailing ``Z`` or a UTC offset (``+02:00``). Values
      without an offset are taken to be UTC.

    Returns None for an empty or unparseable value so the run simply imports
    everything; an unparseable value also prints a warning to stderr. A relative
    span too large for ``datetime`` to represent returns None as well, because
    it reaches further back than any history could.
    """
    if not spec:
        return None
    spec = spec.strip()

    relative = re.fullmatch(r"(\d+)\s*([smhdw])", spec.lower())
    if relative:
        unit_seconds = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
        try:
            seconds = unit_seconds[relative.group(2)] * int(relative.group(1))
            return datetime.now(timezone.utc) - timedelta(seconds=seconds)
        except (OverflowError, ValueError):
            # Absurdly large span: older than datetime can express, so there is
            # nothing to filter out.
            return None

    parsed: datetime | None = None
    # ``fromisoformat`` only learned to accept a trailing "Z" in Python 3.11,
    # so normalise it to an explicit offset first.
    iso = spec[:-1] + "+00:00" if spec.endswith(("Z", "z")) else spec
    try:
        parsed = datetime.fromisoformat(iso)
    except ValueError:
        # Fall back to the lenient strptime formats (they accept un-padded
        # fields such as ``2026-1-1`` that fromisoformat rejects).
        for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
            try:
                parsed = datetime.strptime(spec, fmt)
                break
            except ValueError:
                continue

    if parsed is not None:
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    _eprint(
        f"[octarin] warning: could not parse --since {spec!r}; importing all history"
    )
    return None
99
+
100
+
101
def _parse_ts(value: object) -> datetime | None:
    """Turn an ISO timestamp string into an aware UTC datetime, best-effort.

    Anything that is not a non-empty string, or that ``fromisoformat`` rejects,
    yields None. A trailing ``Z`` is read as ``+00:00``; a value without an
    offset is taken to be UTC.
    """
    if not (isinstance(value, str) and value):
        return None
    text = value.strip()
    if text.endswith("Z"):
        text = f"{text[:-1]}+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
    return aware.astimezone(timezone.utc)
115
+
116
+
117
def _default_user(cwd: str | None = None) -> str:
    """Pick the user_ref for imported sessions.

    Resolution order: the ``OCTARIN_USER`` env var, then ``git config
    user.email`` (run in ``cwd``, or the current directory), then the OS login
    name, and finally the literal ``"unknown"``. Every lookup failure is
    swallowed so the next source is tried.
    """
    override = os.environ.get("OCTARIN_USER")
    if override:
        return override

    # git may be missing, or the directory may not be a repo: ignore and move on.
    with contextlib.suppress(Exception):
        raw = subprocess.check_output(
            ["git", "config", "user.email"],
            cwd=cwd or os.getcwd(),
            stderr=subprocess.DEVNULL,
        )
        git_email = raw.decode().strip()
        if git_email:
            return git_email

    try:
        return getpass.getuser()
    except Exception:
        return "unknown"
137
+
138
+
139
def _repo_of(cwd: str | None) -> str | None:
    """Derive a repo label (the last path component) from a working directory.

    Returns None when ``cwd`` is empty/None or has no final component (e.g. the
    filesystem root). Trailing separators are stripped first; besides ``/`` this
    includes the host's own ``os.sep``/``os.altsep`` so that a Windows path
    ending in a backslash still produces a label. On POSIX the set is just ``/``.
    """
    if not cwd:
        return None
    trailing = "/" + os.sep + (os.altsep or "")
    base = os.path.basename(cwd.rstrip(trailing))
    return base or None
145
+
146
+
147
+ # ── Claude Code transcript parser (ported from the install Stop hook) ─────────
148
+
149
+
150
def parse_claude_transcript(  # noqa: PLR0915 - top-down jsonl parser; splitting
    path: str,  # would scatter the local span-id/state bookkeeping.
    default_user: str,
) -> dict | None:
    """Map a Claude Code ``*.jsonl`` transcript to a canonical IngestEvent dict.

    This mirrors the streaming Stop hook (install/router.py ``HOOK_PY``) so
    backfilled traces are identical to live ones: deterministic session id,
    smart session_name, multi-hop parent_span_id tree (tool/subagent nested
    under their assistant turn), real tool/file content, and span_type
    classification (subagents→agent, search/read→retrieval, else→tool).

    Args:
        path: transcript file to read (decoded as UTF-8, bad bytes replaced).
        default_user: value stored as the event's ``user_ref``.

    Returns:
        The event dict, or None when the file cannot be read or has no usable
        spans. Unparseable lines and non-object JSON values are skipped.

    Malformed records are tolerated rather than raised on: a ``tool_result``
    without a usable ``tool_use_id`` is ignored (so it can never be attached to
    an unrelated tool call), a null/non-string ``text`` counts as empty, and a
    non-dict ``usage`` counts as missing.
    """

    def _text(value: object) -> str:
        """Return ``value`` if it is a string, else ``""`` (guards ``"text": null``)."""
        return value if isinstance(value, str) else ""

    def _key(value: object) -> str | int | None:
        """Return ``value`` if it is usable as a tool-call id, else None."""
        return value if isinstance(value, (str, int)) else None

    msgs: list[dict] = []
    cwd: str | None = None
    try:
        with Path(path).open(encoding="utf-8", errors="replace") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except Exception:
                    continue
                if isinstance(obj, dict):
                    msgs.append(obj)
                    # The first record carrying a string cwd wins.
                    if cwd is None and isinstance(obj.get("cwd"), str):
                        cwd = obj.get("cwd")
    except Exception:
        return None

    # session id: prefer an embedded sessionId, else the file stem.
    sid = None
    for m in msgs:
        sid = m.get("sessionId") or m.get("session_id")
        if sid:
            break
    if not sid:
        sid = os.path.splitext(os.path.basename(path))[0] or "unknown"

    repo = _repo_of(cwd)

    # First pass: collect tool_result content (and its ts) keyed by tool_use_id.
    results: dict = {}
    result_ts: dict = {}
    for m in msgs:
        msg = m.get("message", m)
        if not isinstance(msg, dict):
            continue
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        mts = m.get("timestamp")
        for b in content:
            if not (isinstance(b, dict) and b.get("type") == "tool_result"):
                continue
            tid = _key(b.get("tool_use_id"))
            if tid is None:
                # No id to correlate on; storing it under None would leak this
                # output onto every tool_use that also lacks an id.
                continue
            txt = b.get("content")
            if isinstance(txt, list):
                txt = "\n".join(
                    _text(x.get("text")) for x in txt if isinstance(x, dict)
                )
            results[tid] = txt if isinstance(txt, str) else json.dumps(txt)
            if mts:
                result_ts[tid] = mts

    # Second pass: build spans.
    spans: list[dict] = []
    session_name: str | None = None
    model: str | None = None
    gen = 0
    prev_ts: str | None = None  # for assistant-span start = prior message ts
    for m in msgs:
        msg = m.get("message", m)
        if not isinstance(msg, dict):
            continue
        role = msg.get("role") or m.get("type")
        ts = m.get("timestamp") or _now_iso()
        content = msg.get("content")

        # The first non-blank user text names the session.
        if session_name is None and role == "user":
            t = content if isinstance(content, str) else None
            if isinstance(content, list):
                t = " ".join(
                    _text(b.get("text"))
                    for b in content
                    if isinstance(b, dict) and b.get("type") == "text"
                )
            if t and t.strip():
                session_name = t.strip().replace("\n", " ")[:_NAME_CAP]

        if role == "assistant":
            model = msg.get("model") or model
            usage = msg.get("usage")
            if not isinstance(usage, dict):
                usage = {}
            out = content if isinstance(content, str) else ""
            if isinstance(content, list):
                out = "\n".join(
                    _text(b.get("text"))
                    for b in content
                    if isinstance(b, dict) and b.get("type") == "text"
                )
            gen += 1
            # NOTE(review): unlike tool output below, the assistant text is not
            # sliced to _OUTPUT_CAP here — confirm this matches the live hook.
            spans.append(
                {
                    "span_id": f"{sid}-gen{gen}",
                    "name": "assistant",
                    "span_type": "llm",
                    "start_time": prev_ts or ts,
                    "end_time": ts,
                    "model": model,
                    "output": out,
                    "input_tokens": usage.get("input_tokens", 0),
                    "output_tokens": usage.get("output_tokens", 0),
                    "cache_read_tokens": usage.get("cache_read_input_tokens", 0),
                    "cache_write_tokens": usage.get("cache_creation_input_tokens", 0),
                    "status": "ok",
                }
            )
            if isinstance(content, list):
                for b in content:
                    if not (isinstance(b, dict) and b.get("type") == "tool_use"):
                        continue
                    tname = b.get("name", "")
                    if tname in ("Task", "Agent"):
                        stype, label = "agent", f"subagent:{tname}"
                    elif tname in ("Read", "Grep", "Glob", "WebFetch", "WebSearch"):
                        stype, label = "retrieval", f"tool:{tname}"
                    else:
                        stype, label = "tool", f"tool:{tname}"
                    tid = b.get("id")
                    lookup = _key(tid)  # None never matches a stored result
                    spans.append(
                        {
                            "span_id": f"{sid}-{tid or 'tool'}",
                            # Nest the tool call under the assistant turn that issued it.
                            "parent_span_id": f"{sid}-gen{gen}",
                            "name": label,
                            "span_type": stype,
                            "start_time": ts,
                            # Ends when its result arrived, if one was recorded.
                            "end_time": result_ts.get(lookup, ts),
                            "input": json.dumps(b.get("input", {}))[:_INPUT_CAP],
                            "output": (results.get(lookup) or "")[:_OUTPUT_CAP],
                            "status": "ok",
                        }
                    )
        prev_ts = ts

    if not spans:
        return None
    if not session_name:
        session_name = f"Claude Code: {repo or sid}"
    return {
        "source": "claude-code",
        "session_id": sid,
        "session_name": session_name,
        "user_ref": default_user,
        "repo": repo,
        "model": model,
        "spans": spans,
    }
308
+
309
+
310
+ # ── Codex rollout parser ──────────────────────────────────────────────────────
311
+
312
+
313
def _codex_text(content: object) -> str:
    """Flatten a Codex content value (string or list of typed blocks) to text.

    A string is returned unchanged. For a list, each dict block contributes its
    ``text`` (or, failing that, ``content``) and each bare string contributes
    itself; the non-empty pieces are joined with newlines. A block whose value
    is itself a list is flattened recursively, and any other non-string value
    is skipped instead of breaking the join. Every other input yields ``""``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    pieces: list[str] = []
    for block in content:
        if isinstance(block, dict):
            value = block.get("text") or block.get("content") or ""
            if isinstance(value, list):
                value = _codex_text(value)
        else:
            value = block
        if isinstance(value, str) and value:
            pieces.append(value)
    return "\n".join(pieces)
326
+
327
+
328
def parse_codex_rollout(  # noqa: PLR0915 - top-down jsonl parser; same shape
    path: str,  # as parse_claude_transcript, kept linear for clarity.
    default_user: str,
) -> dict | None:
    """Convert one Codex ``rollout-*.jsonl`` file into an IngestEvent dict.

    Each record is a ``{type, payload}`` wrapper. A ``session_meta`` record
    supplies the session id, cwd and model; every other dict payload is treated
    as an item. Assistant ``message`` items become ``llm`` spans;
    ``function_call`` items become tool/retrieval spans parented to the most
    recent assistant span; a ``function_call_output`` fills in the output and
    end time of the call it names.

    Returns None when the file cannot be read or yields no spans.
    """
    payloads: list[dict] = []
    sid: str | None = None
    cwd: str | None = None
    model: str | None = None
    try:
        with Path(path).open(encoding="utf-8", errors="replace") as fh:
            for raw in fh:
                stripped = raw.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except Exception:
                    continue
                if not isinstance(record, dict):
                    continue
                payload = record.get("payload")
                if not isinstance(payload, dict):
                    continue
                if record.get("type") == "session_meta":
                    sid = payload.get("id") or sid
                    cwd = payload.get("cwd") or cwd
                    model = payload.get("model") or model
                    continue
                # Carry the wrapper's timestamp along with the payload.
                payload.setdefault("_ts", record.get("timestamp"))
                payloads.append(payload)
    except Exception:
        return None

    if not sid:
        # rollout-<iso>-<uuid>.jsonl → take the trailing uuid-ish segment.
        stem = os.path.splitext(os.path.basename(path))[0]
        tail = re.search(r"([0-9a-fA-F-]{8,})$", stem)
        sid = tail.group(1) if tail else stem

    repo = _repo_of(cwd)
    spans: list[dict] = []
    session_name: str | None = None
    turn = 0
    parent_id: str | None = None  # span id of the latest assistant turn
    # call_id → the span emitted for that function_call, so outputs can attach.
    pending_calls: dict[str, dict] = {}
    prev_ts: str | None = None  # timestamp of the previous item

    for item in payloads:
        ts = item.get("_ts") or _now_iso()
        kind = item.get("type")
        role = item.get("role") or ""
        model = item.get("model") or model

        if kind == "message":
            text = _codex_text(item.get("content"))
            if role == "user":
                # Skip the giant injected developer/instruction preamble for the
                # session name; user prompts are inputs, not billable turns.
                if session_name is None and text and not text.lstrip().startswith("#"):
                    session_name = text.strip().replace("\n", " ")[:_NAME_CAP]
            elif role == "assistant":
                turn += 1
                parent_id = f"{sid}-gen{turn}"
                spans.append(
                    {
                        "span_id": parent_id,
                        "name": "assistant",
                        "span_type": "llm",
                        "start_time": prev_ts or ts,
                        "end_time": ts,
                        "model": model,
                        "output": text[:_OUTPUT_CAP],
                        "status": "ok",
                    }
                )
        elif kind == "function_call":
            name = item.get("name") or "tool"
            call_id = item.get("call_id") or item.get("id") or f"call{len(spans)}"
            is_lookup = name in ("read_file", "grep", "search", "web_search", "find")
            arguments = item.get("arguments")
            if isinstance(arguments, str):
                call_input = arguments[:_INPUT_CAP]
            else:
                call_input = json.dumps(arguments or {})[:_INPUT_CAP]
            call_span = {
                "span_id": f"{sid}-{call_id}",
                "name": f"tool:{name}",
                "span_type": "retrieval" if is_lookup else "tool",
                "start_time": ts,
                "end_time": ts,
                "input": call_input,
                "output": "",
                "status": "ok",
            }
            if parent_id:
                call_span["parent_span_id"] = parent_id
            spans.append(call_span)
            pending_calls[call_id] = call_span
        elif kind == "function_call_output":
            target = pending_calls.get(item.get("call_id") or item.get("id"))
            if target is not None:
                result = item.get("output")
                if isinstance(result, dict):
                    result = result.get("content") or json.dumps(result)
                target["output"] = (
                    result[:_OUTPUT_CAP] if isinstance(result, str) else ""
                )
                target["end_time"] = ts
        prev_ts = ts

    if not spans:
        return None
    return {
        "source": "codex",
        "session_id": sid,
        "session_name": session_name or f"Codex: {repo or sid}",
        "user_ref": default_user,
        "repo": repo,
        "model": model,
        "spans": spans,
    }
465
+
466
+
467
+ # ── Cursor (best-effort) ──────────────────────────────────────────────────────
468
+
469
+
470
def _cursor_storage_root() -> str | None:
    """Return the first existing Cursor ``workspaceStorage`` directory, or None.

    Three per-user locations under the home directory are probed in a fixed
    order; the first one that is a directory wins.
    """
    home = os.path.expanduser("~")
    tail = ("Cursor", "User", "workspaceStorage")
    prefixes = (
        ("Library", "Application Support"),
        (".config",),
        ("AppData", "Roaming"),
    )
    candidates = (os.path.join(home, *prefix, *tail) for prefix in prefixes)
    return next((path for path in candidates if os.path.isdir(path)), None)
484
+
485
+
486
def parse_cursor_sessions(
    default_user: str, cutoff: datetime | None, verbose: bool
) -> list[dict]:
    """Best-effort import of Cursor chat history from workspaceStorage.

    Two parsers run per workspace in priority order:

    * **aiService** (current Cursor) — reads ``aiService.generations`` and
      ``aiService.prompts`` from ``ItemTable``. Generations carry a ``unixMs``
      timestamp and a ``generationUUID``, so we can split a workspace into
      proper time-bounded sessions (one session per >30-min gap) with
      deterministic ``trace_id``/``span_id`` for idempotent re-runs.
    * **bubbles** (older Cursor) — legacy fallback that walks ``cursorDiskKV``
      and ``ItemTable`` for keys containing ``bubble``/``chat``. Used only when
      the aiService keys are absent.

    Inherently schema-fragile across Cursor versions, so still fail-open: any
    error on any workspace is skipped (logged under --verbose). Live capture
    via the Cursor hook is the authoritative path; this just backfills the
    history sitting on disk.

    Args:
        default_user: value stored as each event's ``user_ref``.
        cutoff: when set, a workspace whose ``state.vscdb`` was last modified
            before it is skipped, and the value is passed on to the parser.
        verbose: print progress/skip diagnostics to stderr.

    Returns:
        All recovered events; an empty list when there is no storage directory,
        ``sqlite3`` is unavailable, or the storage directory cannot be listed.
    """
    root = _cursor_storage_root()
    if not root:
        if verbose:
            _eprint("[octarin] cursor: no workspaceStorage found; skipping")
        return []
    try:
        import sqlite3
    except Exception:
        return []

    try:
        workspace_ids = sorted(os.listdir(root))
    except OSError as exc:
        # Unreadable or vanished directory: honour the fail-open contract
        # instead of letting the error abort the whole run.
        if verbose:
            _eprint(f"[octarin] cursor: cannot list {root}: {exc}")
        return []

    events: list[dict] = []
    workspaces_seen = 0
    for ws in workspace_ids:
        db = os.path.join(root, ws, "state.vscdb")
        if not os.path.isfile(db):
            continue
        workspaces_seen += 1
        if cutoff is not None:
            try:
                mtime = datetime.fromtimestamp(os.path.getmtime(db), tz=timezone.utc)
                if mtime < cutoff:
                    continue
            except Exception:
                # mtime unavailable: fall through and let the parser decide.
                pass
        try:
            ws_events = _parse_cursor_workspace(sqlite3, db, ws, default_user, cutoff)
        except Exception as exc:  # fail-open per workspace
            if verbose:
                _eprint(f"[octarin] cursor: skipped {ws}: {exc}")
            continue
        if verbose and ws_events:
            _eprint(f"[octarin] cursor: {ws}: {len(ws_events)} session(s)")
        events.extend(ws_events)
    if verbose:
        _eprint(
            f"[octarin] cursor: scanned {workspaces_seen} workspace(s), "
            f"recovered {len(events)} session(s)"
        )
    return events
546
+
547
+
548
# Two consecutive generations further apart than this many seconds belong to
# different sessions. Thirty minutes is a pragmatic boundary that matches how
# people use Cursor (start a task, walk away, come back later → two sessions).
_CURSOR_SESSION_GAP_SECONDS = 60 * 30
552
+
553
+
554
def _parse_cursor_workspace(
    sqlite3, db_path: str, ws_id: str, default_user: str, cutoff: datetime | None
) -> list[dict]:
    """Read one workspace's state.vscdb → list of IngestEvents (one per session).

    Reads aiService.generations + aiService.prompts (current Cursor schema). If
    those are absent or empty, falls back to the legacy bubble path so we don't
    regress on older installs.

    Args:
        sqlite3: the already-imported ``sqlite3`` module (the caller imports it
            lazily and passes it in).
        db_path: path to the workspace's ``state.vscdb``.
        ws_id: workspace directory name, used in generated ids.
        default_user: value stored as each event's ``user_ref``.
        cutoff: optional lower bound forwarded to the aiService parser.

    Raises:
        Whatever ``sqlite3.connect`` raises; individual query failures are
        swallowed and treated as "no rows".
    """
    # Open read-only so we never disturb a running Cursor. The URI is built
    # with Path.as_uri() so that characters that are special in a URI ('?',
    # '#', '%') are percent-encoded and Windows drive paths become a valid
    # file:///C:/... form, rather than being pasted in raw.
    uri = f"{Path(db_path).absolute().as_uri()}?mode=ro&immutable=1"
    conn = sqlite3.connect(uri, uri=True, timeout=2.0)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "SELECT key, value FROM ItemTable "
                "WHERE key IN ('aiService.generations', 'aiService.prompts')"
            )
            ai_rows = dict(cur.fetchall())
        except Exception:
            ai_rows = {}

        # Fallback: legacy bubble/chat rows (older Cursor versions only — the
        # current schema lives in aiService.* above).
        legacy_rows: list[tuple] = []
        if not ai_rows.get("aiService.generations"):
            for table in ("cursorDiskKV", "ItemTable"):
                try:
                    # Fixed table names from a literal tuple — not user input.
                    cur.execute(f"SELECT key, value FROM {table}")
                    legacy_rows.extend(cur.fetchall())
                except Exception:
                    continue
    finally:
        conn.close()

    # Path 1: the aiService schema (every modern Cursor).
    events = _parse_cursor_aiservice(ai_rows, ws_id, default_user, cutoff)
    if events:
        return events

    # Path 2: legacy bubbles (no time grouping, one event per workspace).
    legacy = _parse_cursor_bubbles(legacy_rows, ws_id, default_user)
    return [legacy] if legacy else []
599
+
600
+
601
def _parse_cursor_aiservice(  # noqa: PLR0915 - per-session bookkeeping inline
    rows: dict, ws_id: str, default_user: str, cutoff: datetime | None
) -> list[dict]:
    """Build per-session IngestEvents from ``aiService.generations`` (+ ``prompts``).

    Generations are the authoritative records (they carry ``unixMs`` +
    ``generationUUID``). We sort by timestamp, split on ``_CURSOR_SESSION_GAP_SECONDS``
    gaps, and emit one event per resulting session — each with a deterministic
    ``session_id`` derived from workspace + session start so re-runs dedupe via
    the backend's trace-id derivation. User prompts get joined into the
    session-level ``input`` (Cursor doesn't timestamp them so per-turn
    correlation isn't safe).

    The ``--since`` cutoff is applied per generation, but only AFTER the
    sessions have been delimited, and the ``session_id`` is always anchored on
    the session's true first generation. A session that straddles the cutoff
    therefore keeps the same id whatever ``--since`` value a run uses; only the
    spans it carries differ.

    Args:
        rows: ``ItemTable`` key → JSON text for the two ``aiService.*`` keys.
        ws_id: workspace directory name, used in generated ids.
        default_user: value stored as each event's ``user_ref``.
        cutoff: drop generations older than this (None keeps everything).

    Returns:
        One event per session that still has generations; an empty list when
        the generations value is missing, malformed, or fully filtered out.
    """
    raw_gens = rows.get("aiService.generations")
    raw_prompts = rows.get("aiService.prompts")
    if not raw_gens:
        return []
    try:
        gens = json.loads(raw_gens) or []
    except Exception:
        return []
    if not isinstance(gens, list) or not gens:
        return []

    # Sort generations chronologically. Defensive about missing/non-int unixMs.
    def _ts(g: object) -> int:
        if isinstance(g, dict):
            v = g.get("unixMs")
            if isinstance(v, (int, float)):
                try:
                    return int(v)
                except (ValueError, OverflowError):
                    # json.loads accepts NaN/Infinity, which int() rejects.
                    return 0
        return 0

    gens = sorted([g for g in gens if isinstance(g, dict) and _ts(g) > 0], key=_ts)
    if not gens:
        return []

    # Split into sessions on the gap boundary, using the FULL history so that
    # session boundaries (and the ids derived from them) don't depend on cutoff.
    all_sessions: list[list[dict]] = []
    for g in gens:
        if (
            not all_sessions
            or _ts(g) - _ts(all_sessions[-1][-1]) > _CURSOR_SESSION_GAP_SECONDS * 1000
        ):
            all_sessions.append([g])
        else:
            all_sessions[-1].append(g)

    # Apply --since cutoff at the generation level (mtime on the file is too
    # coarse — a workspace touched yesterday may have months of older history).
    # Each surviving session remembers the timestamp of its true first
    # generation as the anchor for its id.
    cutoff_ms = int(cutoff.timestamp() * 1000) if cutoff is not None else None
    sessions: list[tuple[int, list[dict]]] = []
    for group in all_sessions:
        kept = (
            group
            if cutoff_ms is None
            else [g for g in group if _ts(g) >= cutoff_ms]
        )
        if kept:
            sessions.append((_ts(group[0]), kept))
    if not sessions:
        return []

    # User prompts (no timestamps in Cursor's schema) — we attach the full text
    # as the session-level input. Best-effort join; if the JSON is malformed we
    # quietly proceed without prompts.
    prompt_texts: list[str] = []
    if raw_prompts:
        try:
            prompts = json.loads(raw_prompts) or []
            for p in prompts if isinstance(prompts, list) else []:
                if isinstance(p, dict):
                    t = p.get("text")
                    if isinstance(t, str) and t.strip():
                        prompt_texts.append(t.strip())
        except Exception:
            pass

    events: list[dict] = []
    # Distribute prompts across sessions round-robin (best-effort: without
    # timestamps there is no way to know which session a prompt belongs to,
    # and this is better than dumping all of them onto session one).
    prompts_per_session = (
        [prompt_texts[i :: len(sessions)] for i in range(len(sessions))]
        if prompt_texts
        else [[] for _ in sessions]
    )

    for idx, (anchor_ms, gen_list) in enumerate(sessions):
        start_ms = _ts(gen_list[0])
        end_ms = _ts(gen_list[-1])
        # Session-level input from the prompt slice (may be empty).
        my_prompts = prompts_per_session[idx]
        session_input = "\n\n".join(my_prompts)[:_INPUT_CAP] if my_prompts else None
        # Session name = first prompt (truncated), or first generation snippet.
        first_text = (
            my_prompts[0]
            if my_prompts
            else str(gen_list[0].get("textDescription") or "")
        )
        session_name = (
            first_text.strip().replace("\n", " ")[:_NAME_CAP] or f"Cursor: {ws_id}"
        )

        # Estimated session-level input tokens — the total length of every
        # prompt that landed in this session bucket (the prompts have no
        # timestamps so we attribute their tokens at the session, not span).
        # We surface this on the FIRST span so the trace-level rollup picks it
        # up exactly once without double-counting.
        session_input_tokens = sum(_est_tokens(p) for p in my_prompts)

        spans: list[dict] = []
        for span_idx, g in enumerate(gen_list):
            gen_uuid = g.get("generationUUID") or g.get("uuid") or ""
            text = g.get("textDescription") or ""
            if isinstance(text, dict):
                text = json.dumps(text)
            if not isinstance(text, str):
                text = ""
            gen_ms = _ts(g)
            gen_type = g.get("type") or "composer"
            out_tokens = _est_tokens(text)
            # First span carries the session's input-token estimate (sum of
            # prompt text lengths). Subsequent spans only have output tokens.
            input_tokens = session_input_tokens if span_idx == 0 else 0
            spans.append(
                {
                    "span_id": f"cursor-{ws_id}-{gen_uuid or str(gen_ms)}",
                    "name": gen_type,
                    "span_type": "llm",
                    # We don't have separate start/end timestamps; Cursor only
                    # records a single time per generation. Use it for both so
                    # duration is 0 — fair: this is post-hoc, not live timing.
                    "start_time": _ms_to_iso(gen_ms),
                    "end_time": _ms_to_iso(gen_ms),
                    # Cursor doesn't expose its underlying model id in this
                    # storage path; ``cursor:composer`` (or whatever ``type``
                    # the generation declared) flags this as Cursor's own
                    # composer flow. The backend pricing layer doesn't know
                    # this model, so cost stays $0 (correct: Cursor cost is
                    # tier-based, not per-token).
                    "model": f"cursor:{gen_type}",
                    "output": text[:_OUTPUT_CAP],
                    "input_tokens": input_tokens,
                    "output_tokens": out_tokens,
                    "total_tokens": input_tokens + out_tokens,
                    "status": "ok",
                }
            )

        events.append(
            {
                "source": "cursor",
                # Deterministic per session so re-runs dedupe via backend
                # trace_id derivation (uuid5(project, source, session_id)).
                # Anchored on the un-trimmed session start, not on the first
                # generation that survived the cutoff.
                "session_id": f"cursor-{ws_id}-{anchor_ms}",
                "session_name": session_name,
                "user_ref": default_user,
                "model": f"cursor:{gen_list[0].get('type') or 'composer'}",
                "input": session_input,
                "start_time": _ms_to_iso(start_ms),
                "end_time": _ms_to_iso(end_ms),
                "spans": spans,
            }
        )
    return events
759
+
760
+
761
def _est_tokens(text: str) -> int:
    """Estimate a token count from text length at ~4 characters per token.

    Cursor's ``aiService.generations`` stores the rendered reply text but no
    token count, so imported Cursor history gets an estimate instead: the
    character count divided by four, rounded up. Empty text counts as zero.
    Spans estimated this way carry a ``cursor:*`` model name that the backend's
    pricing layer doesn't know, so their cost stays $0 while the token totals
    still roll up.
    """
    if not text:
        return 0
    whole, remainder = divmod(len(text), 4)
    return whole + (1 if remainder else 0)
776
+
777
+
778
def _ms_to_iso(unix_ms: int) -> str:
    """Render a unix-millisecond timestamp as a UTC ISO 8601 string.

    If the value cannot be converted, the current time is returned instead.
    """
    try:
        moment = datetime.fromtimestamp(unix_ms / 1000, tz=timezone.utc)
        return moment.isoformat()
    except Exception:
        return _now_iso()
784
+
785
+
786
def _parse_cursor_bubbles(
    rows: list[tuple], ws_id: str, default_user: str
) -> dict | None:
    """Legacy fallback: build one event per workspace from bubble/chat rows.

    Only rows whose key contains ``bubble`` (case-sensitive) or ``chat`` (any
    case) are decoded. Every assistant bubble with text becomes an ``llm`` span
    stamped with the current time; the first user bubble with text names the
    session. Returns None when no assistant bubble was found.
    """
    # Cursor's legacy bubble schema: 1 == user turn, 2 == assistant turn.
    bubble_user, bubble_assistant = 1, 2

    spans: list[dict] = []
    session_name: str | None = None
    model: str | None = None
    turn = 0
    for key, value in rows:
        if not (isinstance(key, str) and isinstance(value, (str, bytes))):
            continue
        if not ("bubble" in key or "chat" in key.lower()):
            continue
        try:
            decoded = json.loads(value)
        except Exception:
            continue
        for bubble in _cursor_bubbles(decoded):
            text = bubble.get("text") or bubble.get("richText") or ""
            if isinstance(text, dict):
                text = json.dumps(text)
            if not isinstance(text, str) or not text.strip():
                continue
            kind = bubble.get("type")
            from_user = kind == bubble_user or bubble.get("role") == "user"
            from_assistant = (
                kind == bubble_assistant or bubble.get("role") == "assistant"
            )
            model = bubble.get("model") or model
            if from_user and session_name is None:
                session_name = text.strip().replace("\n", " ")[:_NAME_CAP]
            if not from_assistant:
                continue
            turn += 1
            estimated = _est_tokens(text)
            spans.append(
                {
                    "span_id": f"cursor-{ws_id}-gen{turn}",
                    "name": "assistant",
                    "span_type": "llm",
                    "start_time": _now_iso(),
                    "end_time": _now_iso(),
                    "model": model,
                    "output": text[:_OUTPUT_CAP],
                    "output_tokens": estimated,
                    "total_tokens": estimated,
                    "status": "ok",
                }
            )
    if not spans:
        return None
    return {
        "source": "cursor",
        "session_id": f"cursor-{ws_id}",
        "session_name": session_name or f"Cursor: {ws_id}",
        "user_ref": default_user,
        "model": model,
        "spans": spans,
    }
849
+
850
+
851
def _cursor_bubbles(data: object) -> list[dict]:
    """Pull a flat list of bubble dicts out of a decoded Cursor value.

    * a list → its dict elements;
    * a dict that itself looks like a bubble (has ``type`` plus ``text`` or
      ``richText``) → that dict alone;
    * any other dict → the dict elements of the first of ``bubbles``,
      ``messages``, ``conversation`` that holds a list;
    * anything else → an empty list.
    """
    if isinstance(data, list):
        return [item for item in data if isinstance(item, dict)]
    if not isinstance(data, dict):
        return []
    if "type" in data and ("text" in data or "richText" in data):
        return [data]
    for field in ("bubbles", "messages", "conversation"):
        nested = data.get(field)
        if isinstance(nested, list):
            return [item for item in nested if isinstance(item, dict)]
    return []
864
+
865
+
866
+ # ── file discovery ────────────────────────────────────────────────────────────
867
+
868
+
869
def _excluded(path: str) -> bool:
    """Tell whether ``path`` is a sidecar/internal file that must be skipped.

    True when any ``os.sep``-separated component is exactly ``subagents`` or
    when the file name starts with an underscore.
    """
    if "subagents" in path.split(os.sep):
        return True
    filename = os.path.basename(path)
    return filename.startswith("_")
875
+
876
+
877
def discover_claude(cutoff: datetime | None) -> list[str]:
    """List the ``*.jsonl`` files under ``~/.claude/projects`` that ``_walk`` accepts."""

    def _is_transcript(name: str) -> bool:
        return name.endswith(".jsonl")

    projects_dir = os.path.expanduser("~/.claude/projects")
    return _walk(projects_dir, _is_transcript, cutoff)
881
+
882
+
883
def discover_codex(cutoff: datetime | None) -> list[str]:
    """List the ``rollout-*.jsonl`` files under ``~/.codex/sessions`` that ``_walk`` accepts."""

    def _is_rollout(name: str) -> bool:
        return name.startswith("rollout-") and name.endswith(".jsonl")

    sessions_dir = os.path.expanduser("~/.codex/sessions")
    return _walk(sessions_dir, _is_rollout, cutoff)
889
+
890
+
891
+ def _walk(root: str, name_ok, cutoff: datetime | None) -> list[str]:
892
+ """Return matching files under ``root``, excluding sidecars + stale-by-mtime."""
893
+ out: list[str] = []
894
+ if not os.path.isdir(root):
895
+ return out
896
+ for dirpath, dirnames, filenames in os.walk(root):
897
+ # prune subagents dirs so we never descend into them.
898
+ dirnames[:] = [d for d in dirnames if d != "subagents"]
899
+ for name in filenames:
900
+ if not name_ok(name) or name.startswith("_"):
901
+ continue
902
+ path = os.path.join(dirpath, name)
903
+ if _excluded(path):
904
+ continue
905
+ if cutoff is not None:
906
+ try:
907
+ mtime = datetime.fromtimestamp(
908
+ os.path.getmtime(path), tz=timezone.utc
909
+ )
910
+ if mtime < cutoff:
911
+ continue
912
+ except Exception:
913
+ pass
914
+ out.append(path)
915
+ return out
916
+
917
+
918
+ # ── posting ───────────────────────────────────────────────────────────────────
919
+
920
+
921
+ def _filter_spans_by_cutoff(event: dict, cutoff: datetime | None) -> dict | None:
922
+ """Drop spans older than the cutoff; return None if nothing survives.
923
+
924
+ File-level mtime already prunes most stale files; this trims partial files so
925
+ a long-running session that started before the cutoff still imports only its
926
+ recent turns. Spans with no/unparseable timestamp are always kept.
927
+ """
928
+ if cutoff is None:
929
+ return event
930
+ spans = event.get("spans") or []
931
+ kept = []
932
+ for s in spans:
933
+ ts = _parse_ts(s.get("start_time"))
934
+ if ts is None or ts >= cutoff:
935
+ kept.append(s)
936
+ if not kept:
937
+ return None
938
+ event = dict(event)
939
+ event["spans"] = kept
940
+ return event
941
+
942
+
943
def post_event(
    url: str, key: str, event: dict, timeout: float = 20.0
) -> tuple[bool, str]:
    """POST one IngestEvent. Returns ``(ok, detail)``; never raises.

    Args:
        url: Ingest endpoint to POST to.
        key: API key, sent as ``Authorization: Bearer <key>``.
        event: JSON-serialisable event payload.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        ``(True, "span_count=N")`` when the response body is JSON carrying a
        ``span_count``; ``(True, "ok")`` for any other successful response;
        ``(False, detail)`` on an HTTP error (``"HTTP <code> <body[:300]>"``),
        a network/timeout failure, an event that cannot be serialised, or a
        URL that ``urllib`` rejects.
    """
    # Building the request can fail too (non-serialisable payload, URL without
    # a scheme); keep that inside the fail-open contract so one bad event or a
    # mistyped ingest URL is reported as a failure instead of aborting the run.
    try:
        body = json.dumps(event).encode()
        req = urllib.request.Request(
            url,
            data=body,
            headers={
                "Authorization": f"Bearer {key}",
                "Content-Type": "application/json",
            },
            method="POST",
        )
    except Exception as exc:
        return False, str(exc)

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", "replace")
    except urllib.error.HTTPError as exc:
        detail = ""
        # The error body is best-effort context only.
        with contextlib.suppress(Exception):
            detail = exc.read().decode("utf-8", "replace")[:300]
        return False, f"HTTP {exc.code} {detail}".strip()
    except Exception as exc:  # network / timeout — fail-open
        return False, str(exc)

    # The request succeeded; an unparseable body still counts as success.
    try:
        n = json.loads(raw).get("span_count")
    except Exception:
        return True, "ok"
    return True, (f"span_count={n}" if n is not None else "ok")
972
+
973
+
974
+ # ── orchestration ─────────────────────────────────────────────────────────────
975
+
976
+
977
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the backfill script.

    Options: ``--since`` (string, default ``None``) plus the boolean flags
    ``--dry-run`` and ``--verbose``.
    """
    parser = argparse.ArgumentParser(
        description="Import past Claude Code / Codex / Cursor sessions into Octarin.",
        prog="backfill.py",
    )
    parser.add_argument(
        "--since",
        default=None,
        help="Only import sessions newer than this (e.g. 7d, 12h, 2w, or 2026-01-01).",
    )
    # The remaining options are plain on/off switches.
    switches = (
        ("--dry-run", "Parse and count, but do not POST anything."),
        ("--verbose", "Log every file parsed/posted."),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser
998
+
999
+
1000
def _handle_event(event, cutoff, args, url, key, label, totals) -> None:
    """Apply the cutoff to one event, update ``totals``, and POST it unless dry-run.

    ``totals`` is mutated in place: ``skipped`` when nothing survives the
    cutoff filter, otherwise ``events`` and ``spans``, followed by ``posted``
    or ``failed`` depending on the POST outcome. In dry-run mode nothing is
    sent; the event is only logged when ``--verbose`` is set.
    """
    trimmed = _filter_spans_by_cutoff(event, cutoff)
    if not trimmed:
        totals["skipped"] += 1
        return

    span_total = len(trimmed.get("spans") or [])
    totals["events"] += 1
    totals["spans"] += span_total

    if args.dry_run:
        if args.verbose:
            _eprint(
                f"[octarin] [dry-run] {label} {trimmed['session_id']} "
                f"({span_total} spans) {trimmed.get('session_name', '')[:50]}"
            )
        return

    ok, detail = post_event(url, key, trimmed)
    if not ok:
        # Failures are always reported, regardless of --verbose.
        totals["failed"] += 1
        _eprint(f"[octarin] FAILED {trimmed['session_id']}: {detail}")
        return

    totals["posted"] += 1
    if args.verbose:
        _eprint(f"[octarin] posted {trimmed['session_id']} ({detail})")
1024
+
1025
+
1026
def run(argv: list[str] | None = None) -> int:
    """Run the backfill end to end and return a process exit code.

    Parses the CLI arguments, reads ``OCTARIN_API_KEY`` and
    ``OCTARIN_INGEST_URL`` from the environment (falling back to
    ``INGEST_URL_DEFAULT`` for the URL), processes the file-based sources
    (Claude Code, Codex) and then the Cursor sessions, and finally reports the
    totals through ``_eprint``.

    Returns ``1`` only when an upload was requested without an API key;
    otherwise ``0``, even if individual parses or POSTs failed (fail-open).
    """
    args = build_arg_parser().parse_args(argv)
    cutoff = parse_since(args.since)

    env = os.environ
    key = env.get("OCTARIN_API_KEY", "").strip()
    url = env.get("OCTARIN_INGEST_URL", "").strip() or INGEST_URL_DEFAULT

    # A dry run never uploads, so the key is only mandatory for a real run.
    if not (args.dry_run or key):
        _eprint(
            "[octarin] OCTARIN_API_KEY is not set; cannot upload.\n"
            " Run: OCTARIN_API_KEY=<your key> python3 backfill.py"
        )
        return 1

    default_user = _default_user()
    since_label = args.since if args.since else "all history"
    _eprint(f"[octarin] backfill starting (since: {since_label}) → {url}")

    totals = dict.fromkeys(
        ("files", "events", "spans", "posted", "failed", "skipped"), 0
    )

    # File-based sources: (label, discover_fn, parse_fn).
    file_sources = (
        ("Claude Code", discover_claude, parse_claude_transcript),
        ("Codex", discover_codex, parse_codex_rollout),
    )
    for label, discover, parse in file_sources:
        paths = discover(cutoff)
        _eprint(f"[octarin] {label}: {len(paths)} session file(s) to scan")
        for path in paths:
            totals["files"] += 1
            try:
                event = parse(path, default_user)
            except Exception as exc:  # fail-open per file
                event = None
                if args.verbose:
                    _eprint(f"[octarin] parse error {os.path.basename(path)}: {exc}")
            if event:
                _handle_event(event, cutoff, args, url, key, label, totals)
            else:
                totals["skipped"] += 1

    # Cursor is best-effort and takes a separate path: it parses straight to
    # events rather than going through file discovery.
    cursor_events = parse_cursor_sessions(default_user, cutoff, args.verbose)
    if cursor_events:
        _eprint(
            f"[octarin] Cursor: {len(cursor_events)} chat session(s) recovered "
            "(best-effort)"
        )
        for cursor_event in cursor_events:
            _handle_event(cursor_event, cutoff, args, url, key, "Cursor", totals)

    summary = [
        "",
        "[octarin] backfill complete:",
        f" files scanned : {totals['files']}",
        f" sessions found: {totals['events']} ({totals['spans']} spans)",
    ]
    if args.dry_run:
        summary.append(" dry-run : nothing uploaded")
    else:
        summary.append(f" uploaded : {totals['posted']}")
        if totals["failed"]:
            summary.append(
                f" failed : {totals['failed']} "
                "(fail-open; safe to re-run)"
            )
        summary.append(" Re-running is safe — sessions dedupe by trace id.")
    for line in summary:
        _eprint(line)

    # Fail-open: failed POSTs still yield exit code 0 so the backfill never
    # breaks an install pipeline; only the missing-key config error above is
    # nonzero.
    return 0
1102
+
1103
+
1104
def main() -> None:
    """Script entry point: exit with ``run()``'s status, or 130 on Ctrl-C."""
    try:
        status = run()
    except KeyboardInterrupt:
        _eprint("\n[octarin] interrupted")
        status = 130
    sys.exit(status)


# Only run the importer when executed as a script, not when imported.
if __name__ == "__main__":
    main()