octarin-cli 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +202 -0
- package/assets/backfill.py +1113 -0
- package/assets/claude_code/hook.py +573 -0
- package/assets/codex/hook.mjs +487 -0
- package/assets/cursor/hook-handler.js +41 -0
- package/assets/cursor/lib/canonical.js +240 -0
- package/assets/cursor/lib/utils.js +138 -0
- package/assets/repo-template/dot-claude/octarin/hook.py +685 -0
- package/assets/repo-template/dot-claude/octarin/run.sh +41 -0
- package/assets/repo-template/dot-claude/settings.json +15 -0
- package/assets/repo-template/dot-codex/config.toml +6 -0
- package/assets/repo-template/dot-codex/hooks/hook.mjs +531 -0
- package/assets/repo-template/dot-codex/hooks/run.sh +38 -0
- package/assets/repo-template/dot-cursor/hooks/hook-handler.js +41 -0
- package/assets/repo-template/dot-cursor/hooks/lib/canonical.js +240 -0
- package/assets/repo-template/dot-cursor/hooks/lib/utils.js +196 -0
- package/assets/repo-template/dot-cursor/hooks/run.sh +41 -0
- package/assets/repo-template/dot-cursor/hooks.json +13 -0
- package/dist/args.js +85 -0
- package/dist/assets.js +28 -0
- package/dist/client.js +105 -0
- package/dist/envfile.js +94 -0
- package/dist/index.js +192 -0
- package/dist/init.js +314 -0
- package/dist/init_repo.js +348 -0
- package/dist/login.js +209 -0
- package/dist/output.js +56 -0
- package/package.json +37 -0
|
@@ -0,0 +1,1113 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# ruff: noqa: BLE001, S110, S112, S310
|
|
3
|
+
#
|
|
4
|
+
# This is a fail-open one-shot importer: every error path is intentionally a
|
|
5
|
+
# bare ``except Exception: continue`` (S112) / ``pass`` (S110) so a single
|
|
6
|
+
# malformed line, parse failure, or network blip never aborts the run. The
|
|
7
|
+
# ingest URL comes from ``OCTARIN_INGEST_URL`` env (controlled), not user
|
|
8
|
+
# input, so S310 (urlopen with permitted schemes) doesn't apply here.
|
|
9
|
+
"""Octarin history backfill (https://octarin.ai) — team AI-usage analytics tool.
|
|
10
|
+
|
|
11
|
+
Octarin is a team AI-usage analytics tool for AI-assisted coding (the same
|
|
12
|
+
pattern as Langfuse / OpenTelemetry session logging). This one-shot importer
|
|
13
|
+
reads the AI-coding session logs your tools already keep on THIS machine and
|
|
14
|
+
POSTs them, as canonical Octarin ``IngestEvent``s, to your team's shared Octarin
|
|
15
|
+
workspace so your past usage + cost shows up alongside new sessions. It reads
|
|
16
|
+
only local agent-session transcripts you produced and sends them to the ingest
|
|
17
|
+
URL baked into your install.
|
|
18
|
+
|
|
19
|
+
What it scans (read-only):
|
|
20
|
+
* ~/.claude/projects/**/*.jsonl (Claude Code transcripts)
|
|
21
|
+
* ~/.codex/sessions/**/rollout-*.jsonl (Codex rollout logs)
|
|
22
|
+
* Cursor workspaceStorage (best-effort; state.vscdb chat history)
|
|
23
|
+
|
|
24
|
+
Excludes ``_*`` files and any ``subagents/`` directory (sidecar/internal logs).
|
|
25
|
+
|
|
26
|
+
Pure stdlib. Fail-open: a single bad file or failed POST never aborts the run.
|
|
27
|
+
Idempotent: each session maps to a deterministic trace id server-side (derived
|
|
28
|
+
from project + source + session id), so re-running dedupes instead of
|
|
29
|
+
double-counting. Use ``--since`` to limit how far back to import.
|
|
30
|
+
|
|
31
|
+
Usage:
|
|
32
|
+
OCTARIN_API_KEY=<key> python3 backfill.py [--since 7d] [--dry-run] [--verbose]
|
|
33
|
+
|
|
34
|
+
Env:
|
|
35
|
+
OCTARIN_API_KEY (required) the project key, same one the hooks use.
|
|
36
|
+
OCTARIN_INGEST_URL (optional) defaults to https://api.octarin.ai/v1/ingest.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from __future__ import annotations
|
|
40
|
+
|
|
41
|
+
import argparse
|
|
42
|
+
import contextlib
|
|
43
|
+
import getpass
|
|
44
|
+
import json
|
|
45
|
+
import os
|
|
46
|
+
import re
|
|
47
|
+
import subprocess
|
|
48
|
+
import sys
|
|
49
|
+
import urllib.error
|
|
50
|
+
import urllib.request
|
|
51
|
+
from datetime import datetime, timedelta, timezone
|
|
52
|
+
from pathlib import Path
|
|
53
|
+
|
|
54
|
+
# Ingest endpoint used when ``OCTARIN_INGEST_URL`` is not set in the environment.
INGEST_URL_DEFAULT = "https://api.octarin.ai/v1/ingest"

# Truncation budgets, in characters, applied when slicing span payloads. They
# mirror the live hooks so backfilled spans look identical to streamed ones.
_INPUT_CAP = 8_000
_OUTPUT_CAP = 16_000
_NAME_CAP = 80
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string (``+00:00`` offset)."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _eprint(*args: object) -> None:
    """Print ``args`` to stderr (same formatting as ``print``) and flush at once."""
    err_stream = sys.stderr
    print(*args, flush=True, file=err_stream)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ── argument / config plumbing ───────────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def parse_since(spec: str | None) -> datetime | None:
|
|
75
|
+
"""Parse a ``--since`` value into a UTC cutoff datetime (or None for 'all').
|
|
76
|
+
|
|
77
|
+
Accepts a relative span (``30m``, ``12h``, ``7d``, ``2w``) or an absolute
|
|
78
|
+
ISO date/datetime (``2026-01-01`` / ``2026-01-01T00:00:00``). Returns None on
|
|
79
|
+
an empty/unparseable value so the run simply imports everything.
|
|
80
|
+
"""
|
|
81
|
+
if not spec:
|
|
82
|
+
return None
|
|
83
|
+
spec = spec.strip()
|
|
84
|
+
m = re.fullmatch(r"(\d+)\s*([smhdw])", spec.lower())
|
|
85
|
+
if m:
|
|
86
|
+
n = int(m.group(1))
|
|
87
|
+
unit = m.group(2)
|
|
88
|
+
seconds = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}[unit] * n
|
|
89
|
+
return datetime.now(timezone.utc) - timedelta(seconds=seconds)
|
|
90
|
+
for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
|
|
91
|
+
try:
|
|
92
|
+
return datetime.strptime(spec, fmt).replace(tzinfo=timezone.utc)
|
|
93
|
+
except ValueError:
|
|
94
|
+
continue
|
|
95
|
+
_eprint(
|
|
96
|
+
f"[octarin] warning: could not parse --since {spec!r}; importing all history"
|
|
97
|
+
)
|
|
98
|
+
return None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _parse_ts(value: object) -> datetime | None:
|
|
102
|
+
"""Best-effort parse of a timestamp string into an aware UTC datetime."""
|
|
103
|
+
if not isinstance(value, str) or not value:
|
|
104
|
+
return None
|
|
105
|
+
v = value.strip()
|
|
106
|
+
if v.endswith("Z"):
|
|
107
|
+
v = v[:-1] + "+00:00"
|
|
108
|
+
try:
|
|
109
|
+
dt = datetime.fromisoformat(v)
|
|
110
|
+
except ValueError:
|
|
111
|
+
return None
|
|
112
|
+
if dt.tzinfo is None:
|
|
113
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
114
|
+
return dt.astimezone(timezone.utc)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _default_user(cwd: str | None = None) -> str:
|
|
118
|
+
"""Resolve a stable, pseudonymous user_ref (git email, then OS user)."""
|
|
119
|
+
user = os.environ.get("OCTARIN_USER")
|
|
120
|
+
if user:
|
|
121
|
+
return user
|
|
122
|
+
try:
|
|
123
|
+
out = subprocess.check_output(
|
|
124
|
+
["git", "config", "user.email"],
|
|
125
|
+
cwd=cwd or os.getcwd(),
|
|
126
|
+
stderr=subprocess.DEVNULL,
|
|
127
|
+
)
|
|
128
|
+
email = out.decode().strip()
|
|
129
|
+
if email:
|
|
130
|
+
return email
|
|
131
|
+
except Exception:
|
|
132
|
+
pass
|
|
133
|
+
try:
|
|
134
|
+
return getpass.getuser()
|
|
135
|
+
except Exception:
|
|
136
|
+
return "unknown"
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _repo_of(cwd: str | None) -> str | None:
|
|
140
|
+
"""Derive a repo label from a working directory path."""
|
|
141
|
+
if not cwd:
|
|
142
|
+
return None
|
|
143
|
+
base = os.path.basename(cwd.rstrip("/"))
|
|
144
|
+
return base or None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ── Claude Code transcript parser (ported from the install Stop hook) ─────────
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def parse_claude_transcript( # noqa: PLR0915 - top-down jsonl parser; splitting
|
|
151
|
+
path: str, # would scatter the local span-id/state bookkeeping.
|
|
152
|
+
default_user: str,
|
|
153
|
+
) -> dict | None:
|
|
154
|
+
"""Map a Claude Code ``*.jsonl`` transcript to a canonical IngestEvent dict.
|
|
155
|
+
|
|
156
|
+
This mirrors the streaming Stop hook (install/router.py ``HOOK_PY``) so
|
|
157
|
+
backfilled traces are identical to live ones: deterministic session id,
|
|
158
|
+
smart session_name, multi-hop parent_span_id tree (tool/subagent nested
|
|
159
|
+
under their assistant turn), real tool/file content, and span_type
|
|
160
|
+
classification (subagents→agent, search/read→retrieval, else→tool).
|
|
161
|
+
|
|
162
|
+
Returns None when the file has no usable spans.
|
|
163
|
+
"""
|
|
164
|
+
msgs: list[dict] = []
|
|
165
|
+
cwd: str | None = None
|
|
166
|
+
try:
|
|
167
|
+
with Path(path).open(encoding="utf-8", errors="replace") as f:
|
|
168
|
+
for raw_line in f:
|
|
169
|
+
line = raw_line.strip()
|
|
170
|
+
if not line:
|
|
171
|
+
continue
|
|
172
|
+
try:
|
|
173
|
+
obj = json.loads(line)
|
|
174
|
+
except Exception:
|
|
175
|
+
continue
|
|
176
|
+
if isinstance(obj, dict):
|
|
177
|
+
msgs.append(obj)
|
|
178
|
+
if cwd is None and isinstance(obj.get("cwd"), str):
|
|
179
|
+
cwd = obj.get("cwd")
|
|
180
|
+
except Exception:
|
|
181
|
+
return None
|
|
182
|
+
|
|
183
|
+
# session id: prefer an embedded sessionId, else the file stem.
|
|
184
|
+
sid = None
|
|
185
|
+
for m in msgs:
|
|
186
|
+
sid = m.get("sessionId") or m.get("session_id")
|
|
187
|
+
if sid:
|
|
188
|
+
break
|
|
189
|
+
if not sid:
|
|
190
|
+
sid = os.path.splitext(os.path.basename(path))[0] or "unknown"
|
|
191
|
+
|
|
192
|
+
repo = _repo_of(cwd)
|
|
193
|
+
|
|
194
|
+
# First pass: collect tool_result content (and its ts) keyed by tool_use_id.
|
|
195
|
+
results: dict[str, str] = {}
|
|
196
|
+
result_ts: dict[str, str] = {}
|
|
197
|
+
for m in msgs:
|
|
198
|
+
msg = m.get("message", m)
|
|
199
|
+
if not isinstance(msg, dict):
|
|
200
|
+
continue
|
|
201
|
+
mts = m.get("timestamp")
|
|
202
|
+
content = msg.get("content")
|
|
203
|
+
if isinstance(content, list):
|
|
204
|
+
for b in content:
|
|
205
|
+
if isinstance(b, dict) and b.get("type") == "tool_result":
|
|
206
|
+
txt = b.get("content")
|
|
207
|
+
if isinstance(txt, list):
|
|
208
|
+
txt = "\n".join(
|
|
209
|
+
x.get("text", "") for x in txt if isinstance(x, dict)
|
|
210
|
+
)
|
|
211
|
+
tid = b.get("tool_use_id")
|
|
212
|
+
results[tid] = (
|
|
213
|
+
txt if isinstance(txt, str) else json.dumps(txt)
|
|
214
|
+
)
|
|
215
|
+
if mts:
|
|
216
|
+
result_ts[tid] = mts
|
|
217
|
+
|
|
218
|
+
# Second pass: build spans.
|
|
219
|
+
spans: list[dict] = []
|
|
220
|
+
session_name: str | None = None
|
|
221
|
+
model: str | None = None
|
|
222
|
+
gen = 0
|
|
223
|
+
prev_ts: str | None = None # for assistant-span start = prior message ts
|
|
224
|
+
for m in msgs:
|
|
225
|
+
msg = m.get("message", m)
|
|
226
|
+
if not isinstance(msg, dict):
|
|
227
|
+
continue
|
|
228
|
+
role = msg.get("role") or m.get("type")
|
|
229
|
+
ts = m.get("timestamp") or _now_iso()
|
|
230
|
+
content = msg.get("content")
|
|
231
|
+
if session_name is None and role == "user":
|
|
232
|
+
t = content if isinstance(content, str) else None
|
|
233
|
+
if isinstance(content, list):
|
|
234
|
+
t = " ".join(
|
|
235
|
+
b.get("text", "")
|
|
236
|
+
for b in content
|
|
237
|
+
if isinstance(b, dict) and b.get("type") == "text"
|
|
238
|
+
)
|
|
239
|
+
if t and t.strip():
|
|
240
|
+
session_name = t.strip().replace("\n", " ")[:_NAME_CAP]
|
|
241
|
+
if role == "assistant":
|
|
242
|
+
model = msg.get("model") or model
|
|
243
|
+
usage = msg.get("usage") or {}
|
|
244
|
+
out = content if isinstance(content, str) else ""
|
|
245
|
+
if isinstance(content, list):
|
|
246
|
+
out = "\n".join(
|
|
247
|
+
b.get("text", "")
|
|
248
|
+
for b in content
|
|
249
|
+
if isinstance(b, dict) and b.get("type") == "text"
|
|
250
|
+
)
|
|
251
|
+
gen += 1
|
|
252
|
+
spans.append(
|
|
253
|
+
{
|
|
254
|
+
"span_id": f"{sid}-gen{gen}",
|
|
255
|
+
"name": "assistant",
|
|
256
|
+
"span_type": "llm",
|
|
257
|
+
"start_time": prev_ts or ts,
|
|
258
|
+
"end_time": ts,
|
|
259
|
+
"model": model,
|
|
260
|
+
"output": out,
|
|
261
|
+
"input_tokens": usage.get("input_tokens", 0),
|
|
262
|
+
"output_tokens": usage.get("output_tokens", 0),
|
|
263
|
+
"cache_read_tokens": usage.get("cache_read_input_tokens", 0),
|
|
264
|
+
"cache_write_tokens": usage.get("cache_creation_input_tokens", 0),
|
|
265
|
+
"status": "ok",
|
|
266
|
+
}
|
|
267
|
+
)
|
|
268
|
+
if isinstance(content, list):
|
|
269
|
+
for b in content:
|
|
270
|
+
if isinstance(b, dict) and b.get("type") == "tool_use":
|
|
271
|
+
tname = b.get("name", "")
|
|
272
|
+
if tname in ("Task", "Agent"):
|
|
273
|
+
stype, label = "agent", f"subagent:{tname}"
|
|
274
|
+
elif tname in ("Read", "Grep", "Glob", "WebFetch", "WebSearch"):
|
|
275
|
+
stype, label = "retrieval", f"tool:{tname}"
|
|
276
|
+
else:
|
|
277
|
+
stype, label = "tool", f"tool:{tname}"
|
|
278
|
+
tid = b.get("id")
|
|
279
|
+
spans.append(
|
|
280
|
+
{
|
|
281
|
+
"span_id": f"{sid}-{tid or 'tool'}",
|
|
282
|
+
"parent_span_id": f"{sid}-gen{gen}",
|
|
283
|
+
"name": label,
|
|
284
|
+
"span_type": stype,
|
|
285
|
+
"start_time": ts,
|
|
286
|
+
"end_time": result_ts.get(tid, ts),
|
|
287
|
+
"input": json.dumps(b.get("input", {}))[:_INPUT_CAP],
|
|
288
|
+
"output": (results.get(tid) or "")[
|
|
289
|
+
:_OUTPUT_CAP
|
|
290
|
+
],
|
|
291
|
+
"status": "ok",
|
|
292
|
+
}
|
|
293
|
+
)
|
|
294
|
+
prev_ts = ts
|
|
295
|
+
if not spans:
|
|
296
|
+
return None
|
|
297
|
+
if not session_name:
|
|
298
|
+
session_name = f"Claude Code: {repo or sid}"
|
|
299
|
+
return {
|
|
300
|
+
"source": "claude-code",
|
|
301
|
+
"session_id": sid,
|
|
302
|
+
"session_name": session_name,
|
|
303
|
+
"user_ref": default_user,
|
|
304
|
+
"repo": repo,
|
|
305
|
+
"model": model,
|
|
306
|
+
"spans": spans,
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# ── Codex rollout parser ──────────────────────────────────────────────────────
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _codex_text(content: object) -> str:
    """Flatten a Codex content value (string or list of typed blocks) to text.

    A string is returned unchanged. For a list, each dict contributes its
    ``text`` (or, failing that, its ``content``) and each bare string
    contributes itself; the non-empty pieces are joined with newlines. Pieces
    that are not strings (for example a nested list under ``content``) are
    skipped so they cannot make the join raise ``TypeError``. Any other input
    yields ``""``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    pieces: list[str] = []
    for item in content:
        if isinstance(item, dict):
            candidate = item.get("text") or item.get("content")
        else:
            candidate = item
        if isinstance(candidate, str) and candidate:
            pieces.append(candidate)
    return "\n".join(pieces)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def parse_codex_rollout( # noqa: PLR0915 - top-down jsonl parser; same shape
|
|
329
|
+
path: str, # as parse_claude_transcript, kept linear for clarity.
|
|
330
|
+
default_user: str,
|
|
331
|
+
) -> dict | None:
|
|
332
|
+
"""Map a Codex ``rollout-*.jsonl`` session to a canonical IngestEvent dict.
|
|
333
|
+
|
|
334
|
+
Codex wraps each record in ``{type, payload}``. ``response_item`` /
|
|
335
|
+
``event_msg`` payloads carry ``message`` (role user/assistant), ``reasoning``,
|
|
336
|
+
and ``function_call`` / ``function_call_output`` items. We build an
|
|
337
|
+
assistant ``llm`` span per assistant message and nest tool (function-call)
|
|
338
|
+
spans under the most recent assistant turn for the multi-hop tree.
|
|
339
|
+
|
|
340
|
+
Returns None when the file has no usable spans (e.g. a pure SDK exec call).
|
|
341
|
+
"""
|
|
342
|
+
items: list[dict] = []
|
|
343
|
+
sid: str | None = None
|
|
344
|
+
cwd: str | None = None
|
|
345
|
+
model: str | None = None
|
|
346
|
+
try:
|
|
347
|
+
with Path(path).open(encoding="utf-8", errors="replace") as f:
|
|
348
|
+
for raw_line in f:
|
|
349
|
+
line = raw_line.strip()
|
|
350
|
+
if not line:
|
|
351
|
+
continue
|
|
352
|
+
try:
|
|
353
|
+
obj = json.loads(line)
|
|
354
|
+
except Exception:
|
|
355
|
+
continue
|
|
356
|
+
if not isinstance(obj, dict):
|
|
357
|
+
continue
|
|
358
|
+
payload = obj.get("payload")
|
|
359
|
+
if obj.get("type") == "session_meta" and isinstance(payload, dict):
|
|
360
|
+
sid = payload.get("id") or sid
|
|
361
|
+
cwd = payload.get("cwd") or cwd
|
|
362
|
+
model = payload.get("model") or model
|
|
363
|
+
continue
|
|
364
|
+
if isinstance(payload, dict):
|
|
365
|
+
payload.setdefault("_ts", obj.get("timestamp"))
|
|
366
|
+
items.append(payload)
|
|
367
|
+
except Exception:
|
|
368
|
+
return None
|
|
369
|
+
|
|
370
|
+
if not sid:
|
|
371
|
+
# rollout-<iso>-<uuid>.jsonl → take the trailing uuid-ish segment.
|
|
372
|
+
stem = os.path.splitext(os.path.basename(path))[0]
|
|
373
|
+
sid = stem
|
|
374
|
+
m = re.search(r"([0-9a-fA-F-]{8,})$", stem)
|
|
375
|
+
if m:
|
|
376
|
+
sid = m.group(1)
|
|
377
|
+
|
|
378
|
+
repo = _repo_of(cwd)
|
|
379
|
+
spans: list[dict] = []
|
|
380
|
+
session_name: str | None = None
|
|
381
|
+
gen = 0
|
|
382
|
+
last_gen_id: str | None = None
|
|
383
|
+
# Map function_call call_id → its emitted span so outputs can attach.
|
|
384
|
+
call_index: dict[str, dict] = {}
|
|
385
|
+
prev_ts: str | None = None # LLM start ts; tool spans also extend to result ts
|
|
386
|
+
|
|
387
|
+
for it in items:
|
|
388
|
+
ts = it.get("_ts") or _now_iso()
|
|
389
|
+
ptype = it.get("type")
|
|
390
|
+
role = it.get("role") or ""
|
|
391
|
+
model = it.get("model") or model
|
|
392
|
+
|
|
393
|
+
if ptype == "message":
|
|
394
|
+
text = _codex_text(it.get("content"))
|
|
395
|
+
if role == "user":
|
|
396
|
+
# Skip the giant injected developer/instruction preamble for the
|
|
397
|
+
# session name; user prompts are inputs, not billable turns.
|
|
398
|
+
if session_name is None and text and not text.lstrip().startswith("#"):
|
|
399
|
+
session_name = text.strip().replace("\n", " ")[:_NAME_CAP]
|
|
400
|
+
elif role == "assistant":
|
|
401
|
+
gen += 1
|
|
402
|
+
last_gen_id = f"{sid}-gen{gen}"
|
|
403
|
+
spans.append(
|
|
404
|
+
{
|
|
405
|
+
"span_id": last_gen_id,
|
|
406
|
+
"name": "assistant",
|
|
407
|
+
"span_type": "llm",
|
|
408
|
+
"start_time": prev_ts or ts,
|
|
409
|
+
"end_time": ts,
|
|
410
|
+
"model": model,
|
|
411
|
+
"output": text[:_OUTPUT_CAP],
|
|
412
|
+
"status": "ok",
|
|
413
|
+
}
|
|
414
|
+
)
|
|
415
|
+
elif ptype == "function_call":
|
|
416
|
+
name = it.get("name") or "tool"
|
|
417
|
+
call_id = it.get("call_id") or it.get("id") or f"call{len(spans)}"
|
|
418
|
+
if name in ("read_file", "grep", "search", "web_search", "find"):
|
|
419
|
+
stype = "retrieval"
|
|
420
|
+
else:
|
|
421
|
+
stype = "tool"
|
|
422
|
+
arguments = it.get("arguments")
|
|
423
|
+
span = {
|
|
424
|
+
"span_id": f"{sid}-{call_id}",
|
|
425
|
+
"name": f"tool:{name}",
|
|
426
|
+
"span_type": stype,
|
|
427
|
+
"start_time": ts,
|
|
428
|
+
"end_time": ts,
|
|
429
|
+
"input": (
|
|
430
|
+
arguments[:_INPUT_CAP]
|
|
431
|
+
if isinstance(arguments, str)
|
|
432
|
+
else json.dumps(arguments or {})[:_INPUT_CAP]
|
|
433
|
+
),
|
|
434
|
+
"output": "",
|
|
435
|
+
"status": "ok",
|
|
436
|
+
}
|
|
437
|
+
if last_gen_id:
|
|
438
|
+
span["parent_span_id"] = last_gen_id
|
|
439
|
+
spans.append(span)
|
|
440
|
+
call_index[call_id] = span
|
|
441
|
+
elif ptype == "function_call_output":
|
|
442
|
+
call_id = it.get("call_id") or it.get("id")
|
|
443
|
+
target = call_index.get(call_id)
|
|
444
|
+
if target is not None:
|
|
445
|
+
out = it.get("output")
|
|
446
|
+
if isinstance(out, dict):
|
|
447
|
+
out = out.get("content") or json.dumps(out)
|
|
448
|
+
target["output"] = out[:_OUTPUT_CAP] if isinstance(out, str) else ""
|
|
449
|
+
target["end_time"] = ts
|
|
450
|
+
prev_ts = ts
|
|
451
|
+
|
|
452
|
+
if not spans:
|
|
453
|
+
return None
|
|
454
|
+
if not session_name:
|
|
455
|
+
session_name = f"Codex: {repo or sid}"
|
|
456
|
+
return {
|
|
457
|
+
"source": "codex",
|
|
458
|
+
"session_id": sid,
|
|
459
|
+
"session_name": session_name,
|
|
460
|
+
"user_ref": default_user,
|
|
461
|
+
"repo": repo,
|
|
462
|
+
"model": model,
|
|
463
|
+
"spans": spans,
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
# ── Cursor (best-effort) ──────────────────────────────────────────────────────
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def _cursor_storage_root() -> str | None:
|
|
471
|
+
"""Locate the Cursor workspaceStorage dir for the host OS, if present."""
|
|
472
|
+
home = os.path.expanduser("~")
|
|
473
|
+
candidates = [
|
|
474
|
+
os.path.join(
|
|
475
|
+
home, "Library", "Application Support", "Cursor", "User", "workspaceStorage"
|
|
476
|
+
),
|
|
477
|
+
os.path.join(home, ".config", "Cursor", "User", "workspaceStorage"),
|
|
478
|
+
os.path.join(home, "AppData", "Roaming", "Cursor", "User", "workspaceStorage"),
|
|
479
|
+
]
|
|
480
|
+
for c in candidates:
|
|
481
|
+
if os.path.isdir(c):
|
|
482
|
+
return c
|
|
483
|
+
return None
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def parse_cursor_sessions(
|
|
487
|
+
default_user: str, cutoff: datetime | None, verbose: bool
|
|
488
|
+
) -> list[dict]:
|
|
489
|
+
"""Best-effort import of Cursor chat history from workspaceStorage.
|
|
490
|
+
|
|
491
|
+
Two parsers run per workspace in priority order:
|
|
492
|
+
|
|
493
|
+
* **aiService** (current Cursor) — reads ``aiService.generations`` and
|
|
494
|
+
``aiService.prompts`` from ``ItemTable``. Generations carry a ``unixMs``
|
|
495
|
+
timestamp and a ``generationUUID``, so we can split a workspace into
|
|
496
|
+
proper time-bounded sessions (one session per >30-min gap) with
|
|
497
|
+
deterministic ``trace_id``/``span_id`` for idempotent re-runs.
|
|
498
|
+
* **bubbles** (older Cursor) — legacy fallback that walks ``cursorDiskKV``
|
|
499
|
+
and ``ItemTable`` for keys containing ``bubble``/``chat``. Used only when
|
|
500
|
+
the aiService keys are absent.
|
|
501
|
+
|
|
502
|
+
Inherently schema-fragile across Cursor versions, so still fail-open: any
|
|
503
|
+
error on any workspace is skipped (logged under --verbose). Live capture
|
|
504
|
+
via the Cursor hook is the authoritative path; this just backfills the
|
|
505
|
+
history sitting on disk.
|
|
506
|
+
"""
|
|
507
|
+
root = _cursor_storage_root()
|
|
508
|
+
if not root:
|
|
509
|
+
if verbose:
|
|
510
|
+
_eprint("[octarin] cursor: no workspaceStorage found; skipping")
|
|
511
|
+
return []
|
|
512
|
+
try:
|
|
513
|
+
import sqlite3
|
|
514
|
+
except Exception:
|
|
515
|
+
return []
|
|
516
|
+
|
|
517
|
+
events: list[dict] = []
|
|
518
|
+
workspaces_seen = 0
|
|
519
|
+
for ws in sorted(os.listdir(root)):
|
|
520
|
+
db = os.path.join(root, ws, "state.vscdb")
|
|
521
|
+
if not os.path.isfile(db):
|
|
522
|
+
continue
|
|
523
|
+
workspaces_seen += 1
|
|
524
|
+
if cutoff is not None:
|
|
525
|
+
try:
|
|
526
|
+
mtime = datetime.fromtimestamp(os.path.getmtime(db), tz=timezone.utc)
|
|
527
|
+
if mtime < cutoff:
|
|
528
|
+
continue
|
|
529
|
+
except Exception:
|
|
530
|
+
pass
|
|
531
|
+
try:
|
|
532
|
+
ws_events = _parse_cursor_workspace(sqlite3, db, ws, default_user, cutoff)
|
|
533
|
+
except Exception as exc: # fail-open per workspace
|
|
534
|
+
if verbose:
|
|
535
|
+
_eprint(f"[octarin] cursor: skipped {ws}: {exc}")
|
|
536
|
+
continue
|
|
537
|
+
if verbose and ws_events:
|
|
538
|
+
_eprint(f"[octarin] cursor: {ws}: {len(ws_events)} session(s)")
|
|
539
|
+
events.extend(ws_events)
|
|
540
|
+
if verbose:
|
|
541
|
+
_eprint(
|
|
542
|
+
f"[octarin] cursor: scanned {workspaces_seen} workspace(s), "
|
|
543
|
+
f"recovered {len(events)} session(s)"
|
|
544
|
+
)
|
|
545
|
+
return events
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
# Idle gap, in seconds, beyond which two consecutive generations are placed in
# different sessions. Thirty minutes is a pragmatic boundary that matches how
# people use Cursor (start a task, walk away, come back later → two sessions).
_CURSOR_SESSION_GAP_SECONDS = int(timedelta(minutes=30).total_seconds())
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _parse_cursor_workspace(
|
|
555
|
+
sqlite3, db_path: str, ws_id: str, default_user: str, cutoff: datetime | None
|
|
556
|
+
) -> list[dict]:
|
|
557
|
+
"""Read one workspace's state.vscdb → list of IngestEvents (one per session).
|
|
558
|
+
|
|
559
|
+
Reads aiService.generations + aiService.prompts (current Cursor schema). If
|
|
560
|
+
those are absent or empty, falls back to the legacy bubble path so we don't
|
|
561
|
+
regress on older installs.
|
|
562
|
+
"""
|
|
563
|
+
# Open read-only so we never disturb a running Cursor.
|
|
564
|
+
uri = f"file:{db_path}?mode=ro&immutable=1"
|
|
565
|
+
conn = sqlite3.connect(uri, uri=True, timeout=2.0)
|
|
566
|
+
try:
|
|
567
|
+
cur = conn.cursor()
|
|
568
|
+
try:
|
|
569
|
+
cur.execute(
|
|
570
|
+
"SELECT key, value FROM ItemTable "
|
|
571
|
+
"WHERE key IN ('aiService.generations', 'aiService.prompts')"
|
|
572
|
+
)
|
|
573
|
+
ai_rows = dict(cur.fetchall())
|
|
574
|
+
except Exception:
|
|
575
|
+
ai_rows = {}
|
|
576
|
+
|
|
577
|
+
# Fallback: legacy bubble/chat rows (older Cursor versions only — the
|
|
578
|
+
# current schema lives in aiService.* above).
|
|
579
|
+
legacy_rows: list[tuple] = []
|
|
580
|
+
if not ai_rows.get("aiService.generations"):
|
|
581
|
+
for table in ("cursorDiskKV", "ItemTable"):
|
|
582
|
+
try:
|
|
583
|
+
# Fixed table names from a literal tuple — not user input.
|
|
584
|
+
cur.execute(f"SELECT key, value FROM {table}")
|
|
585
|
+
legacy_rows.extend(cur.fetchall())
|
|
586
|
+
except Exception:
|
|
587
|
+
continue
|
|
588
|
+
finally:
|
|
589
|
+
conn.close()
|
|
590
|
+
|
|
591
|
+
# Path 1: the aiService schema (every modern Cursor).
|
|
592
|
+
events = _parse_cursor_aiservice(ai_rows, ws_id, default_user, cutoff)
|
|
593
|
+
if events:
|
|
594
|
+
return events
|
|
595
|
+
|
|
596
|
+
# Path 2: legacy bubbles (no time grouping, one event per workspace).
|
|
597
|
+
legacy = _parse_cursor_bubbles(legacy_rows, ws_id, default_user)
|
|
598
|
+
return [legacy] if legacy else []
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _parse_cursor_aiservice( # noqa: PLR0915 - per-session bookkeeping inline
|
|
602
|
+
rows: dict, ws_id: str, default_user: str, cutoff: datetime | None
|
|
603
|
+
) -> list[dict]:
|
|
604
|
+
"""Build per-session IngestEvents from ``aiService.generations`` (+ ``prompts``).
|
|
605
|
+
|
|
606
|
+
Generations are the authoritative records (they carry ``unixMs`` +
|
|
607
|
+
``generationUUID``). We sort by timestamp, split on ``_CURSOR_SESSION_GAP_SECONDS``
|
|
608
|
+
gaps, and emit one event per resulting session — each with a deterministic
|
|
609
|
+
``session_id`` derived from workspace + session start so re-runs dedupe via
|
|
610
|
+
the backend's trace-id derivation. User prompts get joined into the
|
|
611
|
+
session-level ``input`` (Cursor doesn't timestamp them so per-turn
|
|
612
|
+
correlation isn't safe).
|
|
613
|
+
"""
|
|
614
|
+
raw_gens = rows.get("aiService.generations")
|
|
615
|
+
raw_prompts = rows.get("aiService.prompts")
|
|
616
|
+
if not raw_gens:
|
|
617
|
+
return []
|
|
618
|
+
try:
|
|
619
|
+
gens = json.loads(raw_gens) or []
|
|
620
|
+
except Exception:
|
|
621
|
+
return []
|
|
622
|
+
if not isinstance(gens, list) or not gens:
|
|
623
|
+
return []
|
|
624
|
+
|
|
625
|
+
# Sort generations chronologically. Defensive about missing/non-int unixMs.
|
|
626
|
+
def _ts(g: object) -> int:
|
|
627
|
+
if isinstance(g, dict):
|
|
628
|
+
v = g.get("unixMs")
|
|
629
|
+
if isinstance(v, (int, float)):
|
|
630
|
+
return int(v)
|
|
631
|
+
return 0
|
|
632
|
+
|
|
633
|
+
gens = sorted([g for g in gens if isinstance(g, dict) and _ts(g) > 0], key=_ts)
|
|
634
|
+
if not gens:
|
|
635
|
+
return []
|
|
636
|
+
|
|
637
|
+
# Apply --since cutoff at the generation level (mtime on the file is too
|
|
638
|
+
# coarse — a workspace touched yesterday may have months of older history).
|
|
639
|
+
if cutoff is not None:
|
|
640
|
+
cutoff_ms = int(cutoff.timestamp() * 1000)
|
|
641
|
+
gens = [g for g in gens if _ts(g) >= cutoff_ms]
|
|
642
|
+
if not gens:
|
|
643
|
+
return []
|
|
644
|
+
|
|
645
|
+
# User prompts (no timestamps in Cursor's schema) — we attach the full text
|
|
646
|
+
# as the session-level input. Best-effort join; if the JSON is malformed we
|
|
647
|
+
# quietly proceed without prompts.
|
|
648
|
+
prompt_texts: list[str] = []
|
|
649
|
+
if raw_prompts:
|
|
650
|
+
try:
|
|
651
|
+
prompts = json.loads(raw_prompts) or []
|
|
652
|
+
for p in prompts if isinstance(prompts, list) else []:
|
|
653
|
+
if isinstance(p, dict):
|
|
654
|
+
t = p.get("text")
|
|
655
|
+
if isinstance(t, str) and t.strip():
|
|
656
|
+
prompt_texts.append(t.strip())
|
|
657
|
+
except Exception:
|
|
658
|
+
pass
|
|
659
|
+
|
|
660
|
+
# Split into sessions on the gap boundary.
|
|
661
|
+
sessions: list[list[dict]] = []
|
|
662
|
+
for g in gens:
|
|
663
|
+
if (
|
|
664
|
+
not sessions
|
|
665
|
+
or _ts(g) - _ts(sessions[-1][-1]) > _CURSOR_SESSION_GAP_SECONDS * 1000
|
|
666
|
+
):
|
|
667
|
+
sessions.append([g])
|
|
668
|
+
else:
|
|
669
|
+
sessions[-1].append(g)
|
|
670
|
+
|
|
671
|
+
events: list[dict] = []
|
|
672
|
+
# Distribute prompts proportionally across sessions (best-effort: split
|
|
673
|
+
# evenly by session count — better than dumping all of them onto session
|
|
674
|
+
# one and worse than impossible without timestamps).
|
|
675
|
+
prompts_per_session = (
|
|
676
|
+
[prompt_texts[i :: len(sessions)] for i in range(len(sessions))]
|
|
677
|
+
if prompt_texts and sessions
|
|
678
|
+
else [[] for _ in sessions]
|
|
679
|
+
)
|
|
680
|
+
|
|
681
|
+
for idx, gen_list in enumerate(sessions):
|
|
682
|
+
start_ms = _ts(gen_list[0])
|
|
683
|
+
end_ms = _ts(gen_list[-1])
|
|
684
|
+
# Session-level input from the prompt slice (may be empty).
|
|
685
|
+
my_prompts = prompts_per_session[idx]
|
|
686
|
+
session_input = "\n\n".join(my_prompts)[:_INPUT_CAP] if my_prompts else None
|
|
687
|
+
# Session name = first prompt (truncated), or first generation snippet.
|
|
688
|
+
first_text = (
|
|
689
|
+
my_prompts[0]
|
|
690
|
+
if my_prompts
|
|
691
|
+
else str(gen_list[0].get("textDescription") or "")
|
|
692
|
+
)
|
|
693
|
+
session_name = (
|
|
694
|
+
first_text.strip().replace("\n", " ")[:_NAME_CAP] or f"Cursor: {ws_id}"
|
|
695
|
+
)
|
|
696
|
+
|
|
697
|
+
# Estimated session-level input tokens — the total length of every
|
|
698
|
+
# prompt that landed in this session bucket (the prompts have no
|
|
699
|
+
# timestamps so we attribute their tokens at the session, not span).
|
|
700
|
+
# We surface this on the FIRST span so the trace-level rollup picks it
|
|
701
|
+
# up exactly once without double-counting.
|
|
702
|
+
session_input_tokens = sum(_est_tokens(p) for p in my_prompts)
|
|
703
|
+
|
|
704
|
+
spans: list[dict] = []
|
|
705
|
+
for span_idx, g in enumerate(gen_list):
|
|
706
|
+
gen_uuid = g.get("generationUUID") or g.get("uuid") or ""
|
|
707
|
+
text = g.get("textDescription") or ""
|
|
708
|
+
if isinstance(text, dict):
|
|
709
|
+
text = json.dumps(text)
|
|
710
|
+
if not isinstance(text, str):
|
|
711
|
+
text = ""
|
|
712
|
+
gen_ms = _ts(g)
|
|
713
|
+
gen_type = g.get("type") or "composer"
|
|
714
|
+
out_tokens = _est_tokens(text)
|
|
715
|
+
# First span carries the session's input-token estimate (sum of
|
|
716
|
+
# prompt text lengths). Subsequent spans only have output tokens.
|
|
717
|
+
input_tokens = session_input_tokens if span_idx == 0 else 0
|
|
718
|
+
spans.append(
|
|
719
|
+
{
|
|
720
|
+
"span_id": f"cursor-{ws_id}-{gen_uuid or str(gen_ms)}",
|
|
721
|
+
"name": gen_type,
|
|
722
|
+
"span_type": "llm",
|
|
723
|
+
# We don't have separate start/end timestamps; Cursor only
|
|
724
|
+
# records a single time per generation. Use it for both so
|
|
725
|
+
# duration is 0 — fair: this is post-hoc, not live timing.
|
|
726
|
+
"start_time": _ms_to_iso(gen_ms),
|
|
727
|
+
"end_time": _ms_to_iso(gen_ms),
|
|
728
|
+
# Cursor doesn't expose its underlying model id in this
|
|
729
|
+
# storage path; ``cursor:composer`` (or whatever ``type``
|
|
730
|
+
# the generation declared) flags this as Cursor's own
|
|
731
|
+
# composer flow. The backend pricing layer doesn't know
|
|
732
|
+
# this model, so cost stays $0 (correct: Cursor cost is
|
|
733
|
+
# tier-based, not per-token).
|
|
734
|
+
"model": f"cursor:{gen_type}",
|
|
735
|
+
"output": text[:_OUTPUT_CAP],
|
|
736
|
+
"input_tokens": input_tokens,
|
|
737
|
+
"output_tokens": out_tokens,
|
|
738
|
+
"total_tokens": input_tokens + out_tokens,
|
|
739
|
+
"status": "ok",
|
|
740
|
+
}
|
|
741
|
+
)
|
|
742
|
+
|
|
743
|
+
events.append(
|
|
744
|
+
{
|
|
745
|
+
"source": "cursor",
|
|
746
|
+
# Deterministic per session so re-runs dedupe via backend
|
|
747
|
+
# trace_id derivation (uuid5(project, source, session_id)).
|
|
748
|
+
"session_id": f"cursor-{ws_id}-{start_ms}",
|
|
749
|
+
"session_name": session_name,
|
|
750
|
+
"user_ref": default_user,
|
|
751
|
+
"model": f"cursor:{gen_list[0].get('type') or 'composer'}",
|
|
752
|
+
"input": session_input,
|
|
753
|
+
"start_time": _ms_to_iso(start_ms),
|
|
754
|
+
"end_time": _ms_to_iso(end_ms),
|
|
755
|
+
"spans": spans,
|
|
756
|
+
}
|
|
757
|
+
)
|
|
758
|
+
return events
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
def _est_tokens(text: str) -> int:
    """Approximate a token count from character length (~4 chars per token).

    Cursor's ``aiService.generations`` stores the rendered reply text but no
    token count, so imported Cursor history would otherwise contribute nothing
    to tokens-per-day / tokens-per-session analytics. The estimate is the text
    length divided by four, rounded up. Spans built from it carry a
    ``cursor:composer`` model name that the backend pricing layer does not
    know, so cost stays $0 while token counts still roll up.

    Returns 0 for empty (or otherwise falsy) input.
    """
    if text:
        # Ceiling division: any leftover characters count as one more token.
        whole, leftover = divmod(len(text), 4)
        return whole + 1 if leftover else whole
    return 0
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
def _ms_to_iso(unix_ms: int) -> str:
    """Render a unix timestamp given in milliseconds as a UTC ISO 8601 string.

    Any conversion failure (wrong type, out-of-range value, ...) falls back to
    the current time from ``_now_iso()`` instead of raising.
    """
    try:
        seconds = unix_ms / 1000
        moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
        return moment.isoformat()
    except Exception:
        return _now_iso()
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def _parse_cursor_bubbles(
|
|
787
|
+
rows: list[tuple], ws_id: str, default_user: str
|
|
788
|
+
) -> dict | None:
|
|
789
|
+
"""Legacy fallback: walk bubble/chat rows from older Cursor versions.
|
|
790
|
+
|
|
791
|
+
Same shape as the original parser this file shipped with — kept untouched
|
|
792
|
+
so any pre-aiService Cursor install still gets best-effort backfill.
|
|
793
|
+
"""
|
|
794
|
+
spans: list[dict] = []
|
|
795
|
+
session_name: str | None = None
|
|
796
|
+
model: str | None = None
|
|
797
|
+
gen = 0
|
|
798
|
+
for key, value in rows:
|
|
799
|
+
if not isinstance(key, str) or not isinstance(value, (str, bytes)):
|
|
800
|
+
continue
|
|
801
|
+
if "bubble" not in key and "chat" not in key.lower():
|
|
802
|
+
continue
|
|
803
|
+
try:
|
|
804
|
+
data = json.loads(value)
|
|
805
|
+
except Exception:
|
|
806
|
+
continue
|
|
807
|
+
for b in _cursor_bubbles(data):
|
|
808
|
+
text = b.get("text") or b.get("richText") or ""
|
|
809
|
+
if isinstance(text, dict):
|
|
810
|
+
text = json.dumps(text)
|
|
811
|
+
if not isinstance(text, str) or not text.strip():
|
|
812
|
+
continue
|
|
813
|
+
btype = b.get("type")
|
|
814
|
+
# Cursor's legacy bubble schema: 1 == user turn, 2 == assistant turn.
|
|
815
|
+
_BUBBLE_USER = 1
|
|
816
|
+
_BUBBLE_ASSISTANT = 2
|
|
817
|
+
is_user = btype == _BUBBLE_USER or b.get("role") == "user"
|
|
818
|
+
is_asst = btype == _BUBBLE_ASSISTANT or b.get("role") == "assistant"
|
|
819
|
+
model = b.get("model") or model
|
|
820
|
+
if is_user and session_name is None:
|
|
821
|
+
session_name = text.strip().replace("\n", " ")[:_NAME_CAP]
|
|
822
|
+
if is_asst:
|
|
823
|
+
gen += 1
|
|
824
|
+
out_tokens = _est_tokens(text)
|
|
825
|
+
spans.append(
|
|
826
|
+
{
|
|
827
|
+
"span_id": f"cursor-{ws_id}-gen{gen}",
|
|
828
|
+
"name": "assistant",
|
|
829
|
+
"span_type": "llm",
|
|
830
|
+
"start_time": _now_iso(),
|
|
831
|
+
"end_time": _now_iso(),
|
|
832
|
+
"model": model,
|
|
833
|
+
"output": text[:_OUTPUT_CAP],
|
|
834
|
+
"output_tokens": out_tokens,
|
|
835
|
+
"total_tokens": out_tokens,
|
|
836
|
+
"status": "ok",
|
|
837
|
+
}
|
|
838
|
+
)
|
|
839
|
+
if not spans:
|
|
840
|
+
return None
|
|
841
|
+
return {
|
|
842
|
+
"source": "cursor",
|
|
843
|
+
"session_id": f"cursor-{ws_id}",
|
|
844
|
+
"session_name": session_name or f"Cursor: {ws_id}",
|
|
845
|
+
"user_ref": default_user,
|
|
846
|
+
"model": model,
|
|
847
|
+
"spans": spans,
|
|
848
|
+
}
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
def _cursor_bubbles(data: object) -> list[dict]:
    """Flatten a decoded Cursor value into a list of chat-bubble dicts.

    Accepted shapes:

    * a single bubble dict (has ``type`` plus ``text`` or ``richText``),
    * a dict wrapping a list under ``bubbles``, ``messages`` or
      ``conversation`` (the first of those keys holding a list wins),
    * a bare list.

    Non-dict list items are dropped; any other input yields ``[]``.
    """
    if isinstance(data, list):
        candidates = data
    elif isinstance(data, dict):
        if "type" in data and ("text" in data or "richText" in data):
            return [data]
        candidates = []
        for container_key in ("bubbles", "messages", "conversation"):
            held = data.get(container_key)
            if isinstance(held, list):
                candidates = held
                break
    else:
        return []
    return [item for item in candidates if isinstance(item, dict)]
|
|
864
|
+
|
|
865
|
+
|
|
866
|
+
# ── file discovery ────────────────────────────────────────────────────────────
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
def _excluded(path: str) -> bool:
    """Report whether ``path`` is a sidecar/internal file that must be skipped.

    A path is excluded when any of its ``os.sep``-separated components is
    exactly ``subagents`` or when its basename starts with an underscore.
    """
    if "subagents" in path.split(os.sep):
        return True
    leaf = os.path.basename(path)
    return leaf.startswith("_")
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
def discover_claude(cutoff: datetime | None) -> list[str]:
    """List Claude transcript ``.jsonl`` files under ``~/.claude/projects``.

    Sidecar exclusion and the ``cutoff`` mtime filter are applied by ``_walk``.
    """

    def is_transcript(name: str) -> bool:
        return name.endswith(".jsonl")

    projects_dir = os.path.expanduser("~/.claude/projects")
    return _walk(projects_dir, is_transcript, cutoff)
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
def discover_codex(cutoff: datetime | None) -> list[str]:
    """List Codex ``rollout-*.jsonl`` files under ``~/.codex/sessions``.

    Sidecar exclusion and the ``cutoff`` mtime filter are applied by ``_walk``.
    """

    def is_rollout(name: str) -> bool:
        return name.startswith("rollout-") and name.endswith(".jsonl")

    sessions_dir = os.path.expanduser("~/.codex/sessions")
    return _walk(sessions_dir, is_rollout, cutoff)
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
def _walk(root: str, name_ok, cutoff: datetime | None) -> list[str]:
|
|
892
|
+
"""Return matching files under ``root``, excluding sidecars + stale-by-mtime."""
|
|
893
|
+
out: list[str] = []
|
|
894
|
+
if not os.path.isdir(root):
|
|
895
|
+
return out
|
|
896
|
+
for dirpath, dirnames, filenames in os.walk(root):
|
|
897
|
+
# prune subagents dirs so we never descend into them.
|
|
898
|
+
dirnames[:] = [d for d in dirnames if d != "subagents"]
|
|
899
|
+
for name in filenames:
|
|
900
|
+
if not name_ok(name) or name.startswith("_"):
|
|
901
|
+
continue
|
|
902
|
+
path = os.path.join(dirpath, name)
|
|
903
|
+
if _excluded(path):
|
|
904
|
+
continue
|
|
905
|
+
if cutoff is not None:
|
|
906
|
+
try:
|
|
907
|
+
mtime = datetime.fromtimestamp(
|
|
908
|
+
os.path.getmtime(path), tz=timezone.utc
|
|
909
|
+
)
|
|
910
|
+
if mtime < cutoff:
|
|
911
|
+
continue
|
|
912
|
+
except Exception:
|
|
913
|
+
pass
|
|
914
|
+
out.append(path)
|
|
915
|
+
return out
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
# ── posting ───────────────────────────────────────────────────────────────────
|
|
919
|
+
|
|
920
|
+
|
|
921
|
+
def _filter_spans_by_cutoff(event: dict, cutoff: datetime | None) -> dict | None:
|
|
922
|
+
"""Drop spans older than the cutoff; return None if nothing survives.
|
|
923
|
+
|
|
924
|
+
File-level mtime already prunes most stale files; this trims partial files so
|
|
925
|
+
a long-running session that started before the cutoff still imports only its
|
|
926
|
+
recent turns. Spans with no/unparseable timestamp are always kept.
|
|
927
|
+
"""
|
|
928
|
+
if cutoff is None:
|
|
929
|
+
return event
|
|
930
|
+
spans = event.get("spans") or []
|
|
931
|
+
kept = []
|
|
932
|
+
for s in spans:
|
|
933
|
+
ts = _parse_ts(s.get("start_time"))
|
|
934
|
+
if ts is None or ts >= cutoff:
|
|
935
|
+
kept.append(s)
|
|
936
|
+
if not kept:
|
|
937
|
+
return None
|
|
938
|
+
event = dict(event)
|
|
939
|
+
event["spans"] = kept
|
|
940
|
+
return event
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
def post_event(
    url: str, key: str, event: dict, timeout: float = 20.0
) -> tuple[bool, str]:
    """POST one IngestEvent. Returns ``(ok, detail)``; never raises.

    Args:
        url: Ingest endpoint.
        key: API key, sent as a ``Bearer`` token.
        event: JSON-serialisable event payload.
        timeout: Socket timeout in seconds for the request.

    Returns:
        ``(True, "span_count=<n>")`` when the response body is JSON carrying a
        ``span_count``, ``(True, "ok")`` for any other successful response,
        and ``(False, <reason>)`` on failure. ``<reason>`` is ``HTTP <code>``
        plus up to 300 characters of the error body for HTTP errors, or the
        exception text otherwise.
    """
    try:
        # Serialisation and request construction live inside the ``try`` so a
        # non-JSON-serialisable event or a malformed URL is reported as a
        # failed upload instead of escaping and aborting the whole run.
        body = json.dumps(event).encode()
        req = urllib.request.Request(
            url,
            data=body,
            headers={
                "Authorization": f"Bearer {key}",
                "Content-Type": "application/json",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", "replace")
    except urllib.error.HTTPError as exc:
        detail = ""
        with contextlib.suppress(Exception):
            detail = exc.read().decode("utf-8", "replace")[:300]
        return False, f"HTTP {exc.code} {detail}".strip()
    except Exception as exc:  # bad input / network / timeout — fail-open
        # Some exceptions stringify to "", so fall back to the class name to
        # keep the failure line informative.
        return False, str(exc) or type(exc).__name__

    # The upload succeeded; the response body is only used for a nicer detail.
    try:
        n = json.loads(raw).get("span_count")
    except Exception:
        return True, "ok"
    return True, (f"span_count={n}" if n is not None else "ok")
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
# ── orchestration ─────────────────────────────────────────────────────────────
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the backfill script.

    Options: ``--since`` (optional string, default ``None``) and the boolean
    switches ``--dry-run`` and ``--verbose``.
    """
    parser = argparse.ArgumentParser(
        prog="backfill.py",
        description="Import past Claude Code / Codex / Cursor sessions into Octarin.",
    )
    parser.add_argument(
        "--since",
        default=None,
        help="Only import sessions newer than this (e.g. 7d, 12h, 2w, or 2026-01-01).",
    )
    switches = (
        ("--dry-run", "Parse and count, but do not POST anything."),
        ("--verbose", "Log every file parsed/posted."),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)
    return parser
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
def _handle_event(
    event: dict,
    cutoff: datetime | None,
    args: argparse.Namespace,
    url: str,
    key: str,
    label: str,
    totals: dict[str, int],
) -> None:
    """Filter by cutoff, account it, and (unless dry-run) POST one event.

    Args:
        event: Parsed session event; must carry ``session_id``.
        cutoff: Spans older than this are dropped first (``None`` keeps all).
        args: Parsed CLI options; ``dry_run`` and ``verbose`` are read.
        url: Ingest endpoint passed to ``post_event``.
        key: API key passed to ``post_event``.
        label: Human-readable source name used in dry-run log lines.
        totals: Running counters, updated in place (``skipped``, ``events``,
            ``spans``, ``posted``, ``failed``).
    """
    event = _filter_spans_by_cutoff(event, cutoff)
    if not event:
        totals["skipped"] += 1
        return

    n_spans = len(event.get("spans") or [])
    totals["events"] += 1
    totals["spans"] += n_spans

    if args.dry_run:
        if args.verbose:
            # ``session_name`` may be present but None (or not a string); a
            # plain ``.get(..., '')[:50]`` would raise on that, so coerce.
            name_preview = str(event.get("session_name") or "")[:50]
            _eprint(
                f"[octarin] [dry-run] {label} {event['session_id']} "
                f"({n_spans} spans) {name_preview}"
            )
        return

    ok, detail = post_event(url, key, event)
    if ok:
        totals["posted"] += 1
        if args.verbose:
            _eprint(f"[octarin] posted {event['session_id']} ({detail})")
    else:
        # Failures are always reported, even without --verbose.
        totals["failed"] += 1
        _eprint(f"[octarin] FAILED {event['session_id']}: {detail}")
|
|
1024
|
+
|
|
1025
|
+
|
|
1026
|
+
def run(argv: list[str] | None = None) -> int:
    """Run the backfill: scan local session history and upload it.

    Reads ``OCTARIN_API_KEY`` and ``OCTARIN_INGEST_URL`` from the environment,
    processes the file-based sources (Claude Code, Codex) and then Cursor, and
    prints a summary.

    Returns the process exit code: 1 when an upload was requested but no API
    key is configured, otherwise 0 — even if some uploads failed.
    """
    args = build_arg_parser().parse_args(argv)
    cutoff = parse_since(args.since)

    key = os.environ.get("OCTARIN_API_KEY", "").strip()
    url = os.environ.get("OCTARIN_INGEST_URL", "").strip() or INGEST_URL_DEFAULT

    # A dry run never uploads, so it is allowed to proceed without a key.
    if not (args.dry_run or key):
        _eprint(
            "[octarin] OCTARIN_API_KEY is not set; cannot upload.\n"
            " Run: OCTARIN_API_KEY=<your key> python3 backfill.py"
        )
        return 1

    default_user = _default_user()
    since_label = args.since or "all history"
    _eprint(f"[octarin] backfill starting (since: {since_label}) → {url}")

    totals = dict.fromkeys(
        ("files", "events", "spans", "posted", "failed", "skipped"), 0
    )

    # (label, discover_fn, parse_fn) for the file-based sources.
    file_sources = (
        ("Claude Code", discover_claude, parse_claude_transcript),
        ("Codex", discover_codex, parse_codex_rollout),
    )
    for label, discover, parse in file_sources:
        files = discover(cutoff)
        _eprint(f"[octarin] {label}: {len(files)} session file(s) to scan")
        for path in files:
            totals["files"] += 1
            try:
                parsed = parse(path, default_user)
            except Exception as exc:  # fail-open per file
                parsed = None
                if args.verbose:
                    _eprint(f"[octarin] parse error {os.path.basename(path)}: {exc}")
            if not parsed:
                totals["skipped"] += 1
                continue
            _handle_event(parsed, cutoff, args, url, key, label, totals)

    # Cursor (best-effort, separate path: parses straight to events).
    cursor_events = parse_cursor_sessions(default_user, cutoff, args.verbose)
    if cursor_events:
        _eprint(
            f"[octarin] Cursor: {len(cursor_events)} chat session(s) recovered "
            "(best-effort)"
        )
        for cursor_event in cursor_events:
            _handle_event(cursor_event, cutoff, args, url, key, "Cursor", totals)

    _eprint("")
    _eprint("[octarin] backfill complete:")
    _eprint(f" files scanned : {totals['files']}")
    _eprint(f" sessions found: {totals['events']} ({totals['spans']} spans)")
    if args.dry_run:
        _eprint(" dry-run : nothing uploaded")
    else:
        _eprint(f" uploaded : {totals['posted']}")
        if totals["failed"]:
            _eprint(
                f" failed : {totals['failed']} "
                "(fail-open; safe to re-run)"
            )
        _eprint(" Re-running is safe — sessions dedupe by trace id.")

    # Fail-open: a backfill with some failed POSTs still exits 0 so it never
    # breaks an install pipeline. Only a hard config error (missing key) is
    # nonzero.
    return 0
|
|
1102
|
+
|
|
1103
|
+
|
|
1104
|
+
def main() -> None:
    """Script entry point: exit with ``run()``'s code, or 130 on Ctrl-C."""
    try:
        exit_code = run()
    except KeyboardInterrupt:
        _eprint("\n[octarin] interrupted")
        exit_code = 130
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|