spooling 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spooling/parser.py ADDED
@@ -0,0 +1,614 @@
1
+ """Parse session JSONL files from ~/.sessions/projects/.
2
+
3
+ Outputs both the legacy `ParsedSession` (messages + tool counts) and a
4
+ `Trace` built from parentUuid/isSidechain, so ingest can write to both the
5
+ legacy tables and the new traces/spans tables in the same pass.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import difflib
11
+ import json
12
+ import re
13
+ from dataclasses import dataclass, field
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ from spooling.config import SESSIONS_PROJECTS_DIR, CHARS_PER_TOKEN, DEFAULT_PRICING
19
+ from spooling.tracing import (
20
+ Span,
21
+ SpanKind,
22
+ SpanStatus,
23
+ Trace,
24
+ TraceBuilder,
25
+ )
26
+
27
+ _SYSTEM_PREFIXES = re.compile(
28
+ r"^(<(local-command-caveat|system-reminder|command-name|local-command-stdout)>|<!\[CDATA)"
29
+ )
30
+ _XML_TAG_STRIP = re.compile(r"<[^>]+>[^<]*</[^>]+>\s*")
31
+
32
+
33
@dataclass
class ToolCallDetail:
    """A single tool invocation lifted from an assistant `tool_use` block."""

    # `id` of the originating tool_use block ("" when the block carries none).
    tool_use_id: str
    # Tool name as recorded in the block.
    name: str
    # One-line digest of the tool's input.
    input_summary: str
    # Clipped preview text for the call; stays "" until something fills it in.
    result_preview: str = ""
39
+
40
+
41
@dataclass
class ParsedMessage:
    """One user or assistant message in the flat, legacy session view."""

    # Record uuid ("" when the source record has none).
    uuid: str
    # Session the message belongs to.
    session_id: str
    # "user" or "assistant".
    role: str
    # Flattened text content of the message.
    content: str
    # Parsed record timestamp, or None when absent/unparseable.
    timestamp: datetime | None
    # Per-record working directory and git branch, when recorded.
    cwd: str | None = None
    git_branch: str | None = None
    # Names of the tools invoked by this message (assistant messages only).
    tools_used: list[str] = field(default_factory=list)
    # Per-call details matching `tools_used`.
    tool_details: list[ToolCallDetail] = field(default_factory=list)
    # Rough token estimate derived from the content length.
    estimated_tokens: int = 0
53
+
54
+
55
@dataclass
class ParsedSession:
    """A whole parsed session: flat message list, metadata and optional trace."""

    session_id: str
    project: str
    messages: list[ParsedMessage] = field(default_factory=list)
    # Earliest / latest message timestamp, when any message has one.
    started_at: datetime | None = None
    ended_at: datetime | None = None
    # Session-wide metadata.
    cwd: str | None = None
    git_branch: str | None = None
    agent_version: str | None = None
    model: str | None = None
    title: str | None = None
    provider_id: str = "jsonl-session"
    # Span tree built from the same records, when available.
    trace: Optional[Trace] = None

    @property
    def message_count(self) -> int:
        """Number of messages kept for this session."""
        return len(self.messages)

    @property
    def tool_call_count(self) -> int:
        """Total number of tool names recorded across all messages."""
        total = 0
        for msg in self.messages:
            total += len(msg.tools_used)
        return total

    @property
    def estimated_input_tokens(self) -> int:
        """Sum of the token estimates of every user message."""
        from_user = (msg for msg in self.messages if msg.role == "user")
        return sum(msg.estimated_tokens for msg in from_user)

    @property
    def estimated_output_tokens(self) -> int:
        """Sum of the token estimates of every assistant message."""
        from_assistant = (msg for msg in self.messages if msg.role == "assistant")
        return sum(msg.estimated_tokens for msg in from_assistant)
85
+
86
+
87
+ def _summarize_tool_input(name: str, inp: dict) -> str:
88
+ """Create a one-line summary of a tool call's input."""
89
+ def _short_path(p: str) -> str:
90
+ parts = p.rsplit("/", 2)
91
+ return "/".join(parts[-2:]) if len(parts) > 2 else p
92
+
93
+ if name == "Read":
94
+ path = inp.get("file_path", "")
95
+ offset = inp.get("offset")
96
+ limit = inp.get("limit")
97
+ if offset and limit:
98
+ return f"{path}:{offset}-{offset + limit}"
99
+ return path
100
+ if name in ("Edit", "Write"):
101
+ return inp.get("file_path", "")
102
+ if name == "Bash":
103
+ cmd = inp.get("command", "")
104
+ return cmd[:120]
105
+ if name == "Grep":
106
+ pattern = inp.get("pattern", "")
107
+ path = _short_path(inp.get("path", "")) if inp.get("path") else ""
108
+ gl = inp.get("glob", "")
109
+ parts = [f'"{pattern}"']
110
+ if path:
111
+ parts.append(f"in {path}")
112
+ if gl:
113
+ parts.append(f"({gl})")
114
+ return " ".join(parts)
115
+ if name == "Glob":
116
+ return inp.get("pattern", "")
117
+ if name == "Agent":
118
+ return inp.get("description", "")[:100]
119
+ if name in ("WebSearch", "WebFetch"):
120
+ return inp.get("query", inp.get("url", ""))[:120]
121
+ if name == "Skill":
122
+ return inp.get("skill", "")
123
+ if name == "TodoWrite":
124
+ todos = inp.get("todos", [])
125
+ return f"{len(todos)} items"
126
+ if name == "LSP":
127
+ return inp.get("operation", "")
128
+ # Fallback: show first key=value
129
+ for k, v in inp.items():
130
+ return f"{k}={str(v)[:80]}"
131
+ return ""
132
+
133
+
134
+ def _extract_content(message: dict) -> str:
135
+ """Extract text content from a message object."""
136
+ msg = message.get("message", {})
137
+ content = msg.get("content", "")
138
+ if isinstance(content, str):
139
+ return content
140
+ if isinstance(content, list):
141
+ parts = []
142
+ for block in content:
143
+ if isinstance(block, str):
144
+ parts.append(block)
145
+ elif isinstance(block, dict):
146
+ if block.get("type") == "text":
147
+ parts.append(block.get("text", ""))
148
+ elif block.get("type") == "tool_use":
149
+ parts.append(f"[tool: {block.get('name', 'unknown')}]")
150
+ elif block.get("type") == "tool_result":
151
+ pass
152
+ return "\n".join(parts)
153
+ return str(content) if content else ""
154
+
155
+
156
+ def _extract_tool_uses(message: dict) -> list[dict]:
157
+ """Return the tool_use content blocks from an assistant message."""
158
+ msg = message.get("message", {})
159
+ content = msg.get("content", "")
160
+ if not isinstance(content, list):
161
+ return []
162
+ return [
163
+ b for b in content
164
+ if isinstance(b, dict) and b.get("type") == "tool_use"
165
+ ]
166
+
167
+
168
+ def _extract_tool_results(message: dict) -> list[dict]:
169
+ """Return the tool_result content blocks from a user message.
170
+
171
+ Used by the Trace builder which needs the raw blocks to match against
172
+ open tool spans and carry the full text/error info.
173
+ """
174
+ msg = message.get("message", {})
175
+ content = msg.get("content", "")
176
+ if not isinstance(content, list):
177
+ return []
178
+ return [
179
+ b for b in content
180
+ if isinstance(b, dict) and b.get("type") == "tool_result"
181
+ ]
182
+
183
+
184
def _extract_tool_result_previews(message: dict) -> dict[str, str]:
    """Map each tool_use_id in a user message to a short preview of its result.

    Feeds the legacy tool_details view. List-shaped results are reduced to
    their `text` items joined by newlines. Each preview is cut to 500 chars so
    large file reads don't blow up the sessions table. Results without an id,
    or whose content is not text, are left out.
    """
    previews: dict[str, str] = {}
    for block in _extract_tool_results(message):
        key = block.get("tool_use_id", "")
        body = block.get("content", "")
        if isinstance(body, list):
            body = "\n".join(
                item.get("text", "")
                for item in body
                if isinstance(item, dict) and item.get("type") == "text"
            )
        if not key or not isinstance(body, str):
            continue
        previews[key] = body[:500]
    return previews
204
+
205
+
206
+ def _format_edit_diff(inp: dict) -> str:
207
+ """Format an Edit tool input as a unified diff with interleaved hunks."""
208
+ old = inp.get("old_string", "")
209
+ new = inp.get("new_string", "")
210
+ if not old and not new:
211
+ return ""
212
+ old_lines = old.splitlines(keepends=True)
213
+ new_lines = new.splitlines(keepends=True)
214
+ diff = difflib.unified_diff(old_lines, new_lines, lineterm="")
215
+ lines = []
216
+ for line in diff:
217
+ stripped = line.rstrip("\n")
218
+ if not stripped:
219
+ continue
220
+ if stripped.startswith("---") or stripped.startswith("+++") or stripped.startswith("@@"):
221
+ continue
222
+ lines.append(stripped)
223
+ return "\n".join(lines)[:2000]
224
+
225
+
226
+ def _extract_tool_details(message: dict) -> list[ToolCallDetail]:
227
+ """Extract detailed tool call info from an assistant message."""
228
+ msg = message.get("message", {})
229
+ content = msg.get("content", "")
230
+ details = []
231
+ if isinstance(content, list):
232
+ for block in content:
233
+ if isinstance(block, dict) and block.get("type") == "tool_use":
234
+ name = block.get("name", "unknown")
235
+ inp = block.get("input", {}) or {}
236
+ result_preview = ""
237
+ if name == "Edit":
238
+ result_preview = _format_edit_diff(inp)
239
+ elif name == "Write":
240
+ result_preview = (inp.get("content") or "")[:2000]
241
+ details.append(ToolCallDetail(
242
+ tool_use_id=block.get("id", ""),
243
+ name=name,
244
+ input_summary=_summarize_tool_input(name, inp),
245
+ result_preview=result_preview,
246
+ ))
247
+ return details
248
+
249
+
250
+ def _parse_timestamp(raw: str | int | float | None) -> datetime | None:
251
+ """Parse various timestamp formats."""
252
+ if raw is None:
253
+ return None
254
+ if isinstance(raw, (int, float)):
255
+ return datetime.fromtimestamp(raw / 1000, tz=timezone.utc)
256
+ if isinstance(raw, str):
257
+ try:
258
+ return datetime.fromisoformat(raw.replace("Z", "+00:00"))
259
+ except ValueError:
260
+ return None
261
+ return None
262
+
263
+
264
def _price_for(model: str | None) -> tuple[float, float]:
    """Return the pricing pair for `model`, or `DEFAULT_PRICING` when unknown.

    `MODEL_PRICING` is not among this module's top-level imports, so a bare
    reference to it raises NameError for any truthy model. It is resolved
    lazily here instead.
    """
    if not model:
        return DEFAULT_PRICING
    try:
        # NOTE(review): assumes the table lives next to DEFAULT_PRICING in
        # spooling.config — confirm; falls back to the default if it doesn't.
        from spooling.config import MODEL_PRICING
    except ImportError:
        return DEFAULT_PRICING
    return MODEL_PRICING.get(model, DEFAULT_PRICING)
268
+
269
+
270
+ def _cost_for_usage(model: str | None, usage: dict | None) -> float:
271
+ if not usage:
272
+ return 0.0
273
+ from spooling.pricing import get_rates
274
+ rates = get_rates(model)
275
+ return rates.cost(
276
+ input_tokens=usage.get("input_tokens") or 0,
277
+ output_tokens=usage.get("output_tokens") or 0,
278
+ cache_write_tokens=usage.get("cache_creation_input_tokens") or 0,
279
+ cache_read_tokens=usage.get("cache_read_input_tokens") or 0,
280
+ )
281
+
282
+
283
+ def _tool_result_text(block: dict) -> tuple[str, bool | None]:
284
+ """Return (text, is_error) for a tool_result block."""
285
+ content = block.get("content")
286
+ is_error = block.get("is_error")
287
+ if isinstance(content, str):
288
+ return content, is_error
289
+ if isinstance(content, list):
290
+ parts = []
291
+ for c in content:
292
+ if isinstance(c, dict) and c.get("type") == "text":
293
+ parts.append(c.get("text", ""))
294
+ elif isinstance(c, str):
295
+ parts.append(c)
296
+ return "\n".join(parts), is_error
297
+ return "", is_error
298
+
299
+
300
def parse_session_file(file_path: Path) -> ParsedSession | None:
    """Parse a single session JSONL file into messages + trace.

    The session id is the file stem and the project is the parent directory
    name. Only records of type "user" or "assistant" are kept; blank lines,
    lines that are not valid JSON and JSON values that are not objects are
    skipped individually.

    Args:
        file_path: Path to the session's `.jsonl` file.

    Returns:
        The parsed session, or None when the file cannot be read, holds no
        user/assistant records, or none of them has non-blank text content.
    """
    session_id = file_path.stem
    project = file_path.parent.name

    # --- Pass 1: load all records into memory keyed by uuid -------------
    records: list[dict] = []
    by_uuid: dict[str, dict] = {}

    try:
        # JSON text is UTF-8; don't depend on the platform's locale encoding.
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A stray scalar/array line must not discard the whole file.
                if not isinstance(record, dict):
                    continue
                if record.get("type") not in ("user", "assistant"):
                    continue
                records.append(record)
                uid = record.get("uuid")
                if uid:
                    by_uuid[uid] = record
    except Exception as e:
        # Deliberate best-effort boundary: report and skip unreadable files.
        print(f"Error reading {file_path}: {e}")
        return None

    if not records:
        return None

    # Session-wide metadata: first non-empty value of each field wins.
    cwd = git_branch = agent_version = model = None
    for r in records:
        cwd = cwd or r.get("cwd")
        git_branch = git_branch or r.get("gitBranch")
        agent_version = agent_version or r.get("version")
        if not model:
            m = (r.get("message") or {}).get("model")
            if m:
                model = m

    # --- Build ParsedMessage list (legacy path, with tool details) -------
    messages: list[ParsedMessage] = []
    # Tool calls still waiting for a result, keyed by tool_use id. Keeping
    # them across records lets a result find its call even when several
    # calls are logged as separate assistant records before the results.
    awaiting_result: dict[str, list[ToolCallDetail]] = {}
    for record in records:
        rec_type = record["type"]  # "user" or "assistant"

        # Pair tool_result previews with their calls before empty-content
        # user records are filtered out below.
        if rec_type == "user" and awaiting_result:
            for tool_use_id, preview in _extract_tool_result_previews(record).items():
                for td in awaiting_result.pop(tool_use_id, ()):
                    # Edit/Write previews are pre-filled; don't overwrite.
                    if not td.result_preview:
                        td.result_preview = preview

        content = _extract_content(record)
        if not content.strip():
            continue

        tools: list[str] = []
        tool_details: list[ToolCallDetail] = []
        if rec_type == "assistant":
            tools = [b.get("name", "unknown") for b in _extract_tool_uses(record)]
            tool_details = _extract_tool_details(record)
            for td in tool_details:
                awaiting_result.setdefault(td.tool_use_id, []).append(td)

        ts = _parse_timestamp(record.get("timestamp"))
        est_tokens = max(1, len(content) // CHARS_PER_TOKEN)

        messages.append(ParsedMessage(
            uuid=record.get("uuid", ""),
            session_id=session_id,
            role=rec_type,
            content=content,
            timestamp=ts,
            cwd=record.get("cwd"),
            git_branch=record.get("gitBranch"),
            tools_used=tools,
            tool_details=tool_details,
            estimated_tokens=est_tokens,
        ))

    if not messages:
        return None

    # Title from first user message that isn't a system reminder or caveat.
    first_user = next(
        (m for m in messages if m.role == "user" and not _SYSTEM_PREFIXES.match(m.content.strip())),
        None,
    )
    title = None
    if first_user:
        clean = _XML_TAG_STRIP.sub("", first_user.content).strip()
        # If stripping tags leaves nothing, fall back to the raw text.
        clean = clean or first_user.content.strip()
        title = clean[:80].replace("\n", " ").strip()
        if len(clean) > 80:
            title += "..."

    timestamps = [m.timestamp for m in messages if m.timestamp]
    started_at = min(timestamps) if timestamps else None
    ended_at = max(timestamps) if timestamps else None

    # --- Build the Trace -------------------------------------------------
    trace = _build_trace(
        session_id=session_id,
        project=project,
        records=records,
        by_uuid=by_uuid,
        cwd=cwd,
        git_branch=git_branch,
        model=model,
        title=title,
    )

    return ParsedSession(
        session_id=session_id,
        project=project,
        messages=messages,
        started_at=started_at,
        ended_at=ended_at,
        cwd=cwd,
        git_branch=git_branch,
        agent_version=agent_version,
        model=model,
        title=title,
        trace=trace,
    )
433
+
434
+
435
+ def _walk_to_primary(uuid_: str, by_uuid: dict[str, dict]) -> str | None:
436
+ """Walk parentUuid chain until we hit a non-sidechain record.
437
+
438
+ Returns the uuid of the primary-chain assistant message that spawned
439
+ this sidechain (or None if the chain ends outside sidechain-land).
440
+ """
441
+ seen = set()
442
+ cur = uuid_
443
+ while cur and cur not in seen:
444
+ seen.add(cur)
445
+ rec = by_uuid.get(cur)
446
+ if not rec:
447
+ return None
448
+ if not rec.get("isSidechain"):
449
+ return cur
450
+ cur = rec.get("parentUuid")
451
+ return None
452
+
453
+
454
def _build_trace(
    session_id: str,
    project: str,
    records: list[dict],
    by_uuid: dict[str, dict],
    cwd: str | None,
    git_branch: str | None,
    model: str | None,
    title: str | None,
) -> Trace:
    """Turn the session's records into a Trace holding a span tree.

    Structure:
      session (root)
        ├─ llm_call  (per assistant msg, with usage/cost)
        ├─ tool      (per tool_use inside an assistant msg; closed by its tool_result)
        └─ agent     (created for each Task tool_use; parents the sidechain sub-trace)
             ├─ llm_call
             └─ tool ...

    Sidechain records hang under the agent span registered for the primary
    record their parentUuid chain leads to, or under the root when no such
    agent is known. Spans never closed by a tool_result are closed at the end
    using the last parseable timestamp in `records`.
    """
    builder = TraceBuilder(
        provider_id="jsonl-session",
        session_id=session_id,
        project=project,
        cwd=cwd,
        git_branch=git_branch,
        model=model,
        trace_id=f"trace-{session_id}",
    )

    first_ts = _parse_timestamp(records[0].get("timestamp")) if records else None
    root = builder.start_session(
        name=title or f"Session {session_id[:8]}",
        started_at=first_ts,
    )

    # Task tool_use id -> agent span.
    agents_by_task_id: dict[str, Span] = {}
    # uuid of the assistant record holding the Task -> agent span.
    agents_by_message: dict[str, Span] = {}
    # tool_use id -> span still waiting for its tool_result.
    unclosed: dict[str, Span] = {}

    for rec in records:
        uid = rec.get("uuid", "")
        kind = rec.get("type")
        ts = _parse_timestamp(rec.get("timestamp"))
        on_sidechain = bool(rec.get("isSidechain"))

        # Pick the span this record's spans will be attached to.
        owner: Span = root
        if on_sidechain:
            origin = _walk_to_primary(uid, by_uuid)
            if origin and origin in agents_by_message:
                owner = agents_by_message[origin]
            # otherwise: orphan sidechain, stays under the root

        if kind == "user":
            # A user record may wrap tool_results; each closes its open span.
            for result in _extract_tool_results(rec):
                span = unclosed.pop(result.get("tool_use_id") or "", None)
                if span is None:
                    continue
                text, is_error = _tool_result_text(result)
                builder.end_span(
                    span,
                    ended_at=ts,
                    status=SpanStatus.ERROR if is_error else SpanStatus.OK,
                    tool_output=text[:4000] if text else None,
                    tool_is_error=None if is_error is None else bool(is_error),
                )
            continue

        if kind != "assistant":
            continue

        msg = rec.get("message") or {}
        usage = msg.get("usage") or {}
        model_id = msg.get("model") or model
        cost = _cost_for_usage(model_id, usage)

        # The assistant turn is a zero-length llm_call span: opened and
        # closed at the record's own timestamp.
        turn = builder.start_llm_call(
            parent=owner,
            name="assistant.turn",
            started_at=ts,
            model=model_id,
            sidechain=on_sidechain,
            message_uuid=uid,
        )
        builder.end_span(
            turn,
            ended_at=ts,
            input_tokens=usage.get("input_tokens") or 0,
            output_tokens=usage.get("output_tokens") or 0,
            cache_read_tokens=usage.get("cache_read_input_tokens") or 0,
            cache_write_tokens=usage.get("cache_creation_input_tokens") or 0,
            cost_usd=cost,
        )

        # Every tool_use in the turn opens a span of its own.
        for use in _extract_tool_uses(rec):
            tool_name = use.get("name", "unknown")
            tool_id = use.get("id") or f"tu-{uid}-{tool_name}"
            raw_input = use.get("input")
            tool_input = raw_input if isinstance(raw_input, dict) else None

            if tool_name != "Task":
                unclosed[tool_id] = builder.start_tool(
                    parent=owner,
                    name=f"tool:{tool_name}",
                    tool_name=tool_name,
                    started_at=ts,
                    tool_input=tool_input,
                    tool_use_id=tool_id,
                )
                continue

            # Task call: open an agent span under the current owner.
            task_args = tool_input or {}
            sub_type = task_args.get("subagent_type") or "generic"
            agent = builder.start_agent(
                parent=owner,
                name=f"agent:{sub_type}",
                started_at=ts,
                agent_type=sub_type,
                agent_prompt=task_args.get("prompt") or task_args.get("description"),
                task_tool_id=tool_id,
            )
            agents_by_task_id[tool_id] = agent
            agents_by_message[uid] = agent
            # Tracked as open too, so the Task's tool_result closes the agent.
            unclosed[tool_id] = agent

    # Whatever is still open gets closed at the last timestamp that parses.
    parsed_backwards = (_parse_timestamp(r.get("timestamp")) for r in reversed(records))
    last_ts = next((stamp for stamp in parsed_backwards if stamp), None)
    for span in unclosed.values():
        builder.end_span(span, ended_at=last_ts, status=SpanStatus.OK)

    return builder.finalize()
600
+
601
+
602
def discover_session_files() -> list[Path]:
    """List session JSONL files under the projects directory, newest first.

    Looks one level deep: every `*.jsonl` file directly inside each
    sub-directory of `SESSIONS_PROJECTS_DIR`. A file is kept only when its
    stem is 36 characters long with exactly four dashes. The result is
    ordered by modification time, most recent first; it is empty when the
    projects directory does not exist.
    """
    if not SESSIONS_PROJECTS_DIR.exists():
        return []

    def _stem_has_uuid_shape(stem: str) -> bool:
        return len(stem) == 36 and stem.count("-") == 4

    found = [
        candidate
        for project_dir in SESSIONS_PROJECTS_DIR.iterdir()
        if project_dir.is_dir()
        for candidate in project_dir.glob("*.jsonl")
        if _stem_has_uuid_shape(candidate.stem)
    ]
    found.sort(key=lambda path: path.stat().st_mtime, reverse=True)
    return found