gora-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gora/parsers.py ADDED
@@ -0,0 +1,626 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import hashlib
5
+ import json
6
+ from pathlib import Path
7
+ import re
8
+ from typing import Any, Iterable
9
+
10
+
11
# Providers whose local chat histories this module knows how to locate and parse.
PROVIDERS = ("codex", "claude", "pi")

# Leading text of messages that carry injected AGENTS.md context.
CONTEXT_INJECTION_PREFIXES = (
    "# AGENTS.md instructions",
    "AGENTS.md instructions",
)

# Leading text of messages that must never be used as a session title.
TITLE_NOISE_PREFIXES = CONTEXT_INJECTION_PREFIXES + (
    "<turn_aborted>",
    "<user_action>",
    "<environment_context>",
)

# Matches an opening or closing <image ...> tag, including an unterminated one.
IMAGE_TAG_PATTERN = re.compile(r"(?s)</?image\b[^>]*(>|$)")

# Token formats that are redacted wherever they appear in message text.
DIRECT_SECRET_PATTERNS: tuple[re.Pattern[str], ...] = tuple(
    re.compile(expression)
    for expression in (
        r"\bsk-proj-[A-Za-z0-9_-]{12,}\b",
        r"\bsk-ant-[A-Za-z0-9_-]{12,}\b",
        r"\bsk-[A-Za-z0-9_-]{20,}\b",
        r"\bgithub_pat_[A-Za-z0-9_]{20,}\b",
        r"\bgh[pousr]_[A-Za-z0-9_]{20,}\b",
        r"\bAKIA[0-9A-Z]{16}\b",
    )
)

# ``NAME=value`` / ``NAME: value`` assignments whose NAME contains a secret-like word.
KEY_VALUE_SECRET_PATTERN = re.compile(
    r"(?i)\b([A-Z0-9_]*(?:TOKEN|SECRET|API_KEY|PASSWORD|PASS|PWD|AUTH)[A-Z0-9_]*\s*[:=]\s*)([^\s]+)"
)

# ``Authorization: Bearer <token>`` headers.
AUTHORIZATION_BEARER_PATTERN = re.compile(r"(?i)\b(authorization\s*:\s*bearer\s+)([^\s]+)")

# Cheap substring checks used to decide whether the direct patterns are worth running.
DIRECT_SECRET_MARKERS = ("sk-", "github_pat_", "ghp_", "gho_", "ghu_", "ghs_", "ghr_", "AKIA")

# Upper-case substrings used to decide whether the key/value patterns are worth running:
# every keyword combined with every separator spelling, plus the Authorization header.
KEY_VALUE_SECRET_MARKERS = tuple(
    f"{keyword}{separator}"
    for keyword in ("TOKEN", "SECRET", "API_KEY", "PASSWORD", "PASS", "PWD", "AUTH")
    for separator in ("=", " =", ":", " :")
) + ("AUTHORIZATION:", "AUTHORIZATION :")

# Normalized block types that hold model reasoning.
THINKING_BLOCK_TYPES = {"thinking", "reasoning"}

# Normalized block types that describe a tool invocation.
TOOL_CALL_BLOCK_TYPES = {
    "custom-tool-call",
    "function-call",
    "server-tool-call",
    "tool-call",
    "tool-use",
    "toolcall",
    "web-search-call",
}

# Normalized block types that carry the output of a tool invocation.
TOOL_RESULT_BLOCK_TYPES = {
    "custom-tool-call-output",
    "function-call-output",
    "tool-result",
    "toolresult",
}
84
+
85
+
86
@dataclass(frozen=True)
class ChatMessage:
    """One immutable message extracted from a provider history file."""

    # Zero-based position among the messages kept for the session
    # (assigned as ``len(messages)`` at append time).
    ordinal: int
    # Normalized role, e.g. "user", "assistant", "tool" or "tool-call".
    role: str
    # Message text after flattening content blocks and redacting secrets.
    text: str
    # Timestamp string copied verbatim from the history record, if present.
    timestamp: str | None
    # Record/payload type as it appeared in the source file.
    raw_type: str | None = None
    # Model identifier recorded for (or inherited by) this message.
    model: str | None = None
    # Provider of the model, when the history records one.
    model_provider: str | None = None
95
+
96
+
97
@dataclass(frozen=True)
class ChatSession:
    """Immutable summary of one parsed history file and its messages."""

    # Provider name the file was parsed as ("codex", "claude" or "pi").
    provider: str
    # Session identifier taken from the file contents or derived from its name.
    session_id: str
    # Path of the JSONL file the session was read from.
    source_path: Path
    # Working directory recorded in the history, if any.
    cwd: str | None
    # Timestamp strings copied from the history records.
    started_at: str | None
    updated_at: str | None
    # Short title derived from the first usable message, if any.
    title: str | None
    # Messages in file order.
    messages: tuple[ChatMessage, ...]
    # ``st_mtime`` and ``st_size`` of the source file at parse time.
    source_mtime: float
    source_size: int
    # Thread metadata; within this module only ``parse_codex`` populates these.
    parent_session_id: str | None = None
    thread_source: str | None = None
    source_label: str | None = None
112
+
113
+
114
+ def discover_history_files(provider: str, home: Path | None = None) -> list[Path]:
115
+ home = home or Path.home()
116
+ roots = {
117
+ "codex": home / ".codex" / "sessions",
118
+ "claude": home / ".claude" / "projects",
119
+ "pi": home / ".pi" / "agent" / "sessions",
120
+ }
121
+ if provider not in roots:
122
+ raise ValueError(f"unsupported provider: {provider}")
123
+
124
+ root = roots[provider]
125
+ if not root.exists():
126
+ return []
127
+ return sorted(root.rglob("*.jsonl"), key=lambda path: path.stat().st_mtime, reverse=True)
128
+
129
+
130
def parse_history_file(provider: str, path: Path, *, include_tool_results: bool = False) -> ChatSession:
    """Parse ``path`` with the parser that belongs to ``provider``.

    Raises:
        ValueError: If ``provider`` is not "codex", "claude" or "pi".
    """
    parsers = {
        "codex": parse_codex,
        "claude": parse_claude,
        "pi": parse_pi,
    }
    parser = parsers.get(provider)
    if parser is None:
        raise ValueError(f"unsupported provider: {provider}")
    return parser(path, include_tool_results=include_tool_results)
138
+
139
+
140
def parse_codex(path: Path, *, include_tool_results: bool = False) -> ChatSession:
    """Parse a Codex JSONL history file into a ``ChatSession``.

    Three record types are consumed: ``session_meta`` (session identity and
    thread metadata), ``turn_context`` (working directory and current model)
    and ``response_item`` (messages, tool calls and tool results). All other
    records only contribute their timestamp to ``updated_at``.

    Args:
        path: History file to read; its stem is the fallback session id.
        include_tool_results: Forwarded to ``content_to_text`` for message
            payloads.

    Returns:
        The parsed session, including Codex-specific thread metadata.
    """
    session_id = path.stem
    parent_session_id: str | None = None
    thread_source: str | None = None
    source_label: str | None = None
    cwd: str | None = None
    model_provider: str | None = None
    current_model: str | None = None
    started_at: str | None = None
    updated_at: str | None = None
    messages: list[ChatMessage] = []

    for obj in _iter_jsonl(path):
        timestamp = _string(obj.get("timestamp"))
        # Every record, whatever its type, can advance the last-activity time.
        updated_at = _max_time(updated_at, timestamp)

        if obj.get("type") == "session_meta":
            payload = _dict(obj.get("payload"))
            # Each field keeps its previous value when the payload omits it.
            session_id = _string(payload.get("id")) or session_id
            parent_session_id = _string(payload.get("parent_thread_id")) or parent_session_id
            thread_source = _string(payload.get("thread_source")) or thread_source
            source_label = _source_label(payload.get("source")) or source_label
            cwd = _string(payload.get("cwd")) or cwd
            model_provider = _string(payload.get("model_provider")) or model_provider
            # Prefer the payload's own timestamp over the record's.
            started_at = _string(payload.get("timestamp")) or timestamp or started_at
            updated_at = _max_time(updated_at, started_at)
            continue

        if obj.get("type") == "turn_context":
            payload = _dict(obj.get("payload"))
            cwd = _string(payload.get("cwd")) or cwd
            # The model applies to following items that do not name one themselves.
            current_model = _codex_turn_model(payload) or current_model
            continue

        if obj.get("type") != "response_item":
            continue

        payload = _dict(obj.get("payload"))
        payload_type = _normalized_type(_string(payload.get("type")))
        if payload_type == "message":
            role = normalize_role(_string(payload.get("role")) or "unknown")
            # A message made only of tool blocks is re-labelled accordingly.
            role = _role_from_blocks(role, payload.get("content"))
            text = content_to_text(payload.get("content"), include_tool_results=include_tool_results)
        elif payload_type in TOOL_CALL_BLOCK_TYPES:
            role = "tool-call"
            text = _format_tool_call(payload)
        elif payload_type in TOOL_RESULT_BLOCK_TYPES:
            # Top-level tool results are formatted regardless of include_tool_results.
            role = "tool"
            text = _format_tool_result(payload)
        else:
            continue

        # _append_message drops entries whose text is empty.
        _append_message(
            messages,
            role=role,
            text=text,
            timestamp=timestamp,
            raw_type=_string(payload.get("type")),
            model=_string(payload.get("model")) or current_model,
            model_provider=_string(payload.get("model_provider")) or model_provider,
        )

    return _session(
        "codex",
        session_id,
        path,
        cwd,
        started_at,
        updated_at,
        messages,
        parent_session_id=parent_session_id,
        thread_source=thread_source,
        source_label=source_label,
    )
214
+
215
+
216
def parse_claude(path: Path, *, include_tool_results: bool = False) -> ChatSession:
    """Parse a Claude JSONL history file into a ``ChatSession``.

    Only ``user`` and ``assistant`` records are read. The session id comes
    from the records' ``sessionId`` field (the last one seen wins); when no
    record carries one, an id derived from the file path is used instead.
    """
    found_session_id = False
    session_id = path.stem
    cwd: str | None = None
    first_seen: str | None = None
    last_seen: str | None = None
    collected: list[ChatMessage] = []

    for record in _iter_jsonl(path):
        record_type = _string(record.get("type"))
        if record_type != "user" and record_type != "assistant":
            continue

        stamp = _string(record.get("timestamp"))
        if not first_seen:
            first_seen = stamp
        last_seen = _max_time(last_seen, stamp)

        declared_id = _string(record.get("sessionId"))
        if declared_id:
            found_session_id = True
            session_id = declared_id
        cwd = _string(record.get("cwd")) or cwd

        body = _dict(record.get("message"))
        blocks = body.get("content")
        base_role = normalize_role(_string(body.get("role")) or record_type or "unknown")
        _append_message(
            collected,
            role=_role_from_blocks(base_role, blocks),
            text=content_to_text(blocks, include_tool_results=include_tool_results),
            timestamp=stamp,
            raw_type=record_type,
            model=_string(body.get("model")),
            model_provider=_string(body.get("provider")),
        )

    if not found_session_id:
        # No record named the session, so build an id from the path.
        session_id = _fallback_session_id(path)

    return _session("claude", session_id, path, cwd, first_seen, last_seen, collected)
257
+
258
+
259
def parse_pi(path: Path, *, include_tool_results: bool = False) -> ChatSession:
    """Parse a Pi JSONL history file into a ``ChatSession``.

    ``session`` records set the id, working directory and start time,
    ``model_change`` records set the model inherited by later messages, and
    ``message`` records become chat messages. Every record's timestamp can
    advance ``updated_at``.
    """
    session_id = _session_id_from_pi_filename(path)
    cwd: str | None = None
    active_model: str | None = None
    active_provider: str | None = None
    first_seen: str | None = None
    last_seen: str | None = None
    collected: list[ChatMessage] = []

    for entry in _iter_jsonl(path):
        kind = _string(entry.get("type"))
        stamp = _string(entry.get("timestamp"))
        last_seen = _max_time(last_seen, stamp)

        if kind == "session":
            session_id = _string(entry.get("id")) or session_id
            cwd = _string(entry.get("cwd")) or cwd
            first_seen = stamp or first_seen
        elif kind == "model_change":
            active_model = _string(entry.get("modelId")) or active_model
            active_provider = _string(entry.get("provider")) or active_provider
        elif kind == "message":
            body = _dict(entry.get("message"))
            blocks = body.get("content")
            base_role = normalize_role(_string(body.get("role")) or "unknown")
            _append_message(
                collected,
                role=_role_from_blocks(base_role, blocks),
                text=content_to_text(blocks, include_tool_results=include_tool_results),
                timestamp=stamp,
                raw_type=kind,
                model=_string(body.get("model")) or active_model,
                model_provider=_string(body.get("provider")) or active_provider,
            )

    return _session("pi", session_id, path, cwd, first_seen, last_seen, collected)
302
+
303
+
304
def content_to_text(value: Any, *, include_tool_results: bool) -> str:
    """Flatten a content value into redacted, newline-joined text.

    Fragments are gathered by ``_collect_text``, stripped, filtered for
    emptiness, joined with newlines and finally passed through
    ``redact_secrets``.
    """
    fragments: list[str] = []
    _collect_text(value, fragments, include_tool_results=include_tool_results)

    cleaned = []
    for fragment in fragments:
        if fragment and fragment.strip():
            cleaned.append(fragment.strip())
    joined = "\n".join(cleaned).strip()
    return redact_secrets(joined)
308
+
309
+
310
def redact_secrets(text: str) -> str:
    """Return ``text`` with recognizable secrets replaced by ``<redacted>``.

    Two passes are applied:

    1. Well-known token formats (``DIRECT_SECRET_PATTERNS``), run only when
       one of the cheap ``DIRECT_SECRET_MARKERS`` substrings is present.
    2. ``Authorization: Bearer ...`` headers and ``NAME=value`` /
       ``NAME: value`` assignments whose name contains a secret-like word.
       The key and separator are kept; only the value is replaced.

    Args:
        text: Arbitrary message text.

    Returns:
        The text with matched secret values replaced.
    """
    redacted = text
    if any(marker in redacted for marker in DIRECT_SECRET_MARKERS):
        for pattern in DIRECT_SECRET_PATTERNS:
            redacted = pattern.sub("<redacted>", redacted)

    # Both key/value patterns require a ':' or '=' separator, so that is the
    # only prefilter that cannot miss a match. The previous substring check
    # against KEY_VALUE_SECRET_MARKERS was stricter than the regex it guarded
    # and let names such as "GITHUB_TOKEN_V2=..." or "TOKEN  = ..." through
    # unredacted.
    if ":" in redacted or "=" in redacted:
        redacted = AUTHORIZATION_BEARER_PATTERN.sub(r"\1<redacted>", redacted)
        redacted = KEY_VALUE_SECRET_PATTERN.sub(r"\1<redacted>", redacted)
    return redacted
320
+
321
+
322
def is_context_injection_text(text: str | None) -> bool:
    """Tell whether ``text`` begins (after leading whitespace) with a context-injection prefix."""
    if text:
        # str.startswith accepts a tuple and tests each prefix in turn.
        return text.lstrip().startswith(CONTEXT_INJECTION_PREFIXES)
    return False
327
+
328
+
329
def is_title_noise_text(text: str | None) -> bool:
    """Tell whether ``text`` begins (after leading whitespace) with a title-noise prefix."""
    if text:
        # str.startswith accepts a tuple and tests each prefix in turn.
        return text.lstrip().startswith(TITLE_NOISE_PREFIXES)
    return False
334
+
335
+
336
def is_image_reference_text(text: str | None) -> bool:
    """Tell whether ``text`` contains an ``<image>`` tag (opening or closing)."""
    if text:
        return IMAGE_TAG_PATTERN.search(text) is not None
    return False
340
+
341
+
342
def normalize_role(role: str) -> str:
    """Canonicalize a role name.

    The role is stripped, lower-cased and has underscores turned into
    hyphens. Tool-result spellings collapse to "tool", tool-call block types
    collapse to "tool-call", and an empty result becomes "unknown".
    """
    canonical = role.strip().lower().replace("_", "-")
    if canonical == "toolresult" or canonical == "tool-result":
        return "tool"
    if canonical in TOOL_CALL_BLOCK_TYPES:
        return "tool-call"
    if not canonical:
        return "unknown"
    return canonical
349
+
350
+
351
def _collect_text(value: Any, parts: list[str], *, include_tool_results: bool) -> None:
    """Recursively append the text fragments found in ``value`` to ``parts``.

    Strings are appended as-is, lists are walked in order, and dicts are
    treated as content blocks and dispatched on their normalized ``type``.
    Values of any other kind contribute nothing.

    NOTE(review): ``include_tool_results`` is forwarded on every recursive
    call but never consulted here, so tool-result content is collected
    whatever its value. Confirm whether that is intended.
    """
    if value is None:
        return

    if isinstance(value, str):
        parts.append(value)
        return

    if isinstance(value, list):
        for item in value:
            _collect_text(item, parts, include_tool_results=include_tool_results)
        return

    if not isinstance(value, dict):
        return

    block_type = _normalized_type(_string(value.get("type")))
    # Reasoning blocks are left out of the transcript text entirely.
    if block_type in THINKING_BLOCK_TYPES:
        return

    if block_type in TOOL_CALL_BLOCK_TYPES:
        parts.append(_format_tool_call(value))
        return

    if block_type in {"text", "input-text", "output-text"} and isinstance(value.get("text"), str):
        parts.append(value["text"])
        return

    if block_type in TOOL_RESULT_BLOCK_TYPES:
        _collect_text(value.get("content"), parts, include_tool_results=include_tool_results)
        return

    if block_type in {"image", "input-image"}:
        # Images are represented by a placeholder instead of their data.
        mime_type = _string(value.get("mimeType")) or _string(value.get("mime_type"))
        parts.append(f"[image: {mime_type or 'attachment'}]")
        return

    # Unknown block types: prefer a direct "text" field, else recurse into "content".
    if isinstance(value.get("text"), str):
        parts.append(value["text"])
        return

    if "content" in value:
        _collect_text(value.get("content"), parts, include_tool_results=include_tool_results)
394
+
395
+
396
def _role_from_blocks(role: str, content: Any) -> str:
    """Refine ``role`` from the block types found in list-shaped ``content``.

    Thinking blocks are ignored. When every remaining dict block is a tool
    result the role becomes "tool"; when every one is a tool call it becomes
    "tool-call". In all other cases ``role`` is returned unchanged.
    """
    if not (isinstance(content, list) and content):
        return role

    significant = set()
    for block in content:
        if not isinstance(block, dict):
            continue
        kind = _normalized_type(_string(block.get("type")))
        if kind not in THINKING_BLOCK_TYPES:
            significant.add(kind)

    if not significant:
        return role
    if significant.issubset(TOOL_RESULT_BLOCK_TYPES):
        return "tool"
    if significant.issubset(TOOL_CALL_BLOCK_TYPES):
        return "tool-call"
    return role
411
+
412
+
413
def _append_message(
    messages: list[ChatMessage],
    *,
    role: str,
    text: str,
    timestamp: str | None,
    raw_type: str | None,
    model: str | None,
    model_provider: str | None,
) -> None:
    """Append a ``ChatMessage`` to ``messages`` unless ``text`` is empty.

    The new message's ordinal is the list length before the append.
    """
    if text:
        entry = ChatMessage(
            ordinal=len(messages),
            role=role,
            text=text,
            timestamp=timestamp,
            raw_type=raw_type,
            model=model,
            model_provider=model_provider,
        )
        messages.append(entry)
436
+
437
+
438
def _format_tool_call(value: dict[str, Any]) -> str:
    """Render a tool-call block as a short multi-line description.

    The output always starts with ``Tool call: <name>``; the call id, status
    and arguments follow on their own lines when present.
    """
    kind = _normalized_type(_string(value.get("type")))

    # The first non-empty string among the name-like keys wins; otherwise the
    # block type itself serves as the display name.
    name = None
    for key in ("name", "tool_name", "toolName"):
        name = _string(value.get(key))
        if name:
            break
    if not name:
        name = _display_type(kind)

    call_id = None
    for key in ("call_id", "id", "tool_use_id"):
        call_id = _string(value.get(key))
        if call_id:
            break

    status = _string(value.get("status"))
    arguments = _first_present(value, "arguments", "input", "args", "query")

    rendered = [f"Tool call: {name}"]
    if call_id:
        rendered.append(f"Call ID: {call_id}")
    if status:
        rendered.append(f"Status: {status}")
    if arguments is not None and arguments != "":
        rendered.append(f"Arguments:\n{_stringify_value(arguments)}")
    return "\n".join(rendered)
458
+
459
+
460
def _format_tool_result(value: dict[str, Any]) -> str:
    """Render a tool-result block as ``Tool result[: <id>]`` followed by its output.

    The output is flattened with ``content_to_text``; if that yields nothing
    for a non-``None`` output, the redacted stringified output is used.
    """
    call_id = None
    for key in ("call_id", "id", "tool_use_id"):
        call_id = _string(value.get(key))
        if call_id:
            break

    raw_output = _first_present(value, "output", "content", "result", "text")
    body = content_to_text(raw_output, include_tool_results=True)
    if raw_output is not None and not body:
        body = redact_secrets(_stringify_value(raw_output))

    heading = f"Tool result: {call_id}" if call_id else "Tool result"
    if body:
        return f"{heading}\n{body}"
    return heading
473
+
474
+
475
+ def _first_present(value: dict[str, Any], *keys: str) -> Any:
476
+ for key in keys:
477
+ if key in value:
478
+ return value[key]
479
+ return None
480
+
481
+
482
+ def _stringify_value(value: Any) -> str:
483
+ if isinstance(value, str):
484
+ return value
485
+ try:
486
+ return json.dumps(value, indent=2, sort_keys=True)
487
+ except (TypeError, ValueError):
488
+ return str(value)
489
+
490
+
491
+ def _normalized_type(value: str | None) -> str:
492
+ if not value:
493
+ return ""
494
+ normalized = value.strip().replace("_", "-").lower()
495
+ if normalized == "toolcall":
496
+ return "tool-call"
497
+ if normalized == "toolresult":
498
+ return "tool-result"
499
+ return normalized
500
+
501
+
502
+ def _display_type(value: str) -> str:
503
+ return value.replace("-", " ") if value else "tool"
504
+
505
+
506
def _codex_turn_model(payload: dict[str, Any]) -> str | None:
    """Return the model named by a turn-context payload.

    The payload's own ``model`` is preferred; otherwise the value at
    ``collaboration_mode.settings.model`` is returned (``None`` if it is not
    a string).
    """
    nested_settings = _dict(_dict(payload.get("collaboration_mode")).get("settings"))
    return _string(payload.get("model")) or _string(nested_settings.get("model"))
514
+
515
+
516
+ def _iter_jsonl(path: Path) -> Iterable[dict[str, Any]]:
517
+ with path.open("r", encoding="utf-8") as handle:
518
+ for line in handle:
519
+ stripped = line.strip()
520
+ if not stripped:
521
+ continue
522
+ try:
523
+ obj = json.loads(stripped)
524
+ except json.JSONDecodeError:
525
+ continue
526
+ if isinstance(obj, dict):
527
+ yield obj
528
+
529
+
530
def _session(
    provider: str,
    session_id: str,
    path: Path,
    cwd: str | None,
    started_at: str | None,
    updated_at: str | None,
    messages: list[ChatMessage],
    *,
    parent_session_id: str | None = None,
    thread_source: str | None = None,
    source_label: str | None = None,
) -> ChatSession:
    """Assemble a ``ChatSession`` from parsed values plus the file's stat data.

    When ``updated_at`` is missing it falls back to the last message's
    timestamp and then to ``started_at``. The title is derived from the
    messages.
    """
    file_info = path.stat()

    last_activity = updated_at
    if messages and not last_activity:
        last_activity = messages[-1].timestamp
    if not last_activity:
        last_activity = started_at

    return ChatSession(
        provider=provider,
        session_id=session_id,
        source_path=path,
        cwd=cwd,
        started_at=started_at,
        updated_at=last_activity,
        title=_title_from_messages(messages),
        messages=tuple(messages),
        source_mtime=file_info.st_mtime,
        source_size=file_info.st_size,
        parent_session_id=parent_session_id,
        thread_source=thread_source,
        source_label=source_label,
    )
561
+
562
+
563
def _source_label(value: Any) -> str | None:
    """Derive a source label from a session-meta ``source`` value.

    A string is returned unchanged, a dict with a non-empty string
    ``subagent`` entry yields ``subagent:<name>``, and anything else yields
    ``None``.
    """
    if isinstance(value, str):
        return value
    if not isinstance(value, dict):
        return None
    subagent_name = _string(value.get("subagent"))
    return f"subagent:{subagent_name}" if subagent_name else None
571
+
572
+
573
def _title_from_messages(messages: list[ChatMessage]) -> str | None:
    """Pick a session title of at most roughly 96 characters from the messages.

    The first pass considers only user messages; the second pass considers
    every message. Messages whose text is title noise or reduces to nothing
    are passed over. ``None`` is returned when no message qualifies.
    """
    for users_only in (True, False):
        for message in messages:
            if users_only and message.role != "user":
                continue
            if not message.text or is_title_noise_text(message.text):
                continue
            candidate = _title_text(message.text)
            if candidate:
                return _limit_title(candidate, limit=96)
    return None
585
+
586
+
587
def _title_text(text: str) -> str:
    """Collapse ``text`` into a single line suitable for a title.

    Image tags are replaced by spaces and all whitespace runs are collapsed.
    Text that consisted only of image tags becomes "image attachment".
    """
    collapsed = " ".join(IMAGE_TAG_PATTERN.sub(" ", text).split())
    if collapsed:
        return collapsed
    if IMAGE_TAG_PATTERN.search(text):
        return "image attachment"
    return collapsed
593
+
594
+
595
+ def _limit_title(line: str, *, limit: int) -> str:
596
+ if len(line) <= limit:
597
+ return line
598
+ return line[: limit - 1].rstrip() + "..."
599
+
600
+
601
+ def _max_time(left: str | None, right: str | None) -> str | None:
602
+ if not left:
603
+ return right
604
+ if not right:
605
+ return left
606
+ return max(left, right)
607
+
608
+
609
+ def _string(value: Any) -> str | None:
610
+ return value if isinstance(value, str) else None
611
+
612
+
613
+ def _dict(value: Any) -> dict[str, Any]:
614
+ return value if isinstance(value, dict) else {}
615
+
616
+
617
+ def _session_id_from_pi_filename(path: Path) -> str:
618
+ name = path.stem
619
+ if "_" in name:
620
+ return name.rsplit("_", 1)[1]
621
+ return name
622
+
623
+
624
+ def _fallback_session_id(path: Path) -> str:
625
+ digest = hashlib.sha1(str(path).encode("utf-8")).hexdigest()[:10]
626
+ return f"{path.stem}-{digest}"