argus-code 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. argus/__init__.py +3 -0
  2. argus/adapters/__init__.py +7 -0
  3. argus/adapters/base.py +108 -0
  4. argus/adapters/claude_code/__init__.py +5 -0
  5. argus/adapters/claude_code/adapter.py +63 -0
  6. argus/adapters/claude_code/discover.py +72 -0
  7. argus/adapters/claude_code/extract_tool_calls.py +86 -0
  8. argus/adapters/claude_code/extract_transcript.py +111 -0
  9. argus/adapters/claude_code/extract_turns.py +69 -0
  10. argus/adapters/claude_code/history_jsonl.py +138 -0
  11. argus/adapters/claude_code/ingest_file.py +137 -0
  12. argus/adapters/claude_code/model.py +11 -0
  13. argus/adapters/claude_code/schemas.py +77 -0
  14. argus/adapters/registry.py +30 -0
  15. argus/cli.py +384 -0
  16. argus/collector/__init__.py +0 -0
  17. argus/collector/aggregate.py +102 -0
  18. argus/collector/first_run.py +189 -0
  19. argus/collector/pipeline.py +140 -0
  20. argus/collector/rollup_subagents.py +27 -0
  21. argus/collector/scheduler.py +89 -0
  22. argus/collector/search_backfill.py +109 -0
  23. argus/collector/watcher.py +178 -0
  24. argus/dashboard-dist/_astro/charts.BIevw6Es.js +1 -0
  25. argus/dashboard-dist/_astro/format.DxC1NGYT.js +1 -0
  26. argus/dashboard-dist/_astro/index.astro_astro_type_script_index_0_lang.CgwSARdD.js +24 -0
  27. argus/dashboard-dist/_astro/index.astro_astro_type_script_index_0_lang.W18SJsr7.js +11 -0
  28. argus/dashboard-dist/_astro/installCanvasRenderer.D_tC6TXz.js +18 -0
  29. argus/dashboard-dist/_astro/models.astro_astro_type_script_index_0_lang.BHTHXYHC.js +13 -0
  30. argus/dashboard-dist/_astro/prompts.astro_astro_type_script_index_0_lang.DfNgiDv9.js +17 -0
  31. argus/dashboard-dist/_astro/session.astro_astro_type_script_index_0_lang.Dj_bfrIa.js +86 -0
  32. argus/dashboard-dist/_astro/settings.astro_astro_type_script_index_0_lang.d_a-uvdi.js +24 -0
  33. argus/dashboard-dist/_astro/tools.astro_astro_type_script_index_0_lang.Dzzau3Yt.js +12 -0
  34. argus/dashboard-dist/_astro/trends.astro_astro_type_script_index_0_lang.BLLeGRNa.js +5 -0
  35. argus/dashboard-dist/index.html +2 -0
  36. argus/dashboard-dist/models/index.html +1 -0
  37. argus/dashboard-dist/prompts/index.html +18 -0
  38. argus/dashboard-dist/session/index.html +2 -0
  39. argus/dashboard-dist/sessions/index.html +1 -0
  40. argus/dashboard-dist/settings/index.html +8 -0
  41. argus/dashboard-dist/styles/global.css +307 -0
  42. argus/dashboard-dist/tools/index.html +1 -0
  43. argus/dashboard-dist/trends/index.html +1 -0
  44. argus/detectors/__init__.py +6 -0
  45. argus/detectors/base.py +34 -0
  46. argus/detectors/registry.py +20 -0
  47. argus/detectors/tool_error_rate_spike.py +138 -0
  48. argus/pricing/2026-05-02.json +24 -0
  49. argus/pricing/__init__.py +0 -0
  50. argus/pricing/compute.py +46 -0
  51. argus/pricing/load.py +45 -0
  52. argus/pricing/refresh.py +91 -0
  53. argus/pricing/types.py +21 -0
  54. argus/scaffold/__init__.py +0 -0
  55. argus/scaffold/scaffolder.py +45 -0
  56. argus/scaffold/snapshot.py +73 -0
  57. argus/scaffold/storage.py +60 -0
  58. argus/schema/__init__.py +0 -0
  59. argus/schema/types.py +157 -0
  60. argus/server/__init__.py +0 -0
  61. argus/server/api.py +661 -0
  62. argus/server/app.py +97 -0
  63. argus/store/__init__.py +0 -0
  64. argus/store/db.py +103 -0
  65. argus/store/migrations/__init__.py +0 -0
  66. argus/store/migrations/inline.py +180 -0
  67. argus/store/repository.py +778 -0
  68. argus/templates/default/.claude/agents/code-reviewer.md +27 -0
  69. argus/templates/default/.claude/agents/security-auditor.md +28 -0
  70. argus/templates/default/.claude/commands/commit.md +38 -0
  71. argus/templates/default/.claude/commands/deploy.md +13 -0
  72. argus/templates/default/.claude/commands/fix-issue.md +15 -0
  73. argus/templates/default/.claude/commands/pr.md +38 -0
  74. argus/templates/default/.claude/commands/review.md +14 -0
  75. argus/templates/default/.claude/rules/api-conventions.md +27 -0
  76. argus/templates/default/.claude/rules/code-style.md +25 -0
  77. argus/templates/default/.claude/rules/testing.md +19 -0
  78. argus/templates/default/.claude/settings.json +28 -0
  79. argus/templates/default/.claude/skills/example/SKILL.md +11 -0
  80. argus/templates/default/CLAUDE.md +57 -0
  81. argus_code-0.2.0.dist-info/METADATA +247 -0
  82. argus_code-0.2.0.dist-info/RECORD +86 -0
  83. argus_code-0.2.0.dist-info/WHEEL +4 -0
  84. argus_code-0.2.0.dist-info/entry_points.txt +2 -0
  85. argus_code-0.2.0.dist-info/licenses/LICENSE +21 -0
  86. argus_code-0.2.0.dist-info/licenses/NOTICE +22 -0
argus/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Argus — local-first analytics for Claude Code and other coding agents."""
2
+
3
+ __version__ = "0.2.0"
@@ -0,0 +1,7 @@
1
+ """Adapters — one subpackage per coding-agent data source.
2
+
3
+ Importing this module auto-imports every known adapter so their
4
+ ``@register`` decorators run and the registry is populated.
5
+ """
6
+ # Side-effect imports — each adapter's package self-registers via @register.
7
+ from . import claude_code # noqa: F401
argus/adapters/base.py ADDED
@@ -0,0 +1,108 @@
1
+ """Adapter contract.
2
+
3
+ Adapters know how to find and parse one coding-agent's session logs. The
4
+ pipeline, watcher, and server are agent-agnostic — they only see this
5
+ protocol. Anything agent-specific (e.g., Claude Code's ``history.jsonl``
6
+ sidecar, sub-agent rollup) is expressed through the optional extension
7
+ points below; defaults are no-ops so simple adapters stay simple.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
13
+
14
+ from pydantic import BaseModel
15
+
16
+ from ..schema.types import RawSessionHeader, RawTurnEvent
17
+
18
+ if TYPE_CHECKING:
19
+ from ..store.repository import Repository
20
+
21
+
22
class ParseError(BaseModel):
    """Record of input that failed to parse: one JSONL line or an entire file."""

    # File the failure belongs to.
    file: str
    # Presumably the position in ``file`` where the bad input starts — confirm
    # against the code that constructs these records.
    byte_offset: int
    # Explanation of why parsing failed.
    reason: str
    # The offending line; the name implies the producer shortens it first.
    raw_line_truncated: str
29
+
30
+
31
class RawToolCall(BaseModel):
    """Tool call in the adapter's own shape.

    The pipeline adds ``session_id`` and the composite id afterwards.
    """

    # Identifier of the turn (message) the call belongs to.
    native_turn_id: str
    # Position of that turn among the session's turns.
    turn_index: int
    # Position of this call within its turn.
    block_index: int
    tool_name: str
    tool_use_id: str
    # Integer flag rather than bool.
    is_error: int  # 0 | 1
    input_size: int
    # Only set for calls that carry a sub-agent type; otherwise ``None``.
    subagent_type: str | None
    timestamp: str
43
+
44
+
45
class RawSegment(BaseModel):
    """Transcript segment in the adapter's own shape.

    ``uid_suffix`` has the form ``'{line_uuid}:{block_index}'``.
    """

    uid_suffix: str
    timestamp: str
    # Who produced the text.
    role: str  # 'user' | 'assistant' | 'thinking' | 'tool_result'
    text: str
52
+
53
+
54
class AdapterIngestResult(BaseModel):
    """Everything an adapter hands back after ingesting a single file."""

    model_config = {"arbitrary_types_allowed": True}

    header: RawSessionHeader
    turns: list[RawTurnEvent]
    parse_errors: list[ParseError] = []
    # The two lists below are optional: an adapter that does not produce them
    # simply leaves them empty. The pipeline reads an empty list as "no data",
    # never as "delete existing rows".
    tool_calls: list[RawToolCall] = []
    segments: list[RawSegment] = []
66
+
67
+
68
@runtime_checkable
class Adapter(Protocol):
    """Structural interface each coding-agent integration provides.

    The first four methods are required. The remaining ones are optional
    hooks whose bodies here are the no-op defaults.
    """

    agent: str  # e.g., "claude_code", "codex", "openclaw", "hermes"

    def root_path(self) -> Path: ...

    def is_present(self) -> bool:
        """Report whether this agent has any data on the current machine."""
        ...

    def discover_session_files(self) -> list[Path]: ...

    def ingest_file(
        self, path: Path, from_offset: int = 0
    ) -> tuple[AdapterIngestResult, int]:
        """Parse ``path`` starting at ``from_offset``.

        Returns a ``(result, new_offset)`` pair.
        """
        ...

    # ─── Optional extension points (default no-op) ─────────────────────

    def extra_watch_paths(self) -> list[Path]:
        """Extra side-channel files to tail (e.g., history.jsonl); none by default."""
        return []

    def ingest_extra(self, path: Path, repo: "Repository") -> None:
        """React to an event on a path from ``extra_watch_paths()``; no-op by default."""
        return None

    def sub_session_files_for(self, session_file: Path) -> list[Path]:
        """Files rolled up into ``session_file`` (e.g., sub-agent JSONLs); none by default."""
        return []

    def should_skip(self, path: Path) -> bool:
        """Watcher predicate: return true to silently drop events for ``path``."""
        return False

    def normalize_model_name(self, raw: str) -> str:
        """Translate a raw model identifier into the pricing table's key form.

        The default returns ``raw`` unchanged.
        """
        return raw
@@ -0,0 +1,5 @@
1
+ """Claude Code adapter package.
2
+
3
+ Importing triggers the @register side effect on ClaudeCodeAdapter.
4
+ """
5
+ from .adapter import ClaudeCodeAdapter # noqa: F401
@@ -0,0 +1,63 @@
1
+ """The Claude Code adapter — façade that wires the package together."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING
6
+
7
+ from ..base import AdapterIngestResult
8
+ from ..registry import register
9
+ from .discover import discover_session_files, sub_agent_files_for
10
+ from .history_jsonl import ingest_history_file
11
+ from .ingest_file import ingest_claude_code_file
12
+ from .model import canonicalize_claude_model
13
+
14
+ if TYPE_CHECKING:
15
+ from ...store.repository import Repository
16
+
17
+
18
+ def _default_root() -> Path:
19
+ return Path.home() / ".claude"
20
+
21
+
22
@register
class ClaudeCodeAdapter:
    """Claude Code adapter; each method delegates to a module of this package."""

    agent = "claude_code"

    def __init__(self, root: Path | None = None) -> None:
        # Fall back to the default root when none (or a falsy one) is given.
        # ``strict=False`` lets a not-yet-existing directory resolve.
        base = root or _default_root()
        self._root = base.resolve(strict=False)

    def root_path(self) -> Path:
        """Return the resolved root directory this adapter was built with."""
        return self._root

    def is_present(self) -> bool:
        """True when the root directory exists on disk."""
        return self._root.exists()

    def discover_session_files(self) -> list[Path]:
        """Delegate to the package-level discovery for this root."""
        return discover_session_files(self._root)

    def ingest_file(
        self, path: Path, from_offset: int = 0
    ) -> tuple[AdapterIngestResult, int]:
        """Delegate to ``ingest_claude_code_file`` with the same arguments."""
        return ingest_claude_code_file(path, from_offset)

    # ─── Extension-point overrides ─────────────────────────────────────

    def extra_watch_paths(self) -> list[Path]:
        """Return ``[<root>/history.jsonl]`` if that file exists, else ``[]``."""
        history = self._root / "history.jsonl"
        if history.exists():
            return [history]
        return []

    def ingest_extra(self, path: Path, repo: "Repository") -> None:
        """Ingest ``path`` when it is the history file; ignore anything else."""
        if path.name != "history.jsonl":
            return
        ingest_history_file(path, repo)

    def sub_session_files_for(self, session_file: Path) -> list[Path]:
        """Return the sub-agent files belonging to ``session_file``."""
        return sub_agent_files_for(session_file)

    def should_skip(self, path: Path) -> bool:
        """True for any path with a ``subagents`` component.

        Those files are ingested together with their parent session and
        never as sessions of their own.
        """
        return any(part == "subagents" for part in path.parts)

    def normalize_model_name(self, raw: str) -> str:
        """Canonicalize ``raw`` via ``canonicalize_claude_model``."""
        return canonicalize_claude_model(raw)
@@ -0,0 +1,72 @@
1
+ """Discover Claude Code session JSONL files under ``~/.claude/``.
2
+
3
+ Path-safety rules:
4
+
5
+ - Realpath every candidate; reject anything that doesn't canonicalize
6
+ under the claude root (defends against a hostile symlink pointing at
7
+ ``/etc/passwd``).
8
+ - On Windows, lowercase the comparison since the filesystem is
9
+ case-insensitive but Python string equality isn't.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ _IS_WIN = sys.platform == "win32"
18
+
19
+
20
+ def _norm(p: str) -> str:
21
+ return p.lower() if _IS_WIN else p
22
+
23
+
24
+ def _safe_realpath_under(candidate: Path, canonical_root: Path) -> Path | None:
25
+ """Return resolved path if it's the root or a descendant, else None."""
26
+ try:
27
+ resolved = candidate.resolve(strict=True)
28
+ except (OSError, RuntimeError):
29
+ return None
30
+ a = _norm(str(resolved))
31
+ b = _norm(str(canonical_root))
32
+ if a == b:
33
+ return resolved
34
+ if a.startswith(b + os.sep):
35
+ return resolved
36
+ return None
37
+
38
+
39
def discover_session_files(claude_root: Path) -> list[Path]:
    """Find every top-level session JSONL under ``<claude_root>/projects/*/``.

    Sub-agent files inside ``<sid>/subagents/`` are intentionally excluded;
    they're rolled up under their parent session by the pipeline.

    Args:
        claude_root: The Claude Code data directory (e.g. ``~/.claude``).

    Returns:
        The discovered paths in sorted order, as found on disk (not
        resolved). Empty when ``projects`` is missing or not a directory, or
        when the root cannot be resolved.
    """
    projects_dir = claude_root / "projects"
    if not projects_dir.is_dir():
        return []
    try:
        canonical_root = claude_root.resolve(strict=True)
        projects = sorted(projects_dir.iterdir())
    except (OSError, RuntimeError):
        return []

    out: list[Path] = []
    for proj in projects:
        try:
            if not proj.is_dir():
                continue
            entries = sorted(proj.iterdir())
        except OSError:
            # An unreadable or concurrently removed project directory should
            # not hide the sessions of every other project.
            continue
        for f in entries:
            if f.suffix != ".jsonl":
                continue
            safe = _safe_realpath_under(f, canonical_root)
            # Require a regular file: a directory named "*.jsonl" also
            # resolves successfully but cannot be ingested.
            if safe is not None and safe.is_file():
                # Keep the path as found on disk, not the resolved one.
                out.append(f)
    return out
64
+
65
+
66
def sub_agent_files_for(session_file: Path) -> list[Path]:
    """Return ``<session_dir>/<sid>/subagents/*.jsonl``, sorted.

    ``<sid>`` is the session file's name without its ``.jsonl`` suffix.
    Returns an empty list when the ``subagents`` path is missing or is not
    a directory.
    """
    sid = session_file.stem  # filename without .jsonl
    sub = session_file.parent / sid / "subagents"
    # is_dir() rather than exists(): a stray regular file at this path would
    # make iterdir() raise NotADirectoryError.
    if not sub.is_dir():
        return []
    return sorted(f for f in sub.iterdir() if f.suffix == ".jsonl")
@@ -0,0 +1,86 @@
1
+ """Extract one RawToolCall per tool_use block."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+
6
+ from ..base import RawToolCall
7
+ from .schemas import AssistantLine, UserLine
8
+
9
+
10
+ def _safe_json_length(v: object) -> int:
11
+ """Length of JSON-stringified ``v``, matching JS ``JSON.stringify`` byte count."""
12
+ try:
13
+ # Compact separators reproduce JS default JSON.stringify output
14
+ # (no whitespace), so input_size stays consistent with the TS impl.
15
+ return len(json.dumps(v, separators=(",", ":")))
16
+ except (TypeError, ValueError):
17
+ return 0
18
+
19
+
20
def extract_tool_calls(
    assistant_lines: list[AssistantLine],
    user_lines: list[UserLine],
) -> list[RawToolCall]:
    """Produce a RawToolCall for every tool_use block in the assistant lines.

    ``is_error`` is taken from the matching tool_result block (same
    ``tool_use_id``) found in the user lines; calls without a result count
    as non-errors.
    """
    # Pass 1: remember, per tool_use_id, whether its tool_result was an error.
    errored: dict[str, bool] = {}
    for user_line in user_lines:
        blocks = user_line.message.content
        if not isinstance(blocks, list):
            continue
        for blk in blocks:
            if not isinstance(blk, dict) or blk.get("type") != "tool_result":
                continue
            result_for = blk.get("tool_use_id")
            if isinstance(result_for, str):
                raw_flag = blk.get("is_error", False)
                errored[result_for] = raw_flag is True or raw_flag == "true"

    # Pass 2: bucket assistant lines by message.id. Dict insertion order
    # keeps the first-seen order of the ids.
    grouped: dict[str, list[AssistantLine]] = {}
    for line in assistant_lines:
        grouped.setdefault(line.message.id, []).append(line)

    # Pass 3: walk each group; block_index counts tool_use blocks per message.
    calls: list[RawToolCall] = []
    for turn_index, (mid, group) in enumerate(grouped.items()):
        block_index = 0
        for line in group:
            for blk in line.message.content:
                if blk.get("type") != "tool_use":
                    continue
                inp = blk.get("input") or {}
                name = blk.get("name") or ""
                use_id = blk.get("id") or ""
                # Only Task calls carry a sub-agent type, and only if it is a string.
                subagent = None
                if name == "Task" and isinstance(inp, dict):
                    candidate = inp.get("subagent_type")
                    if isinstance(candidate, str):
                        subagent = candidate
                calls.append(
                    RawToolCall(
                        native_turn_id=mid,
                        turn_index=turn_index,
                        block_index=block_index,
                        tool_name=name,
                        tool_use_id=use_id,
                        is_error=int(bool(errored.get(use_id))),
                        input_size=_safe_json_length(inp),
                        subagent_type=subagent,
                        timestamp=line.timestamp,
                    )
                )
                block_index += 1
    return calls
@@ -0,0 +1,111 @@
1
+ """Extract searchable transcript segments from a session's lines."""
2
+ from __future__ import annotations
3
+
4
+ from ..base import RawSegment
5
+ from .schemas import AssistantLine, UserLine
6
+
7
+ # Per-segment byte cap. Tool_result content (a Read of a file, a Bash of a
8
+ # build log) can be enormous; indexing the entire body bloats FTS5 for
9
+ # very little search value.
10
+ SEGMENT_CAP_BYTES = 16 * 1024
11
+
12
+
13
+ def _cap_text(s: str) -> str:
14
+ """Truncate ``s`` to ``SEGMENT_CAP_BYTES`` UTF-8 bytes with an ellipsis."""
15
+ encoded = s.encode("utf-8")
16
+ if len(encoded) <= SEGMENT_CAP_BYTES:
17
+ return s
18
+ # Walk back to a unicode-safe boundary by decoding with errors='ignore'.
19
+ truncated = encoded[:SEGMENT_CAP_BYTES].decode("utf-8", errors="ignore")
20
+ return truncated + "…"
21
+
22
+
23
def extract_transcript_segments(
    assistant_lines: list[AssistantLine],
    user_lines: list[UserLine],
) -> list[RawSegment]:
    """Collect searchable text segments: assistant lines first, then user lines.

    Assistant ``text`` and ``thinking`` blocks, user text (plain string or
    ``text`` blocks) and ``tool_result`` bodies each become one segment.
    Blank text is skipped and every segment is passed through ``_cap_text``.
    The index in ``uid_suffix`` is the block's position in its content list.
    """
    segments: list[RawSegment] = []

    def emit(line, idx: int, role: str, body: str) -> None:
        segments.append(
            RawSegment(
                uid_suffix=f"{line.uuid}:{idx}",
                timestamp=line.timestamp,
                role=role,
                text=_cap_text(body),
            )
        )

    for line in assistant_lines:
        for idx, block in enumerate(line.message.content):
            kind = block.get("type")
            if kind == "text":
                role, body = "assistant", block.get("text")
            elif kind == "thinking":
                role, body = "thinking", block.get("thinking")
            else:
                continue
            if isinstance(body, str) and body.strip():
                emit(line, idx, role, body)

    for line in user_lines:
        content = line.message.content
        if isinstance(content, str):
            # Plain-string message: a single segment at index 0.
            if content.strip():
                emit(line, 0, "user", content)
            continue
        if not isinstance(content, list):
            continue
        for idx, block in enumerate(content):
            if not isinstance(block, dict):
                continue
            kind = block.get("type")
            if kind == "text" and isinstance(block.get("text"), str) and block["text"].strip():
                emit(line, idx, "user", block["text"])
            elif kind == "tool_result":
                payload = block.get("content")
                if isinstance(payload, str):
                    combined = payload
                elif isinstance(payload, list):
                    # Join the non-empty string "text" fields of the sub-blocks.
                    texts = [
                        piece.get("text")
                        for piece in payload
                        if isinstance(piece, dict) and isinstance(piece.get("text"), str)
                    ]
                    combined = "\n".join(t for t in texts if t)
                else:
                    combined = ""
                if combined.strip():
                    emit(line, idx, "tool_result", combined)

    return segments
@@ -0,0 +1,69 @@
1
+ """Build per-turn records from a session's assistant lines."""
2
+ from __future__ import annotations
3
+
4
+ from ...schema.types import RawTurnEvent
5
+ from .model import canonicalize_claude_model
6
+ from .schemas import AssistantLine
7
+
8
+
9
def extract_turns(lines: list[AssistantLine]) -> list[RawTurnEvent]:
    """Collapse assistant lines into one RawTurnEvent per ``message.id``.

    Several JSONL lines can share a message id (some Claude Code versions
    write one line per content block). Turns come out in first-seen order.
    Timestamp, model, usage and metadata are read from the first line of
    each group; tool_use blocks are counted across all of its lines.
    """
    # Dict insertion order keeps the first-seen order of message ids.
    grouped: dict[str, list[AssistantLine]] = {}
    for line in lines:
        grouped.setdefault(line.message.id, []).append(line)

    turns: list[RawTurnEvent] = []
    for seq, (mid, group) in enumerate(grouped.items()):
        head = group[0]
        usage = head.message.usage

        creation = usage.cache_creation
        if creation is None:
            cache_5m = None
            cache_1h = None
        else:
            cache_5m = creation.ephemeral_5m_input_tokens
            cache_1h = creation.ephemeral_1h_input_tokens

        tool_use_total = 0
        for member in group:
            for blk in member.message.content:
                if blk.get("type") == "tool_use":
                    tool_use_total += 1

        metadata = {
            "service_tier": usage.service_tier,
            "agentId": head.agentId,
            "attribution_agent": head.attribution_agent,
            # A missing flag is recorded as False.
            "isSidechain": False if head.isSidechain is None else head.isSidechain,
        }
        turns.append(
            RawTurnEvent(
                native_turn_id=mid,
                sequence=seq,
                timestamp=head.timestamp,
                model=canonicalize_claude_model(head.message.model),
                model_raw=head.message.model,
                fresh_input_tokens=usage.input_tokens,
                output_tokens=usage.output_tokens,
                cache_read_tokens=usage.cache_read_input_tokens,
                cache_write_tokens=usage.cache_creation_input_tokens,
                cache_write_5m_tokens=cache_5m,
                cache_write_1h_tokens=cache_1h,
                tool_calls_count=tool_use_total,
                metadata=metadata,
            )
        )
    return turns
@@ -0,0 +1,138 @@
1
+ """Ingest ``~/.claude/history.jsonl`` — the global prompt history file.
2
+
3
+ Byte-offset tail like sessions. Partial last line is held back until
4
+ newline. If the file shrinks below the recorded offset (rotation /
5
+ truncation), we reset to 0 and re-ingest.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, Any
12
+
13
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError
14
+
15
+ from ...schema.types import Prompt
16
+ from ...store.repository import normalize_project_path
17
+
18
+ if TYPE_CHECKING:
19
+ from ...store.repository import Repository
20
+
21
+ # Hard cap on the indexed body. 8 KB is past any realistic hand-typed
22
+ # prompt; longer values are almost always a paste-by-mistake.
23
+ DISPLAY_CAP_BYTES = 8 * 1024
24
+ MAX_TICK_BYTES = 64 * 1024 * 1024
25
+
26
+
27
class HistoryLine(BaseModel):
    """Schema for a single record of ``~/.claude/history.jsonl``."""

    # Keys not declared below are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    # The prompt text.
    display: str
    # Defaults to an empty dict when the key is absent.
    pastedContents: dict[str, Any] = Field(default_factory=dict)
    # Stored downstream as ``timestamp_ms`` — presumably epoch milliseconds.
    timestamp: int
    # Project path the prompt was entered in (normalized before storage).
    project: str
36
+
37
+
38
class HistoryIngestStats(BaseModel):
    """Counters describing one pass over the history file."""

    # Prompt rows handed to the repository.
    inserted: int
    # Lines dropped because their display text was blank.
    skipped_empty: int
    # Lines that failed JSON decoding or schema validation.
    parse_errors: int
    # Byte offset to resume from on the next pass.
    new_offset: int
43
+
44
+
45
def line_to_prompt(line: HistoryLine) -> Prompt | None:
    """Pure transformation: history line → Prompt row, or None if skipped.

    A line whose display text is blank is skipped. Display text longer than
    ``DISPLAY_CAP_BYTES`` (UTF-8) is cut and terminated with "…" so the
    stored value, ellipsis included, never exceeds the cap.
    """
    trimmed = line.display.strip()
    if not trimmed:
        return None

    display = line.display
    encoded = display.encode("utf-8")
    if len(encoded) > DISPLAY_CAP_BYTES:
        ellipsis = "…"
        # The ellipsis is 3 bytes in UTF-8; reserve all of them so the cap is
        # a true upper bound on what is stored.
        budget = DISPLAY_CAP_BYTES - len(ellipsis.encode("utf-8"))
        # errors='ignore' drops a multi-byte character split by the byte slice.
        display = encoded[:budget].decode("utf-8", errors="ignore") + ellipsis

    try:
        # Compact separators and ensure_ascii=False reproduce what JS
        # ``JSON.stringify`` emits: no whitespace, non-ASCII kept verbatim.
        pasted_json = json.dumps(
            line.pastedContents or {}, separators=(",", ":"), ensure_ascii=False
        )
        # UTF-16 code units equal a JS string's ``.length``.
        # NOTE(review): assumes the other stack measures
        # ``JSON.stringify(...).length`` — confirm.
        pasted_chars = len(pasted_json.encode("utf-16-le", "surrogatepass")) // 2
        if pasted_chars <= 2:  # "{}"
            pasted_chars = 0
    except (TypeError, ValueError):
        pasted_chars = 0

    return Prompt(
        timestamp_ms=line.timestamp,
        project_path=normalize_project_path(line.project),
        display=display,
        pasted_chars=pasted_chars,
        is_slash=1 if trimmed.startswith("/") else 0,
    )
75
+
76
+
77
def ingest_history_file(
    file_path: Path, repo: "Repository"
) -> HistoryIngestStats:
    """Read ``file_path`` from the stored offset and append new prompts.

    At most ``MAX_TICK_BYTES`` are read per call. Only complete lines
    (terminated by a newline) are consumed; a trailing partial line is left
    for the next call. If the file is smaller than the stored offset
    (rotation / truncation), reading restarts from the beginning.

    Args:
        file_path: The history JSONL file.
        repo: Repository that stores the per-file offset and the prompt rows.

    Returns:
        Counts of inserted, skipped and unparseable lines, plus the byte
        offset to resume from.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    offset_key = f"history:{file_path}"
    from_offset = repo.get_file_offset(offset_key)

    with open(file_path, "rb") as fh:
        fh.seek(0, 2)
        size = fh.tell()

        # Rotation / truncation: file smaller than the offset → re-ingest.
        if size < from_offset:
            from_offset = 0

        if size <= from_offset:
            return HistoryIngestStats(
                inserted=0, skipped_empty=0, parse_errors=0, new_offset=from_offset
            )

        fh.seek(from_offset)
        raw = fh.read(min(size - from_offset, MAX_TICK_BYTES))

    # Find the line boundary on the raw bytes. Measuring a decoded-and-
    # re-encoded string instead would miscount whenever the chunk holds
    # invalid UTF-8: each bad byte decodes to U+FFFD, which re-encodes to
    # three bytes and pushes the stored offset past the real position.
    # NOTE(review): a single line longer than MAX_TICK_BYTES never yields a
    # newline in the chunk, so the offset would not advance.
    last_nl = raw.rfind(b"\n")
    consumable = raw[: last_nl + 1]  # empty when no newline was found
    new_offset = from_offset + len(consumable)

    rows: list[Prompt] = []
    parse_errors = 0
    skipped_empty = 0

    for raw_line in consumable.split(b"\n"):
        if not raw_line:
            continue
        try:
            # Decode per line; undecodable bytes become U+FFFD instead of
            # failing the line outright.
            obj = json.loads(raw_line.decode("utf-8", errors="replace"))
        except json.JSONDecodeError:
            parse_errors += 1
            continue
        try:
            parsed = HistoryLine.model_validate(obj)
        except ValidationError:
            parse_errors += 1
            continue
        row = line_to_prompt(parsed)
        if row is None:
            skipped_empty += 1
            continue
        rows.append(row)

    repo.insert_prompts(rows)
    repo.set_file_offset(offset_key, new_offset)

    return HistoryIngestStats(
        inserted=len(rows),
        skipped_empty=skipped_empty,
        parse_errors=parse_errors,
        new_offset=new_offset,
    )