argus-code 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- argus/__init__.py +3 -0
- argus/adapters/__init__.py +7 -0
- argus/adapters/base.py +108 -0
- argus/adapters/claude_code/__init__.py +5 -0
- argus/adapters/claude_code/adapter.py +63 -0
- argus/adapters/claude_code/discover.py +72 -0
- argus/adapters/claude_code/extract_tool_calls.py +86 -0
- argus/adapters/claude_code/extract_transcript.py +111 -0
- argus/adapters/claude_code/extract_turns.py +69 -0
- argus/adapters/claude_code/history_jsonl.py +138 -0
- argus/adapters/claude_code/ingest_file.py +137 -0
- argus/adapters/claude_code/model.py +11 -0
- argus/adapters/claude_code/schemas.py +77 -0
- argus/adapters/registry.py +30 -0
- argus/cli.py +384 -0
- argus/collector/__init__.py +0 -0
- argus/collector/aggregate.py +102 -0
- argus/collector/first_run.py +189 -0
- argus/collector/pipeline.py +140 -0
- argus/collector/rollup_subagents.py +27 -0
- argus/collector/scheduler.py +89 -0
- argus/collector/search_backfill.py +109 -0
- argus/collector/watcher.py +178 -0
- argus/dashboard-dist/_astro/charts.BIevw6Es.js +1 -0
- argus/dashboard-dist/_astro/format.DxC1NGYT.js +1 -0
- argus/dashboard-dist/_astro/index.astro_astro_type_script_index_0_lang.CgwSARdD.js +24 -0
- argus/dashboard-dist/_astro/index.astro_astro_type_script_index_0_lang.W18SJsr7.js +11 -0
- argus/dashboard-dist/_astro/installCanvasRenderer.D_tC6TXz.js +18 -0
- argus/dashboard-dist/_astro/models.astro_astro_type_script_index_0_lang.BHTHXYHC.js +13 -0
- argus/dashboard-dist/_astro/prompts.astro_astro_type_script_index_0_lang.DfNgiDv9.js +17 -0
- argus/dashboard-dist/_astro/session.astro_astro_type_script_index_0_lang.Dj_bfrIa.js +86 -0
- argus/dashboard-dist/_astro/settings.astro_astro_type_script_index_0_lang.d_a-uvdi.js +24 -0
- argus/dashboard-dist/_astro/tools.astro_astro_type_script_index_0_lang.Dzzau3Yt.js +12 -0
- argus/dashboard-dist/_astro/trends.astro_astro_type_script_index_0_lang.BLLeGRNa.js +5 -0
- argus/dashboard-dist/index.html +2 -0
- argus/dashboard-dist/models/index.html +1 -0
- argus/dashboard-dist/prompts/index.html +18 -0
- argus/dashboard-dist/session/index.html +2 -0
- argus/dashboard-dist/sessions/index.html +1 -0
- argus/dashboard-dist/settings/index.html +8 -0
- argus/dashboard-dist/styles/global.css +307 -0
- argus/dashboard-dist/tools/index.html +1 -0
- argus/dashboard-dist/trends/index.html +1 -0
- argus/detectors/__init__.py +6 -0
- argus/detectors/base.py +34 -0
- argus/detectors/registry.py +20 -0
- argus/detectors/tool_error_rate_spike.py +138 -0
- argus/pricing/2026-05-02.json +24 -0
- argus/pricing/__init__.py +0 -0
- argus/pricing/compute.py +46 -0
- argus/pricing/load.py +45 -0
- argus/pricing/refresh.py +91 -0
- argus/pricing/types.py +21 -0
- argus/scaffold/__init__.py +0 -0
- argus/scaffold/scaffolder.py +45 -0
- argus/scaffold/snapshot.py +73 -0
- argus/scaffold/storage.py +60 -0
- argus/schema/__init__.py +0 -0
- argus/schema/types.py +157 -0
- argus/server/__init__.py +0 -0
- argus/server/api.py +661 -0
- argus/server/app.py +97 -0
- argus/store/__init__.py +0 -0
- argus/store/db.py +103 -0
- argus/store/migrations/__init__.py +0 -0
- argus/store/migrations/inline.py +180 -0
- argus/store/repository.py +778 -0
- argus/templates/default/.claude/agents/code-reviewer.md +27 -0
- argus/templates/default/.claude/agents/security-auditor.md +28 -0
- argus/templates/default/.claude/commands/commit.md +38 -0
- argus/templates/default/.claude/commands/deploy.md +13 -0
- argus/templates/default/.claude/commands/fix-issue.md +15 -0
- argus/templates/default/.claude/commands/pr.md +38 -0
- argus/templates/default/.claude/commands/review.md +14 -0
- argus/templates/default/.claude/rules/api-conventions.md +27 -0
- argus/templates/default/.claude/rules/code-style.md +25 -0
- argus/templates/default/.claude/rules/testing.md +19 -0
- argus/templates/default/.claude/settings.json +28 -0
- argus/templates/default/.claude/skills/example/SKILL.md +11 -0
- argus/templates/default/CLAUDE.md +57 -0
- argus_code-0.2.0.dist-info/METADATA +247 -0
- argus_code-0.2.0.dist-info/RECORD +86 -0
- argus_code-0.2.0.dist-info/WHEEL +4 -0
- argus_code-0.2.0.dist-info/entry_points.txt +2 -0
- argus_code-0.2.0.dist-info/licenses/LICENSE +21 -0
- argus_code-0.2.0.dist-info/licenses/NOTICE +22 -0
argus/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Adapters — one subpackage per coding-agent data source.
|
|
2
|
+
|
|
3
|
+
Importing this module auto-imports every known adapter so their
|
|
4
|
+
``@register`` decorators run and the registry is populated.
|
|
5
|
+
"""
|
|
6
|
+
# Side-effect imports — each adapter's package self-registers via @register.
|
|
7
|
+
from . import claude_code # noqa: F401
|
argus/adapters/base.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Adapter contract.
|
|
2
|
+
|
|
3
|
+
Adapters know how to find and parse one coding-agent's session logs. The
|
|
4
|
+
pipeline, watcher, and server are agent-agnostic — they only see this
|
|
5
|
+
protocol. Anything agent-specific (e.g., Claude Code's ``history.jsonl``
|
|
6
|
+
sidecar, sub-agent rollup) is expressed through the optional extension
|
|
7
|
+
points below; defaults are no-ops so simple adapters stay simple.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
from ..schema.types import RawSessionHeader, RawTurnEvent
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from ..store.repository import Repository
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ParseError(BaseModel):
    """Record of input that an adapter was unable to parse.

    Covers either a single JSONL line or a failure affecting a whole file.
    """

    # File the failure was observed in.
    file: str
    # Byte position tied to the failure.
    # NOTE(review): presumably the start offset of the offending line — confirm.
    byte_offset: int
    # Explanation of why parsing failed.
    reason: str
    # Raw input text, shortened (the cap is applied by the producer, not here).
    raw_line_truncated: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class RawToolCall(BaseModel):
    """Tool call in the shape an adapter emits it.

    The pipeline is responsible for stamping ``session_id`` and a composite
    id onto the record afterwards.
    """

    native_turn_id: str
    turn_index: int
    block_index: int
    tool_name: str
    tool_use_id: str
    is_error: int  # integer flag: 0 or 1
    input_size: int
    subagent_type: str | None
    timestamp: str
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class RawSegment(BaseModel):
    """Transcript segment in the shape an adapter emits it."""

    # Formatted as '{line_uuid}:{block_index}'.
    uid_suffix: str
    timestamp: str
    # One of: 'user', 'assistant', 'thinking', 'tool_result'.
    role: str
    text: str
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AdapterIngestResult(BaseModel):
    """Everything an adapter hands back after ingesting a single file."""

    model_config = {"arbitrary_types_allowed": True}

    header: RawSessionHeader
    turns: list[RawTurnEvent]
    parse_errors: list[ParseError] = []

    # The two fields below are optional: an adapter that has nothing to
    # report leaves them empty. An empty list is read by the pipeline as
    # "no data", not as an instruction to delete rows already stored.
    tool_calls: list[RawToolCall] = []
    segments: list[RawSegment] = []
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@runtime_checkable
class Adapter(Protocol):
    """Structural interface implemented by every coding-agent integration.

    ``agent`` plus the first four methods form the required surface. The
    methods after them are optional hooks whose bodies here are no-ops.

    NOTE(review): Protocol method bodies are inherited only by classes that
    explicitly subclass ``Adapter``; a class that merely matches the shape
    has to define the hooks itself — confirm how adapters are meant to opt in.
    """

    # Agent identifier, e.g. "claude_code", "codex", "openclaw", "hermes".
    agent: str

    def root_path(self) -> Path:
        ...

    def is_present(self) -> bool:
        """Report whether this agent's data exists on the current machine."""
        ...

    def discover_session_files(self) -> list[Path]:
        ...

    def ingest_file(
        self, path: Path, from_offset: int = 0
    ) -> tuple[AdapterIngestResult, int]:
        """Read ``path`` starting at ``from_offset``.

        Returns a ``(result, new_offset)`` pair.
        """
        ...

    # ─── Optional hooks; every default below does nothing ──────────────

    def extra_watch_paths(self) -> list[Path]:
        """List side-channel files to tail (e.g., history.jsonl); none by default."""
        return list()

    def ingest_extra(self, path: Path, repo: "Repository") -> None:
        """React to an event on one of the ``extra_watch_paths()`` entries."""

    def sub_session_files_for(self, session_file: Path) -> list[Path]:
        """List files rolled up into ``session_file`` (e.g., sub-agent JSONLs)."""
        return list()

    def should_skip(self, path: Path) -> bool:
        """Watcher predicate: a true result means events on ``path`` are ignored."""
        return False

    def normalize_model_name(self, raw: str) -> str:
        """Translate a raw model identifier into the pricing-table key form.

        The default is the identity mapping.
        """
        return raw
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""The Claude Code adapter — façade that wires the package together."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from ..base import AdapterIngestResult
|
|
8
|
+
from ..registry import register
|
|
9
|
+
from .discover import discover_session_files, sub_agent_files_for
|
|
10
|
+
from .history_jsonl import ingest_history_file
|
|
11
|
+
from .ingest_file import ingest_claude_code_file
|
|
12
|
+
from .model import canonicalize_claude_model
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from ...store.repository import Repository
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _default_root() -> Path:
    """Return the ``.claude`` directory inside the current user's home."""
    return Path.home().joinpath(".claude")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@register
class ClaudeCodeAdapter:
    """Adapter for Claude Code data rooted at a ``.claude`` directory.

    A thin façade: each method delegates to a helper module of this package.
    """

    agent = "claude_code"

    def __init__(self, root: Path | None = None) -> None:
        """Remember the resolved data root.

        Args:
            root: Directory to read from. A falsy value selects
                ``_default_root()``.
        """
        # strict=False: the directory may not exist; ``is_present`` reports
        # that separately.
        self._root = (root or _default_root()).resolve(strict=False)

    def root_path(self) -> Path:
        """Return the resolved root directory."""
        return self._root

    def is_present(self) -> bool:
        """Return True when the root directory exists."""
        return self._root.exists()

    def discover_session_files(self) -> list[Path]:
        """Return the session files found under the root."""
        return discover_session_files(self._root)

    def ingest_file(
        self, path: Path, from_offset: int = 0
    ) -> tuple[AdapterIngestResult, int]:
        """Ingest ``path`` from ``from_offset``; return ``(result, new_offset)``."""
        return ingest_claude_code_file(path, from_offset)

    # ─── Extension-point overrides ─────────────────────────────────────

    def extra_watch_paths(self) -> list[Path]:
        """Return ``[<root>/history.jsonl]`` if that file exists now, else ``[]``."""
        history = self._root / "history.jsonl"
        return [history] if history.exists() else []

    def ingest_extra(self, path: Path, repo: "Repository") -> None:
        """Ingest ``path`` when it is a ``history.jsonl`` file; otherwise do nothing."""
        # Only the history file is interesting for this adapter.
        if path.name == "history.jsonl":
            ingest_history_file(path, repo)

    def sub_session_files_for(self, session_file: Path) -> list[Path]:
        """Return the sub-agent files that belong to ``session_file``."""
        return sub_agent_files_for(session_file)

    def should_skip(self, path: Path) -> bool:
        """Return True for paths inside a ``subagents`` directory.

        The subagents/ tree is walked as part of the parent's ingest, never
        as standalone sessions.
        """
        # Inspect only the components below the data root, so that a
        # directory that happens to be called "subagents" *above* the root
        # (e.g. in the home path) does not make every file look skippable.
        try:
            parts = path.relative_to(self._root).parts
        except ValueError:
            # ``path`` is not expressed relative to the resolved root
            # (e.g. reached through a symlink): keep the whole-path check.
            parts = path.parts
        return "subagents" in parts

    def normalize_model_name(self, raw: str) -> str:
        """Return the canonical form of the raw model identifier."""
        return canonicalize_claude_model(raw)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Discover Claude Code session JSONL files under ``~/.claude/``.
|
|
2
|
+
|
|
3
|
+
Path-safety rules:
|
|
4
|
+
|
|
5
|
+
- Realpath every candidate; reject anything that doesn't canonicalize
|
|
6
|
+
under the claude root (defends against a hostile symlink pointing at
|
|
7
|
+
``/etc/passwd``).
|
|
8
|
+
- On Windows, lowercase the comparison since the filesystem is
|
|
9
|
+
case-insensitive but Python string equality isn't.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
_IS_WIN = sys.platform == "win32"


def _norm(p: str) -> str:
    """Return ``p`` in the form used for path comparisons.

    The string is lower-cased on Windows and returned untouched elsewhere.
    """
    if _IS_WIN:
        return p.lower()
    return p
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _safe_realpath_under(candidate: Path, canonical_root: Path) -> Path | None:
    """Resolve ``candidate`` and accept it only if it lies within the root.

    Args:
        candidate: Path to canonicalize; it must exist (strict resolution).
        canonical_root: Root directory, expected to be resolved already.

    Returns:
        The resolved path when it equals the root or sits below it;
        ``None`` when resolution fails or the path lands elsewhere.
    """
    try:
        real = candidate.resolve(strict=True)
    except (OSError, RuntimeError):
        return None

    real_key = _norm(str(real))
    root_key = _norm(str(canonical_root))
    # Appending the separator keeps a sibling such as "<root>-other" from
    # passing as a descendant through a plain prefix match.
    inside = real_key == root_key or real_key.startswith(root_key + os.sep)
    return real if inside else None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def discover_session_files(claude_root: Path) -> list[Path]:
    """Find every top-level session JSONL under ``<claude_root>/projects/*/``.

    Only direct children of each project directory are examined, so
    sub-agent files inside ``<sid>/subagents/`` are intentionally excluded;
    they're rolled up under their parent session by the pipeline.

    Args:
        claude_root: The Claude data root (normally ``~/.claude``).

    Returns:
        The ``.jsonl`` files, as originally listed (not resolved), whose real
        path is a regular file under the canonical root. Results are sorted
        per directory so the order is deterministic. An empty list is
        returned when ``projects`` is missing or the root/projects directory
        cannot be read.
    """
    projects_dir = claude_root / "projects"
    if not projects_dir.exists():
        return []
    try:
        canonical_root = claude_root.resolve(strict=True)
        project_dirs = sorted(p for p in projects_dir.iterdir() if p.is_dir())
    except (OSError, RuntimeError):
        return []

    out: list[Path] = []
    for proj in project_dirs:
        try:
            candidates = sorted(f for f in proj.iterdir() if f.suffix == ".jsonl")
        except OSError:
            # One unreadable or vanished project directory must not abort
            # discovery for all the others.
            continue
        for f in candidates:
            safe = _safe_realpath_under(f, canonical_root)
            # ``is_file`` drops directories that merely carry a .jsonl name.
            if safe is not None and safe.is_file():
                out.append(f)
    return out
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def sub_agent_files_for(session_file: Path) -> list[Path]:
    """Return ``<session_dir>/<sid>/subagents/*.jsonl``, sorted.

    ``<sid>`` is the session file's name without its ``.jsonl`` suffix.

    Args:
        session_file: Path of the parent session JSONL.

    Returns:
        Sorted ``.jsonl`` entries of the subagents directory, or ``[]`` when
        that directory does not exist or is not a directory.
    """
    sid = session_file.stem  # filename without .jsonl
    sub = session_file.parent / sid / "subagents"
    # ``is_dir`` rather than ``exists``: a regular file named "subagents"
    # would otherwise make ``iterdir`` raise NotADirectoryError.
    if not sub.is_dir():
        return []
    return sorted(f for f in sub.iterdir() if f.suffix == ".jsonl")
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Extract one RawToolCall per tool_use block."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
from ..base import RawToolCall
|
|
7
|
+
from .schemas import AssistantLine, UserLine
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _safe_json_length(v: object) -> int:
    """Return the character length of ``v`` serialized as compact JSON.

    Compact separators (no whitespace) are used with the aim of keeping
    ``input_size`` consistent with the TS implementation's
    ``JSON.stringify`` output. A value that cannot be serialized yields 0.

    NOTE(review): ``json.dumps`` keeps its default ``ensure_ascii=True``
    here, so non-ASCII characters are measured as ASCII escape sequences,
    whereas JS ``JSON.stringify`` emits them literally — confirm whether
    parity is required for non-ASCII input.
    """
    try:
        serialized = json.dumps(v, separators=(",", ":"))
    except (TypeError, ValueError):
        return 0
    return len(serialized)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def extract_tool_calls(
    assistant_lines: list[AssistantLine],
    user_lines: list[UserLine],
) -> list[RawToolCall]:
    """Produce one RawToolCall for each ``tool_use`` block.

    Args:
        assistant_lines: Assistant lines whose content blocks are scanned
            for ``tool_use`` entries.
        user_lines: User lines whose ``tool_result`` blocks supply the
            error flag for the matching ``tool_use_id``.

    Returns:
        Tool calls in first-seen message order. ``turn_index`` counts
        distinct message ids; ``block_index`` counts ``tool_use`` blocks
        within one message id.
    """
    # Pass 1: tool_use_id -> error flag, taken from tool_result blocks.
    errored: dict[str, bool] = {}
    for user_line in user_lines:
        blocks = user_line.message.content
        if not isinstance(blocks, list):
            continue
        for blk in blocks:
            if not (isinstance(blk, dict) and blk.get("type") == "tool_result"):
                continue
            result_for = blk.get("tool_use_id")
            if isinstance(result_for, str):
                raw_flag = blk.get("is_error", False)
                # Accept both a real boolean and the string form "true".
                errored[result_for] = raw_flag is True or raw_flag == "true"

    # Pass 2: bucket assistant lines per message id. Dict insertion order
    # doubles as the first-seen order of the ids.
    grouped: dict[str, list[AssistantLine]] = {}
    for a_line in assistant_lines:
        grouped.setdefault(a_line.message.id, []).append(a_line)

    # Pass 3: walk the groups and emit the calls.
    calls: list[RawToolCall] = []
    for turn_index, (message_id, members) in enumerate(grouped.items()):
        block_index = 0
        for member in members:
            for blk in member.message.content:
                if blk.get("type") != "tool_use":
                    continue
                payload = blk.get("input") or {}
                tool = blk.get("name") or ""
                call_id = blk.get("id") or ""

                # Only a "Task" call with a string subagent_type names one.
                subagent = None
                if tool == "Task" and isinstance(payload, dict):
                    declared = payload.get("subagent_type")
                    if isinstance(declared, str):
                        subagent = declared

                calls.append(
                    RawToolCall(
                        native_turn_id=message_id,
                        turn_index=turn_index,
                        block_index=block_index,
                        tool_name=tool,
                        tool_use_id=call_id,
                        is_error=1 if errored.get(call_id) else 0,
                        input_size=_safe_json_length(payload),
                        subagent_type=subagent,
                        timestamp=member.timestamp,
                    )
                )
                block_index += 1
    return calls
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Extract searchable transcript segments from a session's lines."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from ..base import RawSegment
|
|
5
|
+
from .schemas import AssistantLine, UserLine
|
|
6
|
+
|
|
7
|
+
# Per-segment byte cap. Tool_result content (a Read of a file, a Bash of a
|
|
8
|
+
# build log) can be enormous; indexing the entire body bloats FTS5 for
|
|
9
|
+
# very little search value.
|
|
10
|
+
SEGMENT_CAP_BYTES = 16 * 1024


def _cap_text(s: str) -> str:
    """Limit ``s`` to ``SEGMENT_CAP_BYTES`` bytes of UTF-8 plus an ellipsis.

    Text that already fits is returned unchanged. Otherwise the encoded
    form is cut at the cap and "…" is appended, so the result can exceed
    the cap by the size of that ellipsis.
    """
    raw = s.encode("utf-8")
    if len(raw) > SEGMENT_CAP_BYTES:
        # A cut at a byte index may split a multi-byte character;
        # errors="ignore" discards the dangling partial sequence.
        head = raw[:SEGMENT_CAP_BYTES].decode("utf-8", errors="ignore")
        return f"{head}…"
    return s
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def extract_transcript_segments(
    assistant_lines: list[AssistantLine],
    user_lines: list[UserLine],
) -> list[RawSegment]:
    """Collect transcript segments from assistant and user lines.

    All assistant segments come first, followed by the user segments.
    Each segment's ``uid_suffix`` is ``'{line uuid}:{block position}'`` and
    its text goes through ``_cap_text``. Text that is empty or only
    whitespace produces no segment.

    Roles emitted: ``assistant`` / ``thinking`` for assistant blocks,
    ``user`` / ``tool_result`` for user blocks.
    """
    segments: list[RawSegment] = []

    def push(source, index: int, role: str, body: str) -> None:
        # Append one capped segment keyed by the source line and block position.
        segments.append(
            RawSegment(
                uid_suffix=f"{source.uuid}:{index}",
                timestamp=source.timestamp,
                role=role,
                text=_cap_text(body),
            )
        )

    for a_line in assistant_lines:
        # The position counts every block, including ones that emit nothing.
        for idx, blk in enumerate(a_line.message.content):
            kind = blk.get("type")
            if kind == "text":
                field, role = "text", "assistant"
            elif kind == "thinking":
                field, role = "thinking", "thinking"
            else:
                continue
            body = blk.get(field)
            if isinstance(body, str) and body.strip():
                push(a_line, idx, role, body)

    for u_line in user_lines:
        content = u_line.message.content

        # Plain-string content is a single segment at position 0.
        if isinstance(content, str):
            if content.strip():
                push(u_line, 0, "user", content)
            continue
        if not isinstance(content, list):
            continue

        for idx, blk in enumerate(content):
            if not isinstance(blk, dict):
                continue
            kind = blk.get("type")
            if kind == "text":
                body = blk.get("text")
                if isinstance(body, str) and body.strip():
                    push(u_line, idx, "user", body)
            elif kind == "tool_result":
                payload = blk.get("content")
                if isinstance(payload, str):
                    merged = payload
                elif isinstance(payload, list):
                    # Join the non-empty string "text" fields of dict parts.
                    texts = [
                        part.get("text") for part in payload if isinstance(part, dict)
                    ]
                    merged = "\n".join(
                        t for t in texts if isinstance(t, str) and t
                    )
                else:
                    merged = ""
                if merged.strip():
                    push(u_line, idx, "tool_result", merged)

    return segments
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Build per-turn records from a session's assistant lines."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from ...schema.types import RawTurnEvent
|
|
5
|
+
from .model import canonicalize_claude_model
|
|
6
|
+
from .schemas import AssistantLine
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def extract_turns(lines: list[AssistantLine]) -> list[RawTurnEvent]:
    """Emit one RawTurnEvent per distinct ``message.id``.

    One message may span several JSONL lines (some Claude Code versions
    write one line per content block), all carrying the same id. Groups
    keep first-seen order. Timestamp, model, usage and metadata are read
    from the first line of a group; the tool-call count covers every line
    in it.
    """
    # Dict insertion order doubles as the first-seen order of message ids.
    grouped: dict[str, list[AssistantLine]] = {}
    for entry in lines:
        grouped.setdefault(entry.message.id, []).append(entry)

    turns: list[RawTurnEvent] = []
    for seq, (message_id, members) in enumerate(grouped.items()):
        head = members[0]
        usage = head.message.usage

        # The cache-write breakdown exists only when cache_creation is set.
        if usage.cache_creation is not None:
            write_5m = usage.cache_creation.ephemeral_5m_input_tokens
            write_1h = usage.cache_creation.ephemeral_1h_input_tokens
        else:
            write_5m = None
            write_1h = None

        tool_use_total = 0
        for member in members:
            for blk in member.message.content:
                if blk.get("type") == "tool_use":
                    tool_use_total += 1

        sidechain = head.isSidechain if head.isSidechain is not None else False
        turns.append(
            RawTurnEvent(
                native_turn_id=message_id,
                sequence=seq,
                timestamp=head.timestamp,
                model=canonicalize_claude_model(head.message.model),
                model_raw=head.message.model,
                fresh_input_tokens=usage.input_tokens,
                output_tokens=usage.output_tokens,
                cache_read_tokens=usage.cache_read_input_tokens,
                cache_write_tokens=usage.cache_creation_input_tokens,
                cache_write_5m_tokens=write_5m,
                cache_write_1h_tokens=write_1h,
                tool_calls_count=tool_use_total,
                metadata={
                    "service_tier": usage.service_tier,
                    "agentId": head.agentId,
                    "attribution_agent": head.attribution_agent,
                    "isSidechain": sidechain,
                },
            )
        )
    return turns
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Ingest ``~/.claude/history.jsonl`` — the global prompt history file.
|
|
2
|
+
|
|
3
|
+
Byte-offset tail like sessions. Partial last line is held back until
|
|
4
|
+
newline. If the file shrinks below the recorded offset (rotation /
|
|
5
|
+
truncation), we reset to 0 and re-ingest.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
|
|
14
|
+
|
|
15
|
+
from ...schema.types import Prompt
|
|
16
|
+
from ...store.repository import normalize_project_path
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from ...store.repository import Repository
|
|
20
|
+
|
|
21
|
+
# Hard cap on the indexed body. 8 KB is past any realistic hand-typed
|
|
22
|
+
# prompt; longer values are almost always a paste-by-mistake.
|
|
23
|
+
DISPLAY_CAP_BYTES = 8 * 1024
|
|
24
|
+
MAX_TICK_BYTES = 64 * 1024 * 1024
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class HistoryLine(BaseModel):
    """Schema for a single line of ``~/.claude/history.jsonl``."""

    # Keys not declared below are kept rather than rejected.
    model_config = ConfigDict(extra="allow")

    display: str
    # Falls back to an empty dict when the key is absent.
    pastedContents: dict[str, Any] = Field(default_factory=dict)
    # Passed on unchanged as ``timestamp_ms`` by ``line_to_prompt``.
    timestamp: int
    project: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class HistoryIngestStats(BaseModel):
    """Counters reported by one ``ingest_history_file`` call."""

    # Prompt rows handed to the repository.
    inserted: int
    # Lines dropped because their display text was blank.
    skipped_empty: int
    # Lines that failed JSON decoding or schema validation.
    parse_errors: int
    # Byte offset at which the next read starts.
    new_offset: int
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def line_to_prompt(line: HistoryLine) -> Prompt | None:
    """Convert a history line into a Prompt row without side effects.

    Returns:
        ``None`` when the display text is blank after stripping; otherwise a
        ``Prompt`` whose ``display`` is cut to roughly ``DISPLAY_CAP_BYTES``
        of UTF-8 (with a trailing "…") when it is longer than that.
    """
    stripped = line.display.strip()
    if not stripped:
        return None

    shown = line.display
    shown_bytes = shown.encode("utf-8")
    if len(shown_bytes) > DISPLAY_CAP_BYTES:
        # A byte-index cut may split a character; errors='ignore' drops the
        # partial sequence so the result is valid text.
        head = shown_bytes[: DISPLAY_CAP_BYTES - 1].decode("utf-8", errors="ignore")
        shown = head + "…"

    try:
        # Compact separators are meant to mirror JS ``JSON.stringify`` so
        # the pasted_chars upper bound matches across stacks.
        pasted_json = json.dumps(line.pastedContents or {}, separators=(",", ":"))
    except (TypeError, ValueError):
        pasted_chars = 0
    else:
        # A length of 2 or less is just "{}", i.e. nothing was pasted.
        pasted_chars = len(pasted_json) if len(pasted_json) > 2 else 0

    return Prompt(
        timestamp_ms=line.timestamp,
        project_path=normalize_project_path(line.project),
        display=shown,
        pasted_chars=pasted_chars,
        is_slash=int(stripped.startswith("/")),
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def ingest_history_file(
    file_path: Path, repo: "Repository"
) -> HistoryIngestStats:
    """Read ``file_path`` from the stored offset and append new prompts.

    At most ``MAX_TICK_BYTES`` are read per call. Only complete lines (those
    ending in a newline) are consumed; a trailing partial line is left for a
    later call. If the file is smaller than the stored offset (rotation or
    truncation), reading restarts from byte 0.

    Args:
        file_path: The history JSONL file to tail.
        repo: Repository providing ``get_file_offset``, ``set_file_offset``
            and ``insert_prompts``.

    Returns:
        Counters for this call, including the offset the next call starts at.

    Raises:
        OSError: If the file cannot be opened or read.

    NOTE(review): a single line longer than ``MAX_TICK_BYTES`` contains no
    newline within the read window, so the offset would never advance past
    it — confirm whether that can occur in practice.
    """
    offset_key = f"history:{file_path}"
    from_offset = repo.get_file_offset(offset_key)

    with open(file_path, "rb") as fh:
        fh.seek(0, 2)
        size = fh.tell()

        # Rotation / truncation: file smaller than the offset → re-ingest.
        truncated = size < from_offset
        if truncated:
            from_offset = 0

        if size <= from_offset:
            if truncated:
                # Persist the reset. Otherwise the stale offset survives and
                # the head of the file is skipped once it regrows past it.
                repo.set_file_offset(offset_key, from_offset)
            return HistoryIngestStats(
                inserted=0, skipped_empty=0, parse_errors=0, new_offset=from_offset
            )

        read_len = min(size - from_offset, MAX_TICK_BYTES)
        fh.seek(from_offset)
        raw = fh.read(read_len)

    # Find the last newline in the raw bytes, not in decoded text. Decoding
    # with errors="replace" turns each invalid byte into U+FFFD, which is
    # three bytes when re-encoded, so a length measured on decoded text
    # would push the offset past the real line boundary.
    complete = raw[: raw.rfind(b"\n") + 1]  # empty when there is no newline
    new_offset = from_offset + len(complete)

    rows: list[Prompt] = []
    parse_errors = 0
    skipped_empty = 0

    for line in complete.decode("utf-8", errors="replace").split("\n"):
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            parse_errors += 1
            continue
        try:
            parsed = HistoryLine.model_validate(obj)
        except ValidationError:
            parse_errors += 1
            continue
        row = line_to_prompt(parsed)
        if row is None:
            skipped_empty += 1
            continue
        rows.append(row)

    repo.insert_prompts(rows)
    repo.set_file_offset(offset_key, new_offset)

    return HistoryIngestStats(
        inserted=len(rows),
        skipped_empty=skipped_empty,
        parse_errors=parse_errors,
        new_offset=new_offset,
    )
|