coding-agent-roi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ """Collector registry.
2
+
3
+ New collectors register here so the CLI and config can refer to them by name.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from agent_roi.collectors.base import Collector
9
+ from agent_roi.collectors.claude_code import ClaudeCodeCollector
10
+ from agent_roi.collectors.codex import CodexCollector
11
+ from agent_roi.collectors.copilot import CopilotCollector
12
+ from agent_roi.collectors.gemini import GeminiCollector
13
+
14
# Maps each collector's stable ``name`` to its class, in registration order, so
# collectors can be looked up by name.
_REGISTRY: dict[str, type[Collector]] = {
    collector_cls.name: collector_cls
    for collector_cls in (
        ClaudeCodeCollector,
        CodexCollector,
        CopilotCollector,
        GeminiCollector,
    )
}
20
+
21
+
22
def get_collectors(names: list[str]) -> list[Collector]:
    """Instantiate the named collectors, skipping unknown names.

    Collectors are returned in the order their names appear in ``names``. A
    name that is not present in the registry is ignored rather than raising.
    """
    collectors: list[Collector] = []
    for name in names:
        collector_cls = _REGISTRY.get(name)
        if collector_cls is not None:
            collectors.append(collector_cls())
    return collectors
25
+
26
+
27
def all_collector_names() -> list[str]:
    """Return the name of every registered collector, in registration order."""
    return [*_REGISTRY]
29
+
30
+
31
+ __all__ = ["Collector", "get_collectors", "all_collector_names"]  # names exported by a star-import of this module
@@ -0,0 +1,49 @@
1
+ """Collector interface.
2
+
3
+ A collector knows how to find one tool's local logs and turn them into a stream
4
+ of normalized :class:`Interaction` objects. Collectors must be read-only and
5
+ idempotent: running ingest twice should not double-count, which is enforced
6
+ upstream by the stable ``Interaction.id``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from abc import ABC, abstractmethod
12
+ from collections.abc import Iterator
13
+ from pathlib import Path
14
+
15
+ from agent_roi.core.models import Interaction, Tool
16
+
17
+
18
class Collector(ABC):
    """Abstract base for collectors that read one tool's local logs.

    Subclasses set the ``tool`` and ``name`` class attributes and implement
    :meth:`is_available` and :meth:`collect`. The remaining methods are
    optional diagnostics hooks with neutral defaults.
    """

    #: Which tool this collector produces interactions for.
    tool: Tool

    #: Stable name used in config's ``collectors.enabled`` list.
    name: str

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether this tool's logs are present on the current machine."""

    @abstractmethod
    def collect(self) -> Iterator[Interaction]:
        """Parse the local logs and yield one normalized interaction at a time."""

    def note(self) -> str:
        """Return an optional human-readable diagnostics hint (e.g. why empty).

        The default is the empty string, meaning "nothing to report".
        """
        return ""

    def count_files(self) -> int:
        """Return how many log files this collector can see (cheap; no parsing).

        The default is ``0``; collectors override it to report a real count.
        """
        return 0

    def search_paths(self) -> list[Path]:
        """Return the directories this collector looked in, for diagnostics.

        The default is an empty list; collectors override it to expose where
        they searched so users can see why a tool was or wasn't detected.
        """
        return []
@@ -0,0 +1,165 @@
1
+ """Collector for Claude Code.
2
+
3
+ Claude Code stores one JSONL file per session under
4
+ ``~/.claude/projects/<encoded-project-path>/<session-id>.jsonl``. Each line is a
5
+ JSON object; assistant turns carry a ``message.usage`` block with token counts.
6
+ We read these files read-only and never modify them.
7
+
8
+ We emit one :class:`Interaction` per assistant turn (those are the ones with
9
+ token usage), but we build each interaction's ``summary`` from both the user's
10
+ request and the assistant's reply. Many assistant turns are pure tool calls with
11
+ no prose, so using assistant text alone leaves most summaries empty — which makes
12
+ topic discovery impossible. Carrying the preceding user message keeps the topic
13
+ signal intact.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ from collections.abc import Iterator
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+ from typing import Any
23
+
24
+ from agent_roi.collectors.base import Collector
25
+ from agent_roi.core.models import Interaction, Tool
26
+ from agent_roi.core.platform import find_tool_dirs
27
+ from agent_roi.core.project import project_for
28
+
29
+ _SUMMARY_MAX = 600  # maximum number of characters kept in an interaction summary
30
+
31
+
32
class ClaudeCodeCollector(Collector):
    """Collect interactions from Claude Code's per-session JSONL logs.

    Every ``*.jsonl`` file found under the configured roots is treated as one
    session. One :class:`Interaction` is emitted per record whose ``message``
    carries a ``usage`` block; its summary combines the most recent user text
    with the text of that record. Files are only ever read, never modified.
    """

    tool = Tool.CLAUDE_CODE
    name = "claude_code"

    def __init__(self, roots: list[Path] | None = None) -> None:
        """Create a collector.

        Args:
            roots: Directories to scan. When ``None`` they are discovered with
                ``find_tool_dirs(".claude", "projects")``.
        """
        # Supports multiple roots so WSL can also read Windows-side logs.
        self.roots = roots if roots is not None else find_tool_dirs(".claude", "projects")

    def is_available(self) -> bool:
        """Return True when at least one log root is configured."""
        return bool(self.roots)

    def search_paths(self) -> list[Path]:
        """Return a copy of the roots this collector scans."""
        return list(self.roots)

    def count_files(self) -> int:
        """Count the ``*.jsonl`` files under all roots without parsing them."""
        return sum(1 for root in self.roots for _ in root.rglob("*.jsonl"))

    def collect(self) -> Iterator[Interaction]:
        """Yield interactions from every ``*.jsonl`` file under every root."""
        for root in self.roots:
            for jsonl in root.rglob("*.jsonl"):
                yield from self._parse_file(jsonl)

    def _parse_file(self, path: Path) -> Iterator[Interaction]:
        """Yield the interactions found in one session file.

        Unreadable files and malformed lines are skipped so that one bad
        session cannot abort the whole ingest.
        """
        session_id = path.stem
        try:
            # errors="replace": a stray undecodable byte (for example a file
            # that is still being written) must not discard the whole session.
            raw = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return

        last_user_text = ""
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which JSON allows unescaped inside strings, and would
        # therefore cut such a record into unparseable fragments.
        for line in raw.split("\n"):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                # Valid JSON, but not the object shape this parser expects.
                continue

            message = record.get("message")
            if not isinstance(message, dict):
                continue

            role = message.get("role") or record.get("type")
            text = _text_from_content(message.get("content"))

            if role == "user":
                # Remember the latest user intent to attach to the next reply.
                if text:
                    last_user_text = text
                continue

            usage = message.get("usage")
            if not isinstance(usage, dict):
                continue

            interaction = self._to_interaction(
                record, message, usage, session_id, last_user_text, text
            )
            if interaction is not None:
                yield interaction

    @staticmethod
    def _token_count(usage: dict[str, Any], key: str) -> int:
        """Return ``usage[key]`` as an int, treating missing/null/invalid as 0."""
        value = usage.get(key)
        if value is None:
            return 0
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def _to_interaction(
        self,
        record: dict[str, Any],
        message: dict[str, Any],
        usage: dict[str, Any],
        session_id: str,
        user_text: str,
        assistant_text: str,
    ) -> Interaction | None:
        """Build an :class:`Interaction` from one usage-bearing record.

        Returns ``None`` when the record has neither ``message.id`` nor
        ``uuid``, because no stable interaction id can be derived.
        """
        # Local import: this module only imports ``datetime`` at file level.
        from datetime import timezone

        msg_id = message.get("id") or record.get("uuid")
        if not msg_id:
            return None

        ts_raw = record.get("timestamp")
        timestamp = None
        if isinstance(ts_raw, str):
            try:
                timestamp = datetime.fromisoformat(ts_raw.replace("Z", "+00:00"))
            except ValueError:
                timestamp = None
        if timestamp is None:
            # Fall back to an *aware* "now": timestamps parsed from "...Z"
            # strings are timezone-aware, and naive/aware datetimes cannot be
            # compared with each other.
            timestamp = datetime.now(tz=timezone.utc)

        # ``or`` keeps a JSON null from turning into the string "None".
        cwd = str(record.get("cwd") or "")
        return Interaction(
            id=str(msg_id),
            tool=self.tool,
            session_id=session_id,
            timestamp=timestamp,
            model=str(message.get("model") or "unknown"),
            input_tokens=self._token_count(usage, "input_tokens"),
            output_tokens=self._token_count(usage, "output_tokens"),
            cache_read_tokens=self._token_count(usage, "cache_read_input_tokens"),
            cache_write_tokens=self._token_count(usage, "cache_creation_input_tokens"),
            cwd=cwd,
            project=project_for(cwd),
            summary=_combine_summary(user_text, assistant_text),
        )
132
+
133
+
134
def _combine_summary(user_text: str, assistant_text: str) -> str:
    """Join the non-empty texts, user request first, capped at ``_SUMMARY_MAX``.

    Empty inputs are dropped before joining, so no stray separator appears
    when only one side has text.
    """
    combined = " ".join(filter(None, (user_text, assistant_text)))
    return combined[:_SUMMARY_MAX]
138
+
139
+
140
+ def _text_from_content(content: object) -> str:
141
+ """Extract human-readable text from a Claude message ``content`` field.
142
+
143
+ Content may be a plain string or a list of typed blocks. We keep prose
144
+ (``text``) and tool *names* (``tool_use``) as topic signal, but deliberately
145
+ skip ``tool_result`` bodies: those carry command output (e.g. ``ls -l``
146
+ listings, file dumps) that pollutes topic labels with noise like permission
147
+ bits and paths rather than describing the task.
148
+ """
149
+ if isinstance(content, str):
150
+ return content.strip()
151
+ if not isinstance(content, list):
152
+ return ""
153
+
154
+ pieces: list[str] = []
155
+ for block in content:
156
+ if not isinstance(block, dict):
157
+ continue
158
+ btype = block.get("type")
159
+ if btype == "text":
160
+ pieces.append(str(block.get("text", "")))
161
+ elif btype == "tool_use":
162
+ name = block.get("name")
163
+ if name:
164
+ pieces.append(str(name))
165
+ return " ".join(p for p in pieces if p).strip()
@@ -0,0 +1,157 @@
1
+ """Collector for OpenAI Codex CLI.
2
+
3
+ Codex CLI stores one rollout log per session as JSONL under
4
+ ``~/.codex/sessions/<YYYY>/<MM>/<DD>/rollout-*.jsonl``. Each line is a typed
5
+ record. The shapes we care about:
6
+
7
+ - ``session_meta`` — session id and start time.
8
+ - ``turn_context`` — carries the active ``model`` for subsequent turns.
9
+ - ``event_msg`` with ``payload.type == "token_count"`` — per-turn token usage in
10
+ ``info.last_token_usage`` (input/cached/output/reasoning tokens).
11
+ - ``event_msg`` with ``payload.type in {"user_message","agent_message"}`` — text
12
+ we keep a short snippet of for the classifier.
13
+
14
+ We emit one :class:`Interaction` per ``token_count`` event, using
15
+ ``last_token_usage`` (the delta for that turn) so usage isn't double-counted from
16
+ the running ``total_token_usage``. The parser is defensive: unknown records are
17
+ skipped rather than failing the whole ingest.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ from collections.abc import Iterator
24
+ from datetime import datetime
25
+ from pathlib import Path
26
+ from typing import Any
27
+
28
+ from agent_roi.collectors.base import Collector
29
+ from agent_roi.core.models import Interaction, Tool
30
+ from agent_roi.core.platform import find_tool_dirs
31
+ from agent_roi.core.project import project_for
32
+
33
+
34
class CodexCollector(Collector):
    """Collect interactions from Codex CLI rollout logs.

    Each ``rollout-*.jsonl`` file under the configured roots is one session.
    One :class:`Interaction` is emitted per ``token_count`` event that carries
    usage; the model and working directory come from the most recent
    ``turn_context`` / ``session_meta`` records seen before it.
    """

    tool = Tool.CODEX
    name = "codex"

    # Single glob shared by count_files() and collect(), so the diagnostics
    # count always describes exactly the files that ingest will parse.
    _LOG_GLOB = "rollout-*.jsonl"

    def __init__(self, roots: list[Path] | None = None) -> None:
        """Create a collector.

        Args:
            roots: Directories to scan. When ``None`` they are discovered with
                ``find_tool_dirs(".codex", "sessions")``.
        """
        self.roots = roots if roots is not None else find_tool_dirs(".codex", "sessions")

    def is_available(self) -> bool:
        """Return True when at least one log root is configured."""
        return bool(self.roots)

    def search_paths(self) -> list[Path]:
        """Return a copy of the roots this collector scans."""
        return list(self.roots)

    def count_files(self) -> int:
        """Count the rollout files under all roots without parsing them."""
        return sum(1 for root in self.roots for _ in root.rglob(self._LOG_GLOB))

    def collect(self) -> Iterator[Interaction]:
        """Yield interactions from every rollout file under every root."""
        for root in self.roots:
            for jsonl in root.rglob(self._LOG_GLOB):
                yield from self._parse_file(jsonl)

    @staticmethod
    def _token_count(usage: dict[str, Any], key: str) -> int:
        """Return ``usage[key]`` as an int, treating missing/null/invalid as 0."""
        value = usage.get(key)
        if value is None:
            return 0
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def _parse_file(self, path: Path) -> Iterator[Interaction]:
        """Yield the interactions found in one rollout file.

        Unreadable files, malformed lines and unknown record types are skipped
        rather than failing the whole ingest.
        """
        # Session id is the last "-"-separated piece of the filename stem (the
        # whole stem when it has no dash).
        # NOTE(review): for a dashed UUID suffix this keeps only its final
        # group. It is left unchanged because interaction ids are built from it.
        session_id = path.stem.rsplit("-", 1)[-1]

        model = "unknown"
        cwd = ""
        last_user = ""
        last_agent = ""
        seq = 0

        try:
            # errors="replace": a stray undecodable byte must not discard the
            # whole session.
            raw = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return

        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which JSON allows unescaped inside strings.
        for line in raw.split("\n"):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                # Valid JSON, but not the object shape this parser expects.
                continue

            rtype = record.get("type")
            payload = record.get("payload")
            payload = payload if isinstance(payload, dict) else {}

            if rtype == "turn_context":
                model = str(payload.get("model") or model)
                cwd = str(payload.get("cwd") or cwd)
                continue

            if rtype == "session_meta":
                cwd = str(payload.get("cwd") or cwd)
                continue

            if rtype != "event_msg":
                continue

            ptype = payload.get("type")
            if ptype == "user_message":
                text = _event_text(payload)
                if text:
                    last_user = text
            elif ptype == "agent_message":
                text = _event_text(payload)
                if text:
                    last_agent = text
            elif ptype == "token_count":
                usage = _last_usage(payload)
                if usage is None:
                    continue
                # The per-file sequence number makes the id stable across
                # repeated ingests of the same file.
                seq += 1
                yield Interaction(
                    id=f"codex:{session_id}:{seq}",
                    tool=self.tool,
                    session_id=session_id,
                    timestamp=_parse_ts(record.get("timestamp")),
                    model=_normalize_model(model),
                    input_tokens=self._token_count(usage, "input_tokens"),
                    # Reasoning tokens are folded into the output count.
                    output_tokens=(
                        self._token_count(usage, "output_tokens")
                        + self._token_count(usage, "reasoning_output_tokens")
                    ),
                    cache_read_tokens=self._token_count(usage, "cached_input_tokens"),
                    cwd=cwd,
                    project=project_for(cwd),
                    summary=_combine(last_user, last_agent),
                )
125
+
126
+
127
+ def _event_text(payload: dict[str, Any]) -> str:
128
+ text = payload.get("message") or payload.get("text") or ""
129
+ return text.strip() if isinstance(text, str) else ""
130
+
131
+
132
+ def _combine(user_text: str, agent_text: str) -> str:
133
+ parts = [p for p in (user_text, agent_text) if p]
134
+ return " ".join(parts)[:600]
135
+
136
+
137
+ def _last_usage(payload: dict[str, Any]) -> dict[str, Any] | None:
138
+ """Pull the per-turn token usage from a token_count event payload."""
139
+ info = payload.get("info")
140
+ if not isinstance(info, dict):
141
+ return None
142
+ usage = info.get("last_token_usage") or info.get("total_token_usage")
143
+ return usage if isinstance(usage, dict) else None
144
+
145
+
146
+ def _normalize_model(model: str) -> str:
147
+ # Codex reports e.g. "gpt-5.5"; normalize dots to dashes for pricing lookup.
148
+ return model.replace(".", "-")
149
+
150
+
151
+ def _parse_ts(raw: object) -> datetime:
152
+ if isinstance(raw, str):
153
+ try:
154
+ return datetime.fromisoformat(raw.replace("Z", "+00:00"))
155
+ except ValueError:
156
+ pass
157
+ return datetime.now()
@@ -0,0 +1,210 @@
1
+ """Collector for GitHub Copilot Chat in VS Code.
2
+
3
+ Copilot stores chat sessions under each VS Code workspace's
4
+ ``workspaceStorage/<id>/chatSessions/*.json(l)``. Crucially, these logs record
5
+ the conversation text and the model id, but **not** real token usage (Copilot is
6
+ subscription-billed, so GitHub doesn't write token counts). We therefore
7
+ *estimate* token counts from the message text with a tokenizer and flag the
8
+ interactions as ``estimated`` so reports never present them as exact.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ from collections.abc import Iterator
15
+ from datetime import datetime, timezone
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from agent_roi.collectors.base import Collector
20
+ from agent_roi.core.models import Interaction, Tool
21
+ from agent_roi.core.platform import vscode_user_dirs
22
+ from agent_roi.core.project import project_for
23
+ from agent_roi.core.tokens import estimate_tokens
24
+
25
+
26
class CopilotCollector(Collector):
    """Collect interactions from GitHub Copilot Chat sessions in VS Code.

    Session files are looked up as
    ``<root>/workspaceStorage/<id>/chatSessions/*.json*``. Token counts are
    estimated from the message text with ``estimate_tokens`` and every
    interaction is created with ``estimated=True``.
    """

    tool = Tool.COPILOT
    name = "copilot"

    def __init__(self, roots: list[Path] | None = None) -> None:
        """Create a collector.

        Args:
            roots: VS Code "User" directories to scan. When ``None`` they are
                discovered with ``vscode_user_dirs()``.
        """
        # Each root is a VS Code "User" dir; chat sessions live under its
        # workspaceStorage subtree.
        self.roots = roots if roots is not None else vscode_user_dirs()

    def is_available(self) -> bool:
        """Return True when any root has a ``workspaceStorage`` directory."""
        return any((r / "workspaceStorage").is_dir() for r in self.roots)

    def search_paths(self) -> list[Path]:
        """Return a copy of the roots this collector scans."""
        return list(self.roots)

    def count_files(self) -> int:
        """Count chat session files under all roots without parsing them."""
        total = 0
        for root in self.roots:
            ws = root / "workspaceStorage"
            if ws.is_dir():
                total += sum(1 for _ in ws.glob("*/chatSessions/*.json*"))
        return total

    def collect(self) -> Iterator[Interaction]:
        """Yield interactions from every chat session file under every root."""
        for root in self.roots:
            ws = root / "workspaceStorage"
            if not ws.is_dir():
                continue
            try:
                ws_dirs = list(ws.iterdir())
            except OSError:
                # An unreadable storage dir must not abort the other roots.
                continue
            for ws_dir in ws_dirs:
                if not ws_dir.is_dir():
                    continue
                cwd = _workspace_cwd(ws_dir)
                chat_dir = ws_dir / "chatSessions"
                if not chat_dir.is_dir():
                    continue
                for session_file in chat_dir.glob("*.json*"):
                    yield from self._parse_file(session_file, cwd)

    def _parse_file(self, path: Path, cwd: str) -> Iterator[Interaction]:
        """Yield the interactions found in one session file.

        Unreadable files are skipped so one bad session cannot abort ingest.
        """
        session_id = path.stem
        try:
            # errors="replace": an undecodable byte would otherwise raise
            # UnicodeDecodeError, which ``except OSError`` does not catch.
            raw = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return
        for obj in _load_objects(raw):
            yield from self._requests_in(obj, session_id, cwd)

    def _requests_in(self, obj: Any, session_id: str, cwd: str) -> Iterator[Interaction]:
        """Walk an arbitrary JSON structure, yielding an Interaction per Copilot
        chat request found.

        A dict counts as a request when it has a truthy ``requestId`` and a
        ``modelId`` key. Its values are still walked afterwards, as are the
        members of every other dict and list.
        """
        if isinstance(obj, dict):
            if obj.get("requestId") and "modelId" in obj:
                itx = self._to_interaction(obj, session_id, cwd)
                if itx is not None:
                    yield itx
            for value in obj.values():
                yield from self._requests_in(value, session_id, cwd)
        elif isinstance(obj, list):
            for value in obj:
                yield from self._requests_in(value, session_id, cwd)

    def _to_interaction(self, req: dict[str, Any], session_id: str, cwd: str) -> Interaction | None:
        """Build an estimated :class:`Interaction` from one request dict.

        Returns ``None`` when the request has no ``requestId``, because no
        stable interaction id can be derived.
        """
        request_id = req.get("requestId")
        if not request_id:
            return None

        user_text = _message_text(req.get("message"))
        response_text = _response_text(req.get("response"))

        model = str(req.get("modelId") or "unknown")
        summary = " ".join(p for p in (user_text, response_text) if p)[:600]
        return Interaction(
            id=f"copilot:{request_id}",
            tool=self.tool,
            session_id=session_id,
            timestamp=_parse_ts(req.get("timestamp")),
            model=_normalize_model(model),
            # The logs carry no token usage, so both counts are estimates.
            input_tokens=estimate_tokens(user_text),
            output_tokens=estimate_tokens(response_text),
            cwd=cwd,
            project=project_for(cwd),
            summary=summary,
            estimated=True,
        )
110
+
111
+
112
+ def _workspace_cwd(ws_dir: Path) -> str:
113
+ """Read the workspace folder from VS Code's ``workspace.json``.
114
+
115
+ VS Code writes ``workspaceStorage/<hash>/workspace.json`` with a ``folder``
116
+ key that is a URI such as:
117
+ - ``file:///Users/yen/repo`` → local path (most common)
118
+ - ``vscode-remote://ssh-remote%2B<host>/home/yen/repo`` → SSH remote
119
+
120
+ We convert both to the plain path portion so ``project_for`` can derive a
121
+ project name. For remote workspaces we keep a ``ssh:<host>:`` prefix so the
122
+ project name stays meaningful (e.g. ``repo`` on host ``100.120.0.60``).
123
+ """
124
+ from urllib.parse import unquote
125
+
126
+ wj = ws_dir / "workspace.json"
127
+ try:
128
+ data = json.loads(wj.read_text(encoding="utf-8"))
129
+ except (OSError, json.JSONDecodeError):
130
+ return ""
131
+ folder = str(data.get("folder", ""))
132
+
133
+ if folder.startswith("file:///"):
134
+ # file:///Users/yen/repo -> /Users/yen/repo
135
+ return unquote(folder[len("file://"):])
136
+
137
+ if folder.startswith("vscode-remote://"):
138
+ # vscode-remote://ssh-remote%2B<host>/path/to/repo
139
+ rest = folder[len("vscode-remote://"):]
140
+ slash = rest.find("/")
141
+ if slash != -1:
142
+ path = unquote(rest[slash:])
143
+ return path # project_for will pick up the last meaningful segment
144
+ return unquote(rest)
145
+
146
+ return folder
147
+
148
+
149
+ def _load_objects(raw: str) -> list[Any]:
150
+ """Parse a session file that may be a single JSON object or JSONL."""
151
+ try:
152
+ return [json.loads(raw)]
153
+ except json.JSONDecodeError:
154
+ objects: list[Any] = []
155
+ for line in raw.splitlines():
156
+ line = line.strip()
157
+ if not line:
158
+ continue
159
+ try:
160
+ objects.append(json.loads(line))
161
+ except json.JSONDecodeError:
162
+ continue
163
+ return objects
164
+
165
+
166
+ def _message_text(message: Any) -> str:
167
+ if isinstance(message, dict):
168
+ return str(message.get("text", ""))
169
+ if isinstance(message, str):
170
+ return message
171
+ return ""
172
+
173
+
174
+ def _response_text(response: Any) -> str:
175
+ """Copilot responses are usually a list of parts with a ``value``/``text``."""
176
+ if isinstance(response, str):
177
+ return response
178
+ if isinstance(response, dict):
179
+ return str(response.get("value") or response.get("text") or "")
180
+ if isinstance(response, list):
181
+ parts = []
182
+ for part in response:
183
+ if isinstance(part, dict):
184
+ parts.append(str(part.get("value") or part.get("text") or ""))
185
+ elif isinstance(part, str):
186
+ parts.append(part)
187
+ return "".join(parts)
188
+ return ""
189
+
190
+
191
+ def _normalize_model(model: str) -> str:
192
+ # Copilot reports e.g. "copilot/claude-opus-4.6"; strip the vendor prefix and
193
+ # map dots to dashes so it matches the pricing table where possible.
194
+ name = model.split("/", 1)[-1]
195
+ return name.replace(".", "-")
196
+
197
+
198
+ def _parse_ts(raw: Any) -> datetime:
199
+ # Copilot timestamps are epoch milliseconds.
200
+ if isinstance(raw, (int, float)):
201
+ try:
202
+ return datetime.fromtimestamp(raw / 1000, tz=timezone.utc)
203
+ except (ValueError, OSError):
204
+ pass
205
+ if isinstance(raw, str):
206
+ try:
207
+ return datetime.fromisoformat(raw.replace("Z", "+00:00"))
208
+ except ValueError:
209
+ pass
210
+ return datetime.now(tz=timezone.utc)