coding-agent-roi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,220 @@
1
+ """Collector for the Gemini CLI.
2
+
3
+ The Gemini CLI keeps one chat log per session under a per-project temp tree::
4
+
5
+ ~/.gemini/tmp/<projectHash>/chats/session-<timestamp>-<id>.json
6
+ ~/.gemini/tmp/<projectHash>/chats/session-<timestamp>-<id>.jsonl # newer
7
+
8
+ ``<projectHash>`` is ``sha256(cwd)``; the real working directory is written
9
+ verbatim next to the chats in ``~/.gemini/tmp/<projectHash>/.project_root`` (or
10
+ the parent of ``chats/``), which we read to recover a meaningful ``project``.
11
+
12
+ Both file shapes carry the same per-message structure — the difference is only
13
+ the container:
14
+
15
+ - ``.json`` — a single object ``{"sessionId", "projectHash", "messages": [...]}``
16
+ - ``.jsonl`` — one record per line; a ``kind: "main"`` header line, then one
17
+ record per message (lines like ``{"$set": ...}`` are state deltas we skip).
18
+
19
+ Either way, ``type == "gemini"`` messages carry **real** token usage in a
20
+ ``tokens`` block (``input``/``output``/``cached``/``thoughts``/``total``) plus the
21
+ ``model``, so Gemini interactions are exact, not estimated. ``thoughts``
22
+ (reasoning) tokens are folded into output, and ``cached`` maps to cache reads.
23
+ Files are read read-only and never modified.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import hashlib
29
+ import json
30
+ from collections.abc import Iterator
31
+ from datetime import datetime
32
+ from functools import lru_cache
33
+ from pathlib import Path
34
+ from typing import Any
35
+
36
+ from agent_roi.collectors.base import Collector
37
+ from agent_roi.core.models import Interaction, Tool
38
+ from agent_roi.core.platform import find_tool_dirs
39
+ from agent_roi.core.project import project_for
40
+
41
+ _SUMMARY_MAX = 600
42
+
43
+
44
class GeminiCollector(Collector):
    """Collect token usage from Gemini CLI chat logs.

    Each root is a ``~/.gemini/tmp`` directory; chat logs live under
    ``<projectHash>/chats/session-*.json`` (or ``.jsonl``). Token counts come
    straight from each ``type == "gemini"`` record's ``tokens`` block. Files
    are only read, never modified.
    """

    tool = Tool.GEMINI
    name = "gemini"

    def __init__(self, roots: list[Path] | None = None) -> None:
        # Each root is a ``~/.gemini/tmp`` dir; chat logs live under
        # ``<projectHash>/chats/``. Multiple roots support WSL reading the
        # Windows-side profile too.
        self.roots = roots if roots is not None else find_tool_dirs(".gemini", "tmp")

    def is_available(self) -> bool:
        """Return True when at least one chat log exists under any root."""
        # ``_chat_files`` returns a generator object, which is always truthy
        # even when it would yield nothing, so pull one item instead of
        # testing the generator itself.
        return any(
            next(iter(self._chat_files(root)), None) is not None
            for root in self.roots
        )

    def search_paths(self) -> list[Path]:
        """Return a copy of the roots this collector searches."""
        return list(self.roots)

    def count_files(self) -> int:
        """Count the chat log files found across all roots."""
        return sum(1 for root in self.roots for _ in self._chat_files(root))

    def collect(self) -> Iterator[Interaction]:
        """Yield one ``Interaction`` per Gemini reply found in every chat log."""
        for root in self.roots:
            for chat in self._chat_files(root):
                yield from self._parse_file(chat)

    @staticmethod
    def _chat_files(root: Path) -> Iterator[Path]:
        # ``<projectHash>/chats/session-*.json{,l}`` — glob both extensions.
        yield from root.glob("*/chats/session-*.json")
        yield from root.glob("*/chats/session-*.jsonl")

    @staticmethod
    def _as_int(value: object) -> int:
        """Coerce a token count to ``int``, treating bad or missing values as 0."""
        try:
            return int(value)  # type: ignore[call-overload]
        except (TypeError, ValueError):
            # e.g. ``null`` or a non-numeric string in a damaged log.
            return 0

    def _parse_file(self, path: Path) -> Iterator[Interaction]:
        """Parse one chat log and yield an ``Interaction`` per Gemini reply.

        Unreadable or undecodable files are skipped silently (best effort).
        Records without a ``tokens`` dict are ignored.
        """
        try:
            raw = path.read_text(encoding="utf-8")
        except (OSError, UnicodeError):
            # UnicodeDecodeError is a ValueError, not an OSError; a single
            # corrupt log must not abort the whole collection run.
            return

        cwd = _project_root_for(path)
        project = project_for(cwd)
        session_id = ""
        last_user = ""
        seq = 0

        for record in _records(raw):
            if not isinstance(record, dict):
                continue

            # Header / metadata: remember the most recent session id seen.
            sid = record.get("sessionId")
            if isinstance(sid, str) and sid:
                session_id = sid

            rtype = record.get("type")
            if rtype == "user":
                # Keep the latest prompt so it can be paired with the reply.
                text = _content_text(record.get("content"))
                if text:
                    last_user = text
                continue
            if rtype != "gemini":
                continue

            tokens = record.get("tokens")
            if not isinstance(tokens, dict):
                continue

            seq += 1
            # Fall back to the file name when the log carries no session id.
            sess = session_id or path.stem
            assistant_text = _content_text(record.get("content"))
            yield Interaction(
                id=f"gemini:{sess}:{seq}",
                tool=self.tool,
                session_id=sess,
                timestamp=_parse_ts(record.get("timestamp")),
                model=str(record.get("model") or "gemini"),
                input_tokens=self._as_int(tokens.get("input", 0)),
                # Reasoning ("thoughts") tokens are billed like output.
                output_tokens=(
                    self._as_int(tokens.get("output", 0))
                    + self._as_int(tokens.get("thoughts", 0))
                ),
                cache_read_tokens=self._as_int(tokens.get("cached", 0)),
                cwd=cwd,
                project=project,
                summary=_combine(last_user, assistant_text),
            )
125
+
126
+
127
+ def _records(raw: str) -> Iterator[Any]:
128
+ """Yield message records from either container shape.
129
+
130
+ A ``.json`` file is one object with a ``messages`` list; a ``.jsonl`` file is
131
+ one record per line. We also yield the top-level object so callers can read
132
+ ``sessionId`` from a ``.json``'s header.
133
+ """
134
+ stripped = raw.lstrip()
135
+ if stripped.startswith("{") and '"messages"' in raw:
136
+ try:
137
+ obj = json.loads(raw)
138
+ except json.JSONDecodeError:
139
+ return
140
+ if isinstance(obj, dict):
141
+ yield obj # header (sessionId, projectHash, ...)
142
+ yield from obj.get("messages", [])
143
+ return
144
+
145
+ for line in raw.splitlines():
146
+ line = line.strip()
147
+ if not line:
148
+ continue
149
+ try:
150
+ yield json.loads(line)
151
+ except json.JSONDecodeError:
152
+ continue
153
+
154
+
155
+ def _project_root_for(chat_file: Path) -> str:
156
+ """Recover the real cwd for a chat file.
157
+
158
+ Layout is ``<root>/<projectHash>/chats/<file>``. We try, in order:
159
+
160
+ 1. the ``.project_root`` marker in the ``<projectHash>`` dir (authoritative,
161
+ written by newer Gemini CLI), then
162
+ 2. a reverse lookup of ``<projectHash>`` (which is ``sha256(cwd)``) against
163
+ the cwds Gemini recorded in ``~/.gemini/projects.json`` — this recovers a
164
+ real path for older sessions that predate the marker file.
165
+ """
166
+ project_dir = chat_file.parent.parent # .../<projectHash>
167
+ marker = project_dir / ".project_root"
168
+ try:
169
+ text = marker.read_text(encoding="utf-8").strip()
170
+ if text:
171
+ return text
172
+ except OSError:
173
+ pass
174
+
175
+ gemini_home = project_dir.parent.parent # .../.gemini
176
+ return _hash_to_cwd(gemini_home).get(project_dir.name, "")
177
+
178
+
179
+ @lru_cache(maxsize=8)
180
+ def _hash_to_cwd(gemini_home: Path) -> dict[str, str]:
181
+ """Map ``sha256(cwd) -> cwd`` for every project in ``projects.json``."""
182
+ try:
183
+ data = json.loads((gemini_home / "projects.json").read_text(encoding="utf-8"))
184
+ except (OSError, json.JSONDecodeError):
185
+ return {}
186
+ projects = data.get("projects") if isinstance(data, dict) else None
187
+ if not isinstance(projects, dict):
188
+ return {}
189
+ return {hashlib.sha256(cwd.encode()).hexdigest(): cwd for cwd in projects}
190
+
191
+
192
+ def _content_text(content: object) -> str:
193
+ """Extract prose from a Gemini message ``content`` field (string or blocks)."""
194
+ if isinstance(content, str):
195
+ return content.strip()
196
+ if isinstance(content, list):
197
+ pieces: list[str] = []
198
+ for block in content:
199
+ if isinstance(block, dict):
200
+ text = block.get("text")
201
+ if isinstance(text, str):
202
+ pieces.append(text)
203
+ elif isinstance(block, str):
204
+ pieces.append(block)
205
+ return " ".join(p for p in pieces if p).strip()
206
+ return ""
207
+
208
+
209
def _combine(user_text: str, assistant_text: str) -> str:
    """Join the non-empty prompt and reply with a space, capped at ``_SUMMARY_MAX`` chars."""
    joined = " ".join(filter(None, (user_text, assistant_text)))
    return joined[:_SUMMARY_MAX]
212
+
213
+
214
+ def _parse_ts(raw: object) -> datetime:
215
+ if isinstance(raw, str):
216
+ try:
217
+ return datetime.fromisoformat(raw.replace("Z", "+00:00"))
218
+ except ValueError:
219
+ pass
220
+ return datetime.now()
@@ -0,0 +1 @@
1
+ """Core domain: models, config, pricing, and the orchestration service."""
@@ -0,0 +1,58 @@
1
+ """Configuration loading for Agent-ROI.
2
+
3
+ Config is read from ``~/.config/agent-roi/config.toml`` (override with the
4
+ ``AGENT_ROI_CONFIG`` env var). Every field has a sensible default so the tool
5
+ works out of the box with zero configuration.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ from platformdirs import user_config_dir, user_data_dir
15
+ from pydantic import BaseModel
16
+
17
+ if sys.version_info >= (3, 11):
18
+ import tomllib
19
+ else: # pragma: no cover
20
+ import tomli as tomllib # type: ignore
21
+
22
+ APP_NAME = "agent-roi"
23
+
24
+
25
class ClassifierConfig(BaseModel):
    """Settings for the topic classifier; every field has a default."""

    # Only "semantic" exists: model-free, offline topic discovery.
    provider: str = "semantic"
    # Cosine similarity at/above which two sessions are grouped into the same
    # topic. Higher = stricter (more, smaller topics); lower = broader topics.
    similarity_threshold: float = 0.18
    # Number of distinctive terms used to name each discovered topic.
    label_terms: int = 3
33
+
34
+
35
class CollectorsConfig(BaseModel):
    """Collector selection settings."""

    # Names of the enabled collectors; all four listed here by default.
    # NOTE(review): presumably matched against each collector's ``name`` —
    # confirm against the code that consumes this list.
    enabled: list[str] = ["claude_code", "codex", "copilot", "gemini"]
37
+
38
+
39
class Config(BaseModel):
    """Top-level application configuration.

    Every field has a default, so ``Config()`` is usable without any file.
    """

    classifier: ClassifierConfig = ClassifierConfig()
    collectors: CollectorsConfig = CollectorsConfig()
    # Database file inside the per-user data directory for this app.
    db_path: Path = Path(user_data_dir(APP_NAME)) / "agent_roi.db"

    @classmethod
    def load(cls) -> Config:
        """Load the config from the resolved TOML path, or return defaults.

        When the file does not exist an all-defaults instance is returned;
        otherwise the parsed TOML is validated into a ``Config``.
        """
        config_file = _config_path()
        if config_file.exists():
            # tomllib requires a binary file handle.
            with config_file.open("rb") as handle:
                parsed = tomllib.load(handle)
            return cls.model_validate(parsed)
        return cls()
52
+
53
+
54
+ def _config_path() -> Path:
55
+ override = os.environ.get("AGENT_ROI_CONFIG")
56
+ if override:
57
+ return Path(override)
58
+ return Path(user_config_dir(APP_NAME)) / "config.toml"
@@ -0,0 +1,241 @@
1
+ """Core domain models shared across collectors, classifier, and storage."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ from enum import Enum
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class Tool(str, Enum):
    """Supported AI coding tools.

    The ``str`` mix-in makes every member a string equal to its value, so a
    member compares equal to the plain identifier it wraps.
    """

    CLAUDE_CODE = "claude_code"
    CODEX = "codex"
    COPILOT = "copilot"
    GEMINI = "gemini"
    CURSOR = "cursor"
    # Catch-all member. NOTE(review): presumably used when a log's tool cannot
    # be identified — confirm against callers.
    UNKNOWN = "unknown"
20
+
21
+
22
class Interaction(BaseModel):
    """One normalized request/response turn extracted from a tool's logs.

    Collectors convert each tool's native log format into this shape, and it is
    the canonical unit that gets stored in the database.
    """

    id: str = Field(..., description="Stable unique id (usually tool's own message id).")
    tool: Tool
    session_id: str = Field(..., description="Groups interactions from one agent session.")
    timestamp: datetime
    model: str = Field(..., description="Model name reported by the tool, e.g. 'claude-opus-4-8'.")

    # Token counters; each defaults to zero.
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0

    # Directory the agent was run from, if the tool records one. Feeds
    # ``project`` and serves as a hint for topic classification.
    cwd: str = ""
    # Coarse grouping derived from cwd (git root / folder name). This is not
    # the final topic: the classifier assigns a semantic ``topic`` per session.
    project: str = ""

    # Short free-text summary read by the classifier to derive a topic. It
    # never holds full prompt bodies, and classification is local and offline,
    # so nothing here leaves the machine.
    summary: str = ""

    # Set by the classifier; stays ``None`` until classification has run.
    topic: str | None = None

    # True when the token counts are estimates (e.g. from a tokenizer) rather
    # than numbers reported by the tool. Copilot does not expose real usage, so
    # its interactions carry this flag and are shown as "estimated" in reports.
    estimated: bool = False

    @property
    def total_tokens(self) -> int:
        """Sum of input, output, cache-read and cache-write tokens."""
        counters = (
            self.input_tokens,
            self.output_tokens,
            self.cache_read_tokens,
            self.cache_write_tokens,
        )
        return sum(counters)
68
+
69
+
70
class Rollup(BaseModel):
    """Token usage and cost aggregated for a single value of one dimension.

    ``key`` holds the dimension value — a topic, a tool, or a model, depending
    on how the rollup was requested. ``estimated`` is true when *any*
    interaction in the group has estimated rather than tool-reported tokens,
    so the UI can mark the figure as approximate.
    """

    key: str
    interactions: int
    input_tokens: int
    output_tokens: int
    cache_read_tokens: int
    cache_write_tokens: int
    cost_usd: float
    estimated: bool = False

    @property
    def total_tokens(self) -> int:
        """Sum of input, output, cache-read and cache-write tokens."""
        counters = (
            self.input_tokens,
            self.output_tokens,
            self.cache_read_tokens,
            self.cache_write_tokens,
        )
        return sum(counters)
96
+
97
+
98
class TopicBreakdown(BaseModel):
    """A topic's total, plus how it splits across tools and models.

    This is what lets a user answer "this topic's tokens came from which tools,
    at what price?" — the drill-down behind a single topic row.
    """

    topic: str
    # Aggregate for the whole topic.
    total: Rollup
    # The same usage split per tool and per model, one ``Rollup`` each.
    by_tool: list[Rollup]
    by_model: list[Rollup]
109
+
110
+
111
class ModelPricing(BaseModel):
    """Per-model unit prices (USD per 1M tokens), exposed so users can verify
    that cost = usage x these numbers."""

    model: str
    # One price per token category, each in USD per 1M tokens.
    input: float
    output: float
    cache_read: float
    cache_write: float
120
+
121
+
122
class SessionSummary(BaseModel):
    """Aggregate for one agent session — the unit topics are built from.

    A topic groups many sessions and a session groups many interactions, which
    makes this the middle layer of the topic -> session -> interaction
    drill-down.
    """

    session_id: str
    topic: str
    project: str
    tools: list[str]
    models: list[str]
    started: datetime
    ended: datetime
    interactions: int
    input_tokens: int
    output_tokens: int
    cache_read_tokens: int
    cache_write_tokens: int
    cost_usd: float
    estimated: bool = False

    @property
    def total_tokens(self) -> int:
        """Sum of input, output, cache-read and cache-write tokens."""
        counters = (
            self.input_tokens,
            self.output_tokens,
            self.cache_read_tokens,
            self.cache_write_tokens,
        )
        return sum(counters)
152
+
153
+
154
class InteractionView(BaseModel):
    """One interaction as presented when a session is expanded."""

    id: str
    tool: str
    model: str
    timestamp: datetime
    input_tokens: int
    output_tokens: int
    cache_read_tokens: int
    cache_write_tokens: int
    cost_usd: float
    estimated: bool
    summary: str

    @property
    def total_tokens(self) -> int:
        """Sum of input, output, cache-read and cache-write tokens."""
        counters = (
            self.input_tokens,
            self.output_tokens,
            self.cache_read_tokens,
            self.cache_write_tokens,
        )
        return sum(counters)
177
+
178
+
179
class SessionDetail(BaseModel):
    """A session's aggregate plus the interactions (conversation turns) in it."""

    # Session-level totals.
    session: SessionSummary
    # The individual turns that make up the session.
    interactions: list[InteractionView]
184
+
185
+
186
class TimeSeriesPoint(BaseModel):
    """One day's usage bucket, used for trend charts."""

    date: str
    interactions: int
    input_tokens: int
    output_tokens: int
    # The cache counters are optional and default to zero.
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0
    cost_usd: float

    @property
    def total_tokens(self) -> int:
        """Sum of input, output, cache-read and cache-write tokens."""
        counters = (
            self.input_tokens,
            self.output_tokens,
            self.cache_read_tokens,
            self.cache_write_tokens,
        )
        return sum(counters)
205
+
206
+
207
class TimeSeriesSplitRow(BaseModel):
    """One day of token usage split across a dimension (tool, model, …)."""

    date: str
    # NOTE(review): presumably maps each dimension value (a tool or model
    # name) to that day's token count — confirm against the producer.
    values: dict[str, int]
    cost_usd: float
    interactions: int
214
+
215
+
216
class TimeSeriesBundle(BaseModel):
    """Everything the trends dashboard needs in one round trip."""

    # Overall daily buckets.
    totals: list[TimeSeriesPoint]
    # Daily rows split per tool and per model.
    by_tool: list[TimeSeriesSplitRow]
    by_model: list[TimeSeriesSplitRow]
    # NOTE(review): presumably the keys that can appear in the split rows'
    # ``values`` dicts — confirm against the producer.
    tool_keys: list[str]
    model_keys: list[str]
224
+
225
+
226
class CollectorStatus(BaseModel):
    """Diagnostics for one tool collector: where it looked and what it found.

    This powers the `doctor` command and the dashboard's "data sources" panel so
    users can see *why* a tool was or wasn't picked up, instead of guessing.
    """

    name: str
    tool: str
    available: bool
    # Where the collector looked.
    search_paths: list[str]
    # NOTE(review): presumably the number of log files discovered — confirm
    # against the code that builds this status.
    log_files: int
    # Usage figures and the free-text note are optional and default to
    # zero / empty.
    interactions: int = 0
    tokens: int = 0
    cost_usd: float = 0.0
    note: str = ""
@@ -0,0 +1,113 @@
1
+ """Cross-platform helpers for locating tool log directories.
2
+
3
+ Agent-ROI runs on Windows, macOS, and Linux. The tricky case is WSL: a user may
4
+ run their AI tools from Windows (logs under ``C:\\Users\\<name>``, visible from
5
+ WSL at ``/mnt/c/Users/<name>``) while running Agent-ROI from inside the Linux
6
+ distro. We therefore search several candidate roots and use whichever exist.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
def is_wsl() -> bool:
    """Report whether we are running under Windows Subsystem for Linux.

    True only on a Linux platform whose kernel release string contains
    ``microsoft``.
    """
    on_linux = sys.platform.startswith("linux")
    return on_linux and "microsoft" in _osrelease()
21
+
22
+
23
def platform_label() -> str:
    """Return a short human-readable OS name for diagnostics.

    Unrecognised platforms fall back to the raw ``sys.platform`` value, and
    `` (WSL)`` is appended when running under WSL.
    """
    friendly = {"darwin": "macOS", "win32": "Windows", "linux": "Linux"}
    label = friendly.get(sys.platform, sys.platform)
    if is_wsl():
        return f"{label} (WSL)"
    return label
28
+
29
+
30
+ def _osrelease() -> str:
31
+ try:
32
+ return Path("/proc/sys/kernel/osrelease").read_text().lower()
33
+ except OSError:
34
+ return ""
35
+
36
+
37
def home_candidates() -> list[Path]:
    """List the home directories worth searching, most specific first.

    The native home always comes first. Under WSL the mounted Windows user
    profile(s) follow, because tools are often run from the Windows side.
    Duplicates are dropped while keeping first-seen order.
    """
    homes: list[Path] = [Path.home()]
    if is_wsl():
        homes += _windows_homes_from_wsl()
    # dict keys are unique and keep insertion order: an order-preserving dedupe.
    return list(dict.fromkeys(homes))
54
+
55
+
56
+ def _windows_homes_from_wsl() -> list[Path]:
57
+ """Best-effort discovery of Windows user homes from inside WSL."""
58
+ homes: list[Path] = []
59
+ users_dir = Path("/mnt/c/Users")
60
+ if not users_dir.is_dir():
61
+ return homes
62
+ # Prefer the matching username, but include all real profiles as a fallback.
63
+ win_user = os.environ.get("WIN_USER") or os.environ.get("USER")
64
+ skip = {"Default", "Default User", "Public", "All Users", "desktop.ini"}
65
+ for entry in users_dir.iterdir():
66
+ if entry.name in skip or not entry.is_dir():
67
+ continue
68
+ if win_user and entry.name.lower() == win_user.lower():
69
+ homes.insert(0, entry)
70
+ else:
71
+ homes.append(entry)
72
+ return homes
73
+
74
+
75
def find_tool_dirs(*relative_parts: str) -> list[Path]:
    """Collect every existing ``<home>/<relative_parts>`` directory.

    All candidate homes (native plus WSL-mounted Windows) are checked, and the
    matches are returned in candidate order.
    """
    targets = (home.joinpath(*relative_parts) for home in home_candidates())
    return [target for target in targets if target.is_dir()]
84
+
85
+
86
def vscode_user_dirs() -> list[Path]:
    """Find VS Code ``User`` directories across platforms, forks and insiders.

    VS Code keeps per-user state (including chat sessions) in a different
    location on each OS. Every existing match is returned so collectors can
    search all of them.
    """
    # Layout of the "User" dir relative to a home; "{app}" is the editor name.
    windows_layout = ("AppData", "Roaming", "{app}", "User")
    linux_layouts: list[tuple[str, ...]] = [(".config", "{app}", "User")]
    # Under WSL the Windows-side install (under AppData) is wanted as well.
    if is_wsl():
        linux_layouts.append(windows_layout)

    layouts_by_platform: dict[str, list[tuple[str, ...]]] = {
        "darwin": [("Library", "Application Support", "{app}", "User")],
        "win32": [windows_layout],
        "linux": linux_layouts,
    }
    # Unknown platforms are treated like Linux.
    layouts = layouts_by_platform.get(sys.platform, linux_layouts)
    editors = ["Code", "Code - Insiders", "VSCodium", "Cursor"]

    candidates = (
        home.joinpath(*(segment.replace("{app}", editor) for segment in layout))
        for home in home_candidates()
        for layout in layouts
        for editor in editors
    )
    return [path for path in candidates if path.is_dir()]