coding-agent-roi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_roi/__init__.py +3 -0
- agent_roi/api/__init__.py +1 -0
- agent_roi/api/app.py +179 -0
- agent_roi/classify/__init__.py +26 -0
- agent_roi/classify/base.py +44 -0
- agent_roi/classify/semantic.py +197 -0
- agent_roi/cli/__init__.py +1 -0
- agent_roi/cli/main.py +200 -0
- agent_roi/collectors/__init__.py +31 -0
- agent_roi/collectors/base.py +49 -0
- agent_roi/collectors/claude_code.py +165 -0
- agent_roi/collectors/codex.py +157 -0
- agent_roi/collectors/copilot.py +210 -0
- agent_roi/collectors/gemini.py +220 -0
- agent_roi/core/__init__.py +1 -0
- agent_roi/core/config.py +58 -0
- agent_roi/core/models.py +241 -0
- agent_roi/core/platform.py +113 -0
- agent_roi/core/pricing.py +79 -0
- agent_roi/core/project.py +52 -0
- agent_roi/core/service.py +172 -0
- agent_roi/core/timeframe.py +76 -0
- agent_roi/core/tokens.py +30 -0
- agent_roi/storage/__init__.py +5 -0
- agent_roi/storage/db.py +542 -0
- coding_agent_roi-0.1.0.dist-info/METADATA +163 -0
- coding_agent_roi-0.1.0.dist-info/RECORD +30 -0
- coding_agent_roi-0.1.0.dist-info/WHEEL +4 -0
- coding_agent_roi-0.1.0.dist-info/entry_points.txt +2 -0
- coding_agent_roi-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Collector registry.
|
|
2
|
+
|
|
3
|
+
New collectors register here so the CLI and config can refer to them by name.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from agent_roi.collectors.base import Collector
|
|
9
|
+
from agent_roi.collectors.claude_code import ClaudeCodeCollector
|
|
10
|
+
from agent_roi.collectors.codex import CodexCollector
|
|
11
|
+
from agent_roi.collectors.copilot import CopilotCollector
|
|
12
|
+
from agent_roi.collectors.gemini import GeminiCollector
|
|
13
|
+
|
|
14
|
+
# Maps each collector's declared ``name`` to its class. Keys are read from the
# classes themselves, so the registry cannot drift from the names they declare.
_REGISTRY: dict[str, type[Collector]] = {
    collector_cls.name: collector_cls
    for collector_cls in (
        ClaudeCodeCollector,
        CodexCollector,
        CopilotCollector,
        GeminiCollector,
    )
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_collectors(names: list[str]) -> list[Collector]:
    """Build one collector instance per recognised entry of *names*.

    The order of *names* (including duplicates) is preserved. Names that are
    not present in the registry are dropped without raising.
    """
    instances: list[Collector] = []
    for requested in names:
        collector_cls = _REGISTRY.get(requested)
        if collector_cls is not None:
            instances.append(collector_cls())
    return instances
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def all_collector_names() -> list[str]:
    """Return every registered collector name, in registry order."""
    return [*_REGISTRY]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Public API of the collectors package: the base class plus the two registry
# helpers. The concrete collector classes are reached through the registry.
__all__ = ["Collector", "get_collectors", "all_collector_names"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Collector interface.
|
|
2
|
+
|
|
3
|
+
A collector knows how to find one tool's local logs and turn them into a stream
|
|
4
|
+
of normalized :class:`Interaction` objects. Collectors must be read-only and
|
|
5
|
+
idempotent: running ingest twice should not double-count, which is enforced
|
|
6
|
+
upstream by the stable ``Interaction.id``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from collections.abc import Iterator
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from agent_roi.core.models import Interaction, Tool
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Collector(ABC):
    """Abstract interface implemented by every tool-log collector.

    Subclasses must provide :meth:`is_available` and :meth:`collect`. The
    remaining methods are optional diagnostics hooks with neutral defaults.
    """

    #: Tool whose interactions this collector emits.
    tool: Tool

    #: Stable identifier; this is what config's ``collectors.enabled`` lists.
    name: str

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether logs for this tool are present on this machine."""

    @abstractmethod
    def collect(self) -> Iterator[Interaction]:
        """Parse the local logs and yield normalized interactions."""

    def search_paths(self) -> list[Path]:
        """Return the directories this collector searched.

        Used by the diagnostics report so users can see why a tool was or was
        not detected. The base implementation reports nothing; subclasses
        override it to expose their search locations.
        """
        return []

    def count_files(self) -> int:
        """Return how many log files are visible (cheap; nothing is parsed).

        The base implementation reports zero.
        """
        return 0

    def note(self) -> str:
        """Return an optional human-readable diagnostics hint (e.g. why empty).

        The base implementation returns an empty string.
        """
        return ""
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Collector for Claude Code.
|
|
2
|
+
|
|
3
|
+
Claude Code stores one JSONL file per session under
|
|
4
|
+
``~/.claude/projects/<encoded-project-path>/<session-id>.jsonl``. Each line is a
|
|
5
|
+
JSON object; assistant turns carry a ``message.usage`` block with token counts.
|
|
6
|
+
We read these files read-only and never modify them.
|
|
7
|
+
|
|
8
|
+
We emit one :class:`Interaction` per assistant turn (those are the ones with
|
|
9
|
+
token usage), but we build each interaction's ``summary`` from both the user's
|
|
10
|
+
request and the assistant's reply. Many assistant turns are pure tool calls with
|
|
11
|
+
no prose, so using assistant text alone leaves most summaries empty — which makes
|
|
12
|
+
topic discovery impossible. Carrying the preceding user message keeps the topic
|
|
13
|
+
signal intact.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
from collections.abc import Iterator
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from agent_roi.collectors.base import Collector
|
|
25
|
+
from agent_roi.core.models import Interaction, Tool
|
|
26
|
+
from agent_roi.core.platform import find_tool_dirs
|
|
27
|
+
from agent_roi.core.project import project_for
|
|
28
|
+
|
|
29
|
+
# Maximum length, in characters, of the summary built by ``_combine_summary``.
_SUMMARY_MAX = 600
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ClaudeCodeCollector(Collector):
    """Collector that turns Claude Code session logs into interactions.

    One :class:`Interaction` is produced per record that carries a
    ``message.usage`` block. Unreadable files and malformed records are skipped
    so that a single bad log line cannot abort the whole ingest.
    """

    tool = Tool.CLAUDE_CODE
    name = "claude_code"

    def __init__(self, roots: list[Path] | None = None) -> None:
        """Remember the directories to scan.

        Args:
            roots: Directories searched recursively for ``*.jsonl`` files. When
                ``None``, ``find_tool_dirs(".claude", "projects")`` decides.
        """
        # Supports multiple roots so WSL can also read Windows-side logs.
        self.roots = roots if roots is not None else find_tool_dirs(".claude", "projects")

    def is_available(self) -> bool:
        """Return True when at least one root directory is configured."""
        return bool(self.roots)

    def search_paths(self) -> list[Path]:
        """Return a copy of the root directories, for the diagnostics report."""
        return list(self.roots)

    def count_files(self) -> int:
        """Count the ``*.jsonl`` files under all roots without parsing them."""
        return sum(1 for root in self.roots for _ in root.rglob("*.jsonl"))

    def collect(self) -> Iterator[Interaction]:
        """Yield interactions from every ``*.jsonl`` file under every root."""
        for root in self.roots:
            for jsonl in root.rglob("*.jsonl"):
                yield from self._parse_file(jsonl)

    def _parse_file(self, path: Path) -> Iterator[Interaction]:
        """Yield the interactions found in one session file.

        The file stem is used as the session id. The most recent non-empty
        user text is carried forward and attached to each following record
        that has a usage block.
        """
        session_id = path.stem
        try:
            raw = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Unreadable, or not valid UTF-8: skip this file and keep going.
            return

        last_user_text = ""
        # Split on "\n" only. ``str.splitlines`` also breaks on characters such
        # as U+2028 that may legally appear unescaped inside a JSON string,
        # which would cut a record in two and silently drop it.
        for line in raw.split("\n"):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A line can be valid JSON without being an object (e.g. a list).
            if not isinstance(record, dict):
                continue

            message = record.get("message")
            if not isinstance(message, dict):
                continue

            role = message.get("role") or record.get("type")
            text = _text_from_content(message.get("content"))

            if role == "user":
                # Remember the latest user intent to attach to the next reply.
                if text:
                    last_user_text = text
                continue

            usage = message.get("usage")
            if not isinstance(usage, dict):
                continue

            interaction = self._to_interaction(
                record, message, usage, session_id, last_user_text, text
            )
            if interaction is not None:
                yield interaction

    @staticmethod
    def _tokens(usage: dict[str, Any], key: str) -> int:
        """Return ``usage[key]`` as an int, or 0 when missing, null or invalid."""
        try:
            return int(usage.get(key, 0))
        except (TypeError, ValueError, OverflowError):
            return 0

    def _to_interaction(
        self,
        record: dict[str, Any],
        message: dict[str, Any],
        usage: dict[str, Any],
        session_id: str,
        user_text: str,
        assistant_text: str,
    ) -> Interaction | None:
        """Build an :class:`Interaction` from one record that has usage data.

        Args:
            record: The decoded JSONL line.
            message: ``record["message"]``.
            usage: ``message["usage"]``.
            session_id: Identifier of the session file being parsed.
            user_text: Latest user text seen before this record (may be empty).
            assistant_text: Text extracted from this record (may be empty).

        Returns:
            The interaction, or ``None`` when the record has neither a
            ``message.id`` nor a ``uuid`` to build a stable id from.
        """
        # Local import: this module imports only ``datetime`` at the top.
        from datetime import timezone

        msg_id = message.get("id") or record.get("uuid")
        if not msg_id:
            return None

        timestamp = None
        ts_raw = record.get("timestamp")
        if isinstance(ts_raw, str):
            try:
                timestamp = datetime.fromisoformat(ts_raw.replace("Z", "+00:00"))
            except ValueError:
                timestamp = None
        if timestamp is None:
            # Timezone-aware fallback, so it can be compared with the aware
            # values produced from "...Z" timestamps.
            timestamp = datetime.now(tz=timezone.utc)

        # ``or`` rather than a ``.get`` default: a JSON null must not become
        # the literal string "None".
        cwd = str(record.get("cwd") or "")
        return Interaction(
            id=str(msg_id),
            tool=self.tool,
            session_id=session_id,
            timestamp=timestamp,
            model=str(message.get("model") or "unknown"),
            input_tokens=self._tokens(usage, "input_tokens"),
            output_tokens=self._tokens(usage, "output_tokens"),
            cache_read_tokens=self._tokens(usage, "cache_read_input_tokens"),
            cache_write_tokens=self._tokens(usage, "cache_creation_input_tokens"),
            cwd=cwd,
            project=project_for(cwd),
            summary=_combine_summary(user_text, assistant_text),
        )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _combine_summary(user_text: str, assistant_text: str) -> str:
    """Join the non-empty texts, user request first, capped at ``_SUMMARY_MAX``."""
    joined = " ".join(filter(None, (user_text, assistant_text)))
    return joined[:_SUMMARY_MAX]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _text_from_content(content: object) -> str:
    """Extract human-readable text from a Claude message ``content`` field.

    Content may be a plain string or a list of typed blocks. Prose (``text``
    blocks) and tool *names* (``tool_use`` blocks) are kept as topic signal.
    Every other block type, including ``tool_result``, is deliberately
    skipped: those carry command output (e.g. ``ls -l`` listings, file dumps)
    that pollutes topic labels instead of describing the task.

    Returns:
        The stripped string, the space-joined pieces of a block list, or
        ``""`` for any other input.
    """
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return ""

    pieces: list[str] = []
    for block in content:
        if not isinstance(block, dict):
            continue
        btype = block.get("type")
        if btype == "text":
            text = block.get("text")
            # A JSON null would otherwise be stringified to the word "None".
            if text is not None:
                pieces.append(str(text))
        elif btype == "tool_use":
            name = block.get("name")
            if name:
                pieces.append(str(name))
    return " ".join(p for p in pieces if p).strip()
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Collector for OpenAI Codex CLI.
|
|
2
|
+
|
|
3
|
+
Codex CLI stores one rollout log per session as JSONL under
|
|
4
|
+
``~/.codex/sessions/<YYYY>/<MM>/<DD>/rollout-*.jsonl``. Each line is a typed
|
|
5
|
+
record. The shapes we care about:
|
|
6
|
+
|
|
7
|
+
- ``session_meta`` — session id and start time.
|
|
8
|
+
- ``turn_context`` — carries the active ``model`` for subsequent turns.
|
|
9
|
+
- ``event_msg`` with ``payload.type == "token_count"`` — per-turn token usage in
|
|
10
|
+
``info.last_token_usage`` (input/cached/output/reasoning tokens).
|
|
11
|
+
- ``event_msg`` with ``payload.type in {"user_message","agent_message"}`` — text
|
|
12
|
+
we keep a short snippet of for the classifier.
|
|
13
|
+
|
|
14
|
+
We emit one :class:`Interaction` per ``token_count`` event, using
|
|
15
|
+
``last_token_usage`` (the delta for that turn) so usage isn't double-counted from
|
|
16
|
+
the running ``total_token_usage``. The parser is defensive: unknown records are
|
|
17
|
+
skipped rather than failing the whole ingest.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
from collections.abc import Iterator
|
|
24
|
+
from datetime import datetime
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
from agent_roi.collectors.base import Collector
|
|
29
|
+
from agent_roi.core.models import Interaction, Tool
|
|
30
|
+
from agent_roi.core.platform import find_tool_dirs
|
|
31
|
+
from agent_roi.core.project import project_for
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CodexCollector(Collector):
    """Collector that turns Codex CLI rollout logs into interactions.

    One :class:`Interaction` is produced per ``token_count`` event that has
    usable usage data. Unreadable files and malformed records are skipped so
    that a single bad log line cannot abort the whole ingest.
    """

    tool = Tool.CODEX
    name = "codex"

    #: Glob shared by ``count_files`` and ``collect`` so the diagnostics count
    #: always matches the set of files that is actually parsed.
    _LOG_GLOB = "*.jsonl"

    def __init__(self, roots: list[Path] | None = None) -> None:
        """Remember the directories to scan.

        Args:
            roots: Directories searched recursively for log files. When
                ``None``, ``find_tool_dirs(".codex", "sessions")`` decides.
        """
        self.roots = roots if roots is not None else find_tool_dirs(".codex", "sessions")

    def is_available(self) -> bool:
        """Return True when at least one root directory is configured."""
        return bool(self.roots)

    def search_paths(self) -> list[Path]:
        """Return a copy of the root directories, for the diagnostics report."""
        return list(self.roots)

    def count_files(self) -> int:
        """Count the log files ``collect`` would parse, without parsing them."""
        return sum(1 for root in self.roots for _ in root.rglob(self._LOG_GLOB))

    def collect(self) -> Iterator[Interaction]:
        """Yield interactions from every log file under every root."""
        for root in self.roots:
            for jsonl in root.rglob(self._LOG_GLOB):
                yield from self._parse_file(jsonl)

    @staticmethod
    def _tokens(usage: dict[str, Any], key: str) -> int:
        """Return ``usage[key]`` as an int, or 0 when missing, null or invalid."""
        try:
            return int(usage.get(key, 0))
        except (TypeError, ValueError, OverflowError):
            return 0

    def _parse_file(self, path: Path) -> Iterator[Interaction]:
        """Yield one interaction per usable ``token_count`` event in *path*.

        The active model, working directory and the latest user/agent texts
        are tracked while scanning and attached to each emitted interaction.
        """
        # Session id is the text after the last "-" in the file stem (the whole
        # stem when there is no dash).
        # NOTE(review): for a stem that ends in a dashed UUID this keeps only
        # the UUID's final group. Left unchanged because it feeds the stable
        # ``Interaction.id`` — confirm whether the full UUID was intended.
        session_id = path.stem.rsplit("-", 1)[-1]

        model = "unknown"
        cwd = ""
        last_user = ""
        last_agent = ""
        seq = 0

        try:
            raw = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Unreadable, or not valid UTF-8: skip this file and keep going.
            return

        # Split on "\n" only. ``str.splitlines`` also breaks on characters such
        # as U+2028 that may legally appear unescaped inside a JSON string,
        # which would cut a record in two and silently drop it.
        for line in raw.split("\n"):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A line can be valid JSON without being an object (e.g. a list).
            if not isinstance(record, dict):
                continue

            rtype = record.get("type")
            payload = record.get("payload")
            payload = payload if isinstance(payload, dict) else {}

            if rtype == "turn_context":
                model = str(payload.get("model") or model)
                cwd = str(payload.get("cwd") or cwd)
                continue

            if rtype == "session_meta":
                cwd = str(payload.get("cwd") or cwd)
                continue

            if rtype != "event_msg":
                continue

            ptype = payload.get("type")
            if ptype == "user_message":
                text = _event_text(payload)
                if text:
                    last_user = text
            elif ptype == "agent_message":
                text = _event_text(payload)
                if text:
                    last_agent = text
            elif ptype == "token_count":
                usage = _last_usage(payload)
                if usage is None:
                    continue
                # The per-file sequence number keeps the id stable across
                # repeated ingests of the same log.
                seq += 1
                yield Interaction(
                    id=f"codex:{session_id}:{seq}",
                    tool=self.tool,
                    session_id=session_id,
                    timestamp=_parse_ts(record.get("timestamp")),
                    model=_normalize_model(model),
                    input_tokens=self._tokens(usage, "input_tokens"),
                    # NOTE(review): assumes ``output_tokens`` excludes
                    # reasoning tokens — confirm against the Codex log format.
                    output_tokens=(
                        self._tokens(usage, "output_tokens")
                        + self._tokens(usage, "reasoning_output_tokens")
                    ),
                    cache_read_tokens=self._tokens(usage, "cached_input_tokens"),
                    cwd=cwd,
                    project=project_for(cwd),
                    summary=_combine(last_user, last_agent),
                )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _event_text(payload: dict[str, Any]) -> str:
    """Return the stripped text of a message event, or ``""`` if it has none.

    ``payload["message"]`` is preferred; ``payload["text"]`` is used when the
    former is missing or falsy. A non-string result yields ``""``.
    """
    candidate = payload.get("message") or payload.get("text") or ""
    if not isinstance(candidate, str):
        return ""
    return candidate.strip()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _combine(user_text: str, agent_text: str) -> str:
    """Join the non-empty texts with a space (user first), cut to 600 chars."""
    non_empty = filter(None, (user_text, agent_text))
    return " ".join(non_empty)[:600]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _last_usage(payload: dict[str, Any]) -> dict[str, Any] | None:
    """Pull the token usage dict out of a ``token_count`` event payload.

    ``info.last_token_usage`` is preferred. When it is missing or empty,
    ``info.total_token_usage`` is used instead.

    NOTE(review): the fallback looks like a running total rather than a
    per-turn delta, so summing it over events may over-count — confirm that
    this fallback is intended.

    Returns:
        The usage dict, or ``None`` when ``info`` or the selected usage value
        is not a dict.
    """
    info = payload.get("info")
    if isinstance(info, dict):
        usage = info.get("last_token_usage") or info.get("total_token_usage")
        if isinstance(usage, dict):
            return usage
    return None
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _normalize_model(model: str) -> str:
    """Return *model* with every ``.`` replaced by ``-``."""
    # Codex reports dotted versions such as "gpt-5.5"; the dashed form is the
    # one used for the pricing lookup.
    return "-".join(model.split("."))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _parse_ts(raw: object) -> datetime:
    """Parse an ISO-8601 timestamp string, falling back to the current time.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing because
    ``datetime.fromisoformat`` only accepts it from Python 3.11 onwards.

    Returns:
        The parsed datetime when *raw* is a valid ISO string. Otherwise the
        current time as a timezone-aware UTC datetime, so the fallback can be
        compared with the aware values produced from ``...Z`` timestamps.
    """
    # Local import: this module imports only ``datetime`` at the top.
    from datetime import timezone

    if isinstance(raw, str):
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            pass
    return datetime.now(tz=timezone.utc)
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Collector for GitHub Copilot Chat in VS Code.
|
|
2
|
+
|
|
3
|
+
Copilot stores chat sessions under each VS Code workspace's
|
|
4
|
+
``workspaceStorage/<id>/chatSessions/*.json(l)``. Crucially, these logs record
|
|
5
|
+
the conversation text and the model id, but **not** real token usage (Copilot is
|
|
6
|
+
subscription-billed, so GitHub doesn't write token counts). We therefore
|
|
7
|
+
*estimate* token counts from the message text with a tokenizer and flag the
|
|
8
|
+
interactions as ``estimated`` so reports never present them as exact.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from collections.abc import Iterator
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from agent_roi.collectors.base import Collector
|
|
20
|
+
from agent_roi.core.models import Interaction, Tool
|
|
21
|
+
from agent_roi.core.platform import vscode_user_dirs
|
|
22
|
+
from agent_roi.core.project import project_for
|
|
23
|
+
from agent_roi.core.tokens import estimate_tokens
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CopilotCollector(Collector):
    """Collector for GitHub Copilot Chat sessions stored by VS Code.

    Token counts are estimated from the message text with ``estimate_tokens``
    and every interaction is flagged ``estimated=True``. Unreadable session
    files are skipped so one bad file cannot abort the whole ingest.
    """

    tool = Tool.COPILOT
    name = "copilot"

    def __init__(self, roots: list[Path] | None = None) -> None:
        """Remember the VS Code "User" directories to scan.

        Args:
            roots: VS Code "User" directories. When ``None``,
                ``vscode_user_dirs()`` decides.
        """
        # Each root is a VS Code "User" dir; chat sessions live under its
        # workspaceStorage subtree.
        self.roots = roots if roots is not None else vscode_user_dirs()

    def is_available(self) -> bool:
        """Return True when any root contains a ``workspaceStorage`` directory."""
        return any((r / "workspaceStorage").is_dir() for r in self.roots)

    def search_paths(self) -> list[Path]:
        """Return a copy of the root directories, for the diagnostics report."""
        return list(self.roots)

    def count_files(self) -> int:
        """Count ``chatSessions/*.json*`` files under every workspace, unparsed."""
        total = 0
        for root in self.roots:
            ws = root / "workspaceStorage"
            if ws.is_dir():
                total += sum(1 for _ in ws.glob("*/chatSessions/*.json*"))
        return total

    def collect(self) -> Iterator[Interaction]:
        """Yield interactions from every chat session file of every workspace."""
        for root in self.roots:
            ws = root / "workspaceStorage"
            if not ws.is_dir():
                continue
            for ws_dir in ws.iterdir():
                if not ws_dir.is_dir():
                    continue
                # The workspace folder is resolved once per workspace and
                # shared by all of its session files.
                cwd = _workspace_cwd(ws_dir)
                chat_dir = ws_dir / "chatSessions"
                if not chat_dir.is_dir():
                    continue
                for session_file in chat_dir.glob("*.json*"):
                    yield from self._parse_file(session_file, cwd)

    def _parse_file(self, path: Path, cwd: str) -> Iterator[Interaction]:
        """Yield the interactions found in one session file.

        The file stem is used as the session id.
        """
        session_id = path.stem
        try:
            raw = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Unreadable, or not valid UTF-8: skip this file and keep going.
            return
        for obj in _load_objects(raw):
            yield from self._requests_in(obj, session_id, cwd)

    def _requests_in(self, obj: Any, session_id: str, cwd: str) -> Iterator[Interaction]:
        """Walk an arbitrary JSON structure, yielding an Interaction per Copilot
        chat request found.

        A dict counts as a request when it has a truthy ``requestId`` and a
        ``modelId`` key. Its values are still walked afterwards, so nested
        requests are found as well.
        """
        if isinstance(obj, dict):
            if obj.get("requestId") and "modelId" in obj:
                itx = self._to_interaction(obj, session_id, cwd)
                if itx is not None:
                    yield itx
            for value in obj.values():
                yield from self._requests_in(value, session_id, cwd)
        elif isinstance(obj, list):
            for value in obj:
                yield from self._requests_in(value, session_id, cwd)

    def _to_interaction(self, req: dict[str, Any], session_id: str, cwd: str) -> Interaction | None:
        """Build an estimated :class:`Interaction` from one request dict.

        Returns:
            The interaction, or ``None`` when the request has no ``requestId``.
        """
        request_id = req.get("requestId")
        if not request_id:
            return None

        user_text = _message_text(req.get("message"))
        response_text = _response_text(req.get("response"))

        model = str(req.get("modelId") or "unknown")
        summary = " ".join(p for p in (user_text, response_text) if p)[:600]
        return Interaction(
            id=f"copilot:{request_id}",
            tool=self.tool,
            session_id=session_id,
            timestamp=_parse_ts(req.get("timestamp")),
            model=_normalize_model(model),
            # The logs carry no token usage, so both counts are estimates.
            input_tokens=estimate_tokens(user_text),
            output_tokens=estimate_tokens(response_text),
            cwd=cwd,
            project=project_for(cwd),
            summary=summary,
            estimated=True,
        )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _workspace_cwd(ws_dir: Path) -> str:
    """Read the workspace folder from VS Code's ``workspace.json``.

    VS Code writes ``workspaceStorage/<hash>/workspace.json`` with a ``folder``
    key that is a URI such as:

    - ``file:///Users/yen/repo`` → ``/Users/yen/repo`` (most common)
    - ``file:///c%3A/Users/yen/repo`` → ``c:/Users/yen/repo`` (Windows drive)
    - ``vscode-remote://ssh-remote%2B<host>/home/yen/repo`` → ``/home/yen/repo``

    Both forms are reduced to their percent-decoded path portion so that
    ``project_for`` can derive a project name. The remote host is dropped.
    Any other value is returned unchanged.

    Returns:
        The path, or ``""`` when ``workspace.json`` is missing, unreadable,
        not a JSON object, or has no string ``folder`` entry.
    """
    from urllib.parse import unquote

    wj = ws_dir / "workspace.json"
    try:
        data = json.loads(wj.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
        return ""
    # Valid JSON is not necessarily an object, and ``folder`` may be null.
    if not isinstance(data, dict):
        return ""
    folder = data.get("folder")
    if not isinstance(folder, str):
        return ""

    if folder.startswith("file:///"):
        # file:///Users/yen/repo -> /Users/yen/repo
        path = unquote(folder[len("file://"):])
        # file:///c%3A/Users/... decodes to /c:/Users/...; drop the slash in
        # front of the drive letter so the result is a usable Windows path.
        if (
            len(path) >= 3
            and path[0] == "/"
            and path[1].isascii()
            and path[1].isalpha()
            and path[2] == ":"
        ):
            path = path[1:]
        return path

    if folder.startswith("vscode-remote://"):
        # vscode-remote://ssh-remote%2B<host>/path/to/repo
        rest = folder[len("vscode-remote://"):]
        slash = rest.find("/")
        if slash != -1:
            # Keep only the path that follows the authority part.
            return unquote(rest[slash:])
        return unquote(rest)

    return folder
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _load_objects(raw: str) -> list[Any]:
    """Parse a session file that may be a single JSON document or JSONL.

    The whole text is tried as one JSON document first. If that fails, each
    non-blank line is parsed on its own and lines that are not valid JSON are
    skipped.

    Returns:
        A one-element list for a single document, otherwise the decoded lines
        in file order (possibly empty).
    """
    try:
        return [json.loads(raw)]
    except json.JSONDecodeError:
        pass

    objects: list[Any] = []
    # Split on "\n" only. ``str.splitlines`` also breaks on characters such as
    # U+2028 that may legally appear unescaped inside a JSON string, which
    # would cut a record in two and silently drop it. ``strip`` below removes
    # the "\r" of CRLF line endings.
    for line in raw.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            objects.append(json.loads(line))
        except json.JSONDecodeError:
            continue
    return objects
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _message_text(message: Any) -> str:
    """Return the user text of a request's ``message`` field.

    A dict contributes its ``text`` entry, a plain string is returned as is,
    and anything else yields ``""``.
    """
    if isinstance(message, dict):
        text = message.get("text")
        # A JSON null must not become the literal string "None"; it would end
        # up in the summary and inflate the estimated token count.
        return "" if text is None else str(text)
    if isinstance(message, str):
        return message
    return ""
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _response_text(response: Any) -> str:
    """Flatten a Copilot ``response`` field into one string.

    A string is returned unchanged. A dict contributes its ``value`` (or,
    failing that, ``text``). A list is the concatenation of its string and
    dict parts, with other part types ignored. Anything else yields ``""``.
    """

    def _part_text(part):
        # Prefer ``value``, then ``text``; both missing or falsy gives "".
        return str(part.get("value") or part.get("text") or "")

    if isinstance(response, str):
        return response
    if isinstance(response, dict):
        return _part_text(response)
    if not isinstance(response, list):
        return ""
    return "".join(
        _part_text(part) if isinstance(part, dict) else part
        for part in response
        if isinstance(part, (dict, str))
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _normalize_model(model: str) -> str:
    """Drop everything up to the first ``/`` and replace ``.`` with ``-``."""
    # Copilot reports ids such as "copilot/claude-opus-4.6". Removing the
    # vendor prefix and using dashes lets the id match the pricing table
    # where possible.
    head, sep, tail = model.partition("/")
    bare = tail if sep else head
    return bare.replace(".", "-")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _parse_ts(raw: Any) -> datetime:
    """Convert a Copilot timestamp to a timezone-aware datetime.

    Numbers are treated as epoch milliseconds and converted to UTC. Strings
    are parsed as ISO-8601, with a trailing ``Z`` rewritten to ``+00:00``.

    Returns:
        The converted value, or the current UTC time when *raw* is missing,
        of another type, or cannot be converted.
    """
    # Copilot timestamps are epoch milliseconds.
    if isinstance(raw, (int, float)):
        try:
            return datetime.fromtimestamp(raw / 1000, tz=timezone.utc)
        except (ValueError, OverflowError, OSError):
            # OverflowError is what out-of-range and infinite values raise;
            # it is not a ValueError, so it has to be listed explicitly.
            pass
    if isinstance(raw, str):
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            pass
    return datetime.now(tz=timezone.utc)
|