coding-agent-roi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ """Per-model token pricing, used to turn token counts into USD cost.
2
+
3
+ Prices are USD per 1M tokens. This table is intentionally simple and easy to
4
+ edit; keep it current as providers change pricing. Unknown models fall back to a
5
+ zero price so usage is still tracked (cost just shows as 0).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+ from agent_roi.core.models import Interaction, ModelPricing
13
+
14
+
15
@dataclass(frozen=True)
class ModelPrice:
    """Immutable price sheet for a single model.

    Every rate is expressed in USD per 1,000,000 tokens. The two cache rates
    are optional and default to zero.
    """

    # Rate applied to an interaction's input tokens.
    input: float
    # Rate applied to an interaction's output tokens.
    output: float
    # Rate applied to cache-read tokens.
    cache_read: float = 0.0
    # Rate applied to cache-write tokens.
    cache_write: float = 0.0
21
+
22
+
23
# Pricing table: model name -> rates in USD per 1,000,000 tokens. Extend freely.
# Names that are not listed verbatim are resolved by their longest listed prefix,
# so a family-level key (e.g. "gemini") acts as a catch-all for that family.
PRICES: dict[str, ModelPrice] = {
    # --- Anthropic ---
    "claude-opus-4-8": ModelPrice(
        input=15.0, output=75.0, cache_read=1.5, cache_write=18.75
    ),
    "claude-sonnet-4-6": ModelPrice(
        input=3.0, output=15.0, cache_read=0.3, cache_write=3.75
    ),
    "claude-haiku-4-5": ModelPrice(
        input=0.8, output=4.0, cache_read=0.08, cache_write=1.0
    ),
    # --- OpenAI ---
    "gpt-4o": ModelPrice(input=2.5, output=10.0),
    "gpt-4o-mini": ModelPrice(input=0.15, output=0.6),
    # Codex normalizes "gpt-5.5" -> "gpt-5-5". Pricing is approximate; edit to
    # match your plan (see docs/configuration — pricing is user-verifiable).
    "gpt-5-5": ModelPrice(input=1.25, output=10.0, cache_read=0.125),
    "gpt-5": ModelPrice(input=1.25, output=10.0, cache_read=0.125),
    # --- Google ---
    # Gemini API list prices. Edit to match your plan; prefix-matched, so
    # "gemini-2.5-pro" / "gemini-3-flash-preview" resolve to the right tier.
    "gemini-3-pro": ModelPrice(input=2.0, output=12.0, cache_read=0.2),
    "gemini-3-flash": ModelPrice(input=0.3, output=2.5, cache_read=0.03),
    "gemini-2.5-pro": ModelPrice(input=1.25, output=10.0, cache_read=0.125),
    "gemini-2.5-flash": ModelPrice(input=0.3, output=2.5, cache_read=0.03),
    "gemini-2.0-flash": ModelPrice(input=0.1, output=0.4, cache_read=0.025),
    "gemini": ModelPrice(input=0.3, output=2.5, cache_read=0.03),
}

# Zero-rate placeholder handed out for models that match no table entry, so
# their usage is still tracked while their cost shows as 0.
_UNKNOWN = ModelPrice(input=0.0, output=0.0)
45
+
46
+
47
def price_for(model: str) -> ModelPrice:
    """Look up the price sheet for *model*.

    A verbatim table key wins. Otherwise the longest table key that is a
    prefix of *model* is used, and a model matching no key at all gets the
    zero-rate ``_UNKNOWN`` placeholder.
    """
    exact = PRICES.get(model)
    if exact is not None:
        return exact

    # Longest prefix wins so "gpt-4o-mini-..." picks "gpt-4o-mini", not "gpt-4o".
    best_prefix = max(
        (key for key in PRICES if model.startswith(key)),
        key=len,
        default=None,
    )
    return _UNKNOWN if best_prefix is None else PRICES[best_prefix]
55
+
56
+
57
def all_prices() -> list[ModelPricing]:
    """List every entry of the pricing table, ordered by model name.

    Exposed so users can check that cost = usage x price.
    """
    rows: list[ModelPricing] = []
    for model_name in sorted(PRICES):
        price = PRICES[model_name]
        rows.append(
            ModelPricing(
                model=model_name,
                input=price.input,
                output=price.output,
                cache_read=price.cache_read,
                cache_write=price.cache_write,
            )
        )
    return rows
69
+
70
+
71
def cost_of(interaction: Interaction) -> float:
    """Return the USD cost of one interaction.

    Each of the four token counts is multiplied by its per-million rate for the
    interaction's model, and the total is scaled down from per-million to
    per-token.
    """
    rates = price_for(interaction.model)
    # Accumulate strictly left to right so the floating-point result is stable.
    per_million = interaction.input_tokens * rates.input
    per_million = per_million + interaction.output_tokens * rates.output
    per_million = per_million + interaction.cache_read_tokens * rates.cache_read
    per_million = per_million + interaction.cache_write_tokens * rates.cache_write
    return per_million / 1_000_000
@@ -0,0 +1,52 @@
1
+ """Derive a coarse 'project' label from a working directory.
2
+
3
+ This is only a *grouping hint* (the semantic topic still comes from the
4
+ classifier). We map a cwd to the nearest meaningful root so that subfolders of
5
+ one repo (``/repo`` and ``/repo/web``) collapse to the same project. Because
6
+ collectors run over historical logs, the original directory may no longer exist,
7
+ so we can't always stat ``.git`` — we fall back to a path heuristic.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+
14
# Workspace parents whose immediate child is the actual project (e.g. the folder
# under ~/Desktop or ~/projects is the project, not Desktop itself). Compared
# case-insensitively against individual path segments.
_WORKSPACE_PARENTS = {"desktop", "documents", "projects", "code", "src", "repos", "dev", "work"}


def project_for(cwd: str) -> str:
    """Return a short project label for a working directory.

    Resolution order:

    1. The name of the nearest directory (``cwd`` or an ancestor) that contains
       a ``.git`` entry on disk. Only attempted for absolute paths.
    2. The first path segment that directly follows a workspace container
       (see ``_WORKSPACE_PARENTS``) and is not itself a container name.
    3. The last path segment.

    Empty or root cwds yield ``"unknown"``.
    """
    if not cwd or cwd in ("/", "."):
        return "unknown"

    path = Path(cwd)

    # If the directory still exists, prefer a real git root. A root whose name
    # is empty (the filesystem root) is not a usable label, so fall through.
    git_root = _git_root(path)
    if git_root is not None and git_root.name:
        return git_root.name

    # Otherwise: find a workspace container and take its child as the project.
    # A child that is itself a container name is skipped, so a home directory
    # called "dev" or "work" (``/Users/dev/projects/foo``) does not make the
    # container "projects" look like the project.
    parts = [p for p in path.parts if p not in ("/", "")]
    for container, child in zip(parts, parts[1:]):
        if (
            container.lower() in _WORKSPACE_PARENTS
            and child.lower() not in _WORKSPACE_PARENTS
        ):
            return child

    # Fall back to the last path segment.
    return path.name or "unknown"


def _git_root(path: Path) -> Path | None:
    """Return the closest of ``path`` and its ancestors that holds ``.git``.

    Returns ``None`` when no candidate has a ``.git`` entry, or when ``path``
    is relative: a relative path would be probed against the current process's
    working directory, which says nothing about the logged directory.
    """
    if not path.is_absolute():
        return None
    for candidate in (path, *path.parents):
        try:
            if (candidate / ".git").exists():
                return candidate
        except OSError:
            # An unreadable level must not hide a git root further up.
            continue
    return None
@@ -0,0 +1,172 @@
1
+ """High-level orchestration used by both the CLI and the API.
2
+
3
+ Keeps the wiring of collectors -> storage -> classifier in one place so the CLI
4
+ and REST layers stay thin.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import datetime
10
+
11
+ from agent_roi.classify import SessionDoc, get_classifier
12
+ from agent_roi.classify.base import UNCATEGORIZED
13
+ from agent_roi.collectors import get_collectors
14
+ from agent_roi.core.config import Config
15
+ from agent_roi.core.models import (
16
+ CollectorStatus,
17
+ ModelPricing,
18
+ Rollup,
19
+ SessionDetail,
20
+ SessionSummary,
21
+ TimeSeriesBundle,
22
+ TopicBreakdown,
23
+ )
24
+ from agent_roi.core.pricing import all_prices
25
+ from agent_roi.storage import Database
26
+
27
+
28
class Service:
    """Facade over collectors, storage and the classifier.

    Holds the loaded configuration (``self.config``) and the database handle
    (``self.db``); most methods are thin pass-throughs to the database so the
    CLI and REST layers do not have to wire these pieces themselves.
    """

    def __init__(self, config: Config | None = None) -> None:
        """Use ``config`` when given, else ``Config.load()``, and open the DB."""
        self.config = config or Config.load()
        self.db = Database(self.config.db_path)

    def ingest(self) -> int:
        """Collect interactions from all enabled tools and store them.

        Collectors that report themselves unavailable are skipped.

        Returns the number of interactions processed.
        """
        total = 0
        for collector in get_collectors(self.config.collectors.enabled):
            if collector.is_available():
                total += self.db.upsert_many(collector.collect())
        return total

    def classify(self, limit: int | None = None, reclassify: bool = True) -> int:
        """Group whole sessions into topics and apply them.

        A session is one continuous piece of work, so we classify sessions as a
        unit (not each interaction) and apply the discovered topic to all of a
        session's rows. The classifier looks at all sessions together so it can
        group the ones about the same kind of work — e.g. several sessions across
        different repos that are all "auth refactor" — into one topic.

        With ``reclassify`` (the default) every session is re-labeled, which keeps
        the clustering globally consistent. Set it to False to only label sessions
        that have no topic yet.

        Existing topics are cleared only after the classifier has produced its
        labels, so a failure while building or running the classifier leaves
        the stored topics untouched.

        Returns the number of interactions newly classified.
        """
        if reclassify:
            sessions = self.db.all_sessions(limit=limit)
        else:
            sessions = self.db.unclassified_sessions(limit=limit)

        if not sessions:
            # Nothing to label; a reclassify run still resets stored topics.
            if reclassify:
                self.db.clear_topics()
            return 0

        classifier = get_classifier(self.config.classifier)
        docs = [
            SessionDoc(session_id=s.session_id, project=s.project, summary=s.summary)
            for s in sessions
        ]
        labels = classifier.label_sessions(docs)

        # Wipe old topics only now that replacements are in hand.
        # NOTE(review): clear_topics() takes no limit — presumably it clears every
        # session, so combining ``reclassify`` with ``limit`` may leave sessions
        # outside the limit without a topic. Confirm against the storage layer.
        if reclassify:
            self.db.clear_topics()

        updated = 0
        for sess in sessions:
            # Sessions the classifier returned no label for are filed as uncategorized.
            topic = labels.get(sess.session_id, UNCATEGORIZED)
            updated += self.db.set_session_topic(sess.session_id, topic)
        return updated

    def refresh(self) -> dict[str, int]:
        """Ingest fresh logs and re-classify everything in one step.

        This is the one-button flow for the dashboard: pull new interactions from
        every tool, then rebuild topics across the whole corpus.

        Returns a dict with the ``"ingested"`` and ``"classified"`` counts.
        """
        ingested = self.ingest()
        classified = self.classify()
        return {"ingested": ingested, "classified": classified}

    def report(
        self,
        dimension: str = "topic",
        start: datetime | None = None,
        end: datetime | None = None,
    ) -> list[Rollup]:
        """Aggregate usage/cost by 'topic', 'tool', or 'model' over a window."""
        return self.db.rollup(dimension, start=start, end=end)

    def topic_breakdown(
        self,
        topic: str,
        start: datetime | None = None,
        end: datetime | None = None,
    ) -> TopicBreakdown:
        """Drill into one topic: how its tokens split across tools and models."""
        return self.db.topic_breakdown(topic, start=start, end=end)

    def sessions(
        self,
        topic: str | None = None,
        start: datetime | None = None,
        end: datetime | None = None,
        limit: int | None = None,
    ) -> list[SessionSummary]:
        """Per-session breakdown, optionally scoped to one topic and window."""
        return self.db.sessions(topic=topic, start=start, end=end, limit=limit)

    def session_detail(self, session_id: str) -> SessionDetail | None:
        """One session's aggregate plus its individual interactions."""
        return self.db.session_detail(session_id)

    def timeseries(
        self,
        start: datetime | None = None,
        end: datetime | None = None,
        granularity: str = "day",
    ) -> TimeSeriesBundle:
        """Usage trends for charts (day / week / month buckets)."""
        return self.db.timeseries(start=start, end=end, granularity=granularity)

    def pricing(self) -> list[ModelPricing]:
        """The pricing table behind every cost figure (for verification)."""
        return all_prices()

    def sources(self) -> list[CollectorStatus]:
        """Diagnostics for every enabled collector: where it looked, what it
        found on disk, and how much is already in the database.

        This is what makes detection transparent — users can see exactly why a
        tool shows up (or doesn't) instead of guessing.
        """
        # Index the per-tool rollup once so each collector is a dict lookup.
        by_tool = {r.key: r for r in self.db.rollup("tool")}
        statuses: list[CollectorStatus] = []
        for collector in get_collectors(self.config.collectors.enabled):
            available = collector.is_available()
            files = collector.count_files()
            roll = by_tool.get(collector.tool.value)
            interactions = roll.interactions if roll else 0

            # The collector's own note wins; otherwise explain the common cases.
            note = collector.note()
            if not note:
                if not available:
                    note = "No logs found on this machine."
                elif files and interactions == 0:
                    note = "Logs found but not ingested yet — run a refresh."

            statuses.append(
                CollectorStatus(
                    name=collector.name,
                    tool=collector.tool.value,
                    available=available,
                    search_paths=[str(p) for p in collector.search_paths()],
                    log_files=files,
                    interactions=interactions,
                    tokens=roll.total_tokens if roll else 0,
                    cost_usd=roll.cost_usd if roll else 0.0,
                    note=note,
                )
            )
        return statuses
@@ -0,0 +1,76 @@
1
+ """Parse user-supplied time-window strings into datetimes.
2
+
3
+ Accepts:
4
+ - ISO dates: ``2026-05-01``
5
+ - ISO datetimes: ``2026-05-01T12:00``
6
+ - Shorthands: ``today``, ``7d`` (last 7 days), ``24h`` (last 24 hours),
7
+ ``30m`` (last 30 minutes), ``8w`` (last 8 weeks).
8
+
9
+ Returns ``None`` for an empty string (meaning "no lower bound").
10
+
11
+ ``parse_until`` is the upper bound (exclusive): an ISO date includes that whole
12
+ calendar day; ``today`` means through end of today.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from datetime import date, datetime, timedelta, timezone
19
+
20
# "<digits><unit>" with optional whitespace in between, e.g. "7d", "24 h", "8W".
_SHORTHAND = re.compile(r"^(\d+)\s*([mhdw])$", re.IGNORECASE)
# Lower-cased unit letter -> factory building the matching timedelta.
_UNIT_TO_DELTA = {
    "m": lambda n: timedelta(minutes=n),
    "h": lambda n: timedelta(hours=n),
    "d": lambda n: timedelta(days=n),
    "w": lambda n: timedelta(weeks=n),
}


def parse_since(value: str, *, now: datetime | None = None) -> datetime | None:
    """Parse a window-start string.

    Args:
        value: ``today``, a shorthand such as ``7d`` / ``24h`` / ``30m`` /
            ``8w``, or an ISO date / datetime. Surrounding whitespace is
            ignored.
        now: Reference instant for ``today`` and the shorthands. Defaults to
            the current time in UTC.

    Returns:
        The start of the window, or ``None`` for an empty string (no lower
        bound). An ISO value without an explicit offset takes the timezone of
        ``now``, so every result is comparable with the ``today`` / shorthand
        results instead of mixing naive and aware datetimes.

    Raises:
        ValueError: If ``value`` matches none of the accepted forms, or a
            shorthand reaches further back than a datetime can represent.
    """
    value = value.strip()
    if not value:
        return None
    now = now or datetime.now(tz=timezone.utc)
    error_text = f"Could not parse time '{value}'. Use a date (YYYY-MM-DD) or 7d/24h/today."

    if value.lower() == "today":
        return now.replace(hour=0, minute=0, second=0, microsecond=0)

    match = _SHORTHAND.match(value)
    if match:
        amount = int(match.group(1))
        unit = match.group(2).lower()
        try:
            return now - _UNIT_TO_DELTA[unit](amount)
        except OverflowError as exc:
            # Absurdly large amounts overflow timedelta/datetime; report them
            # as bad input, matching the documented contract.
            raise ValueError(error_text) from exc

    # Fall back to ISO date / datetime.
    try:
        parsed = datetime.fromisoformat(value)
    except ValueError as exc:
        raise ValueError(error_text) from exc

    # Interpret offset-less input in the same timezone as ``now``.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=now.tzinfo)
    return parsed
52
+
53
+
54
def parse_until(value: str, *, now: datetime | None = None) -> datetime | None:
    """Parse a window-end string (exclusive upper bound).

    Args:
        value: ``today``, an ISO date, or an ISO datetime. Surrounding
            whitespace is ignored.
        now: Reference instant for ``today``. Defaults to the current time in
            UTC.

    Returns:
        ``None`` for an empty string (no upper bound). ``today`` yields the
        start of tomorrow; a bare ``YYYY-MM-DD`` yields the start of the next
        day so the whole calendar day is included; an ISO datetime is returned
        as given. A value without an explicit offset takes the timezone of
        ``now``, so results never mix naive and aware datetimes.

    Raises:
        ValueError: If ``value`` cannot be parsed, or the day after the given
            date is not representable.
    """
    value = value.strip()
    if not value:
        return None
    now = now or datetime.now(tz=timezone.utc)

    if value.lower() == "today":
        start_today = now.replace(hour=0, minute=0, second=0, microsecond=0)
        return start_today + timedelta(days=1)

    error_text = f"Could not parse end time '{value}'. Use a date (YYYY-MM-DD) or today."
    try:
        parsed = datetime.fromisoformat(value)
    except ValueError as exc:
        raise ValueError(error_text) from exc

    # Bare YYYY-MM-DD → include the full calendar day.
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", value):
        d = date.fromisoformat(value)
        try:
            return datetime(d.year, d.month, d.day, tzinfo=now.tzinfo) + timedelta(days=1)
        except OverflowError as exc:
            # Only reachable for the last representable date (9999-12-31).
            raise ValueError(error_text) from exc

    # Interpret offset-less input in the same timezone as ``now``.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=now.tzinfo)
    return parsed
@@ -0,0 +1,30 @@
1
+ """Token estimation for tools that don't report real usage.
2
+
3
+ Some tools (notably GitHub Copilot) log the conversation text but not the token
4
+ counts. For those, we estimate counts so cost can still be *approximated* and
5
+ compared across tools. Estimated interactions are flagged
6
+ (``Interaction.estimated = True``) so reports never present them as exact.
7
+
8
+ The estimator is a dependency-free heuristic so it works fully offline (this is
9
+ a local-first tool). It blends a character-based and word-based estimate, which
10
+ tracks real BPE token counts closely enough for reporting — typically within
11
+ ~10-15% for mixed English/code text. We deliberately avoid pulling a tokenizer
12
+ that downloads model files at runtime.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
# Two rules of thumb for English + code: roughly 4 characters per token, and
# roughly 1.3 tokens per whitespace-delimited word. The estimator averages both,
# which is more robust than either alone across prose, code, and JSON-heavy text.
_CHARS_PER_TOKEN = 4.0
_TOKENS_PER_WORD = 1.3


def estimate_tokens(text: str) -> int:
    """Approximate how many tokens *text* holds, without any tokenizer.

    Returns 0 for empty text; any non-empty text counts as at least one token.
    """
    if text:
        from_chars = len(text) / _CHARS_PER_TOKEN
        from_words = len(text.split()) * _TOKENS_PER_WORD
        blended = round((from_chars + from_words) / 2)
        return blended if blended > 1 else 1
    return 0
@@ -0,0 +1,5 @@
1
+ """Storage layer."""
2
+
3
+ from agent_roi.storage.db import Database
4
+
5
+ __all__ = ["Database"]