PyPI - coding-agent-roi - Versions diffs - 0.1.0__py3-none-any.whl - Mend

coding-agent-roi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

agent_roi/__init__.py +3 -0
agent_roi/api/__init__.py +1 -0
agent_roi/api/app.py +179 -0
agent_roi/classify/__init__.py +26 -0
agent_roi/classify/base.py +44 -0
agent_roi/classify/semantic.py +197 -0
agent_roi/cli/__init__.py +1 -0
agent_roi/cli/main.py +200 -0
agent_roi/collectors/__init__.py +31 -0
agent_roi/collectors/base.py +49 -0
agent_roi/collectors/claude_code.py +165 -0
agent_roi/collectors/codex.py +157 -0
agent_roi/collectors/copilot.py +210 -0
agent_roi/collectors/gemini.py +220 -0
agent_roi/core/__init__.py +1 -0
agent_roi/core/config.py +58 -0
agent_roi/core/models.py +241 -0
agent_roi/core/platform.py +113 -0
agent_roi/core/pricing.py +79 -0
agent_roi/core/project.py +52 -0
agent_roi/core/service.py +172 -0
agent_roi/core/timeframe.py +76 -0
agent_roi/core/tokens.py +30 -0
agent_roi/storage/__init__.py +5 -0
agent_roi/storage/db.py +542 -0
coding_agent_roi-0.1.0.dist-info/METADATA +163 -0
coding_agent_roi-0.1.0.dist-info/RECORD +30 -0
coding_agent_roi-0.1.0.dist-info/WHEEL +4 -0
coding_agent_roi-0.1.0.dist-info/entry_points.txt +2 -0
coding_agent_roi-0.1.0.dist-info/licenses/LICENSE +21 -0

agent_roi/collectors/gemini.py ADDED Viewed

@@ -0,0 +1,220 @@
+"""Collector for the Gemini CLI.
+The Gemini CLI keeps one chat log per session under a per-project temp tree::
+    ~/.gemini/tmp/<projectHash>/chats/session-<timestamp>-<id>.json
+    ~/.gemini/tmp/<projectHash>/chats/session-<timestamp>-<id>.jsonl   # newer
+``<projectHash>`` is ``sha256(cwd)``; the real working directory is written
+verbatim next to the chats in ``~/.gemini/tmp/<projectHash>/.project_root`` (or
+the parent of ``chats/``), which we read to recover a meaningful ``project``.
+Both file shapes carry the same per-message structure — the difference is only
+the container:
+- ``.json``  — a single object ``{"sessionId", "projectHash", "messages": [...]}``
+- ``.jsonl`` — one record per line; a ``kind: "main"`` header line, then one
+  record per message (lines like ``{"$set": ...}`` are state deltas we skip).
+Either way, ``type == "gemini"`` messages carry **real** token usage in a
+``tokens`` block (``input``/``output``/``cached``/``thoughts``/``total``) plus the
+``model``, so Gemini interactions are exact, not estimated. ``thoughts``
+(reasoning) tokens are folded into output, and ``cached`` maps to cache reads.
+Files are read read-only and never modified.
+"""
+from __future__ import annotations
+import hashlib
+import json
+from collections.abc import Iterator
+from datetime import datetime
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+from agent_roi.collectors.base import Collector
+from agent_roi.core.models import Interaction, Tool
+from agent_roi.core.platform import find_tool_dirs
+from agent_roi.core.project import project_for
# Maximum number of characters kept in an interaction's ``summary``; applied
# by ``_combine`` after joining the user and assistant text.
_SUMMARY_MAX = 600
class GeminiCollector(Collector):
    """Collector that turns Gemini CLI chat logs into ``Interaction`` records.

    Each root is a ``~/.gemini/tmp`` directory; chat logs live under
    ``<projectHash>/chats/``. Only ``type == "gemini"`` messages that carry a
    ``tokens`` block produce an interaction. Files are only ever read.
    """

    tool = Tool.GEMINI
    name = "gemini"

    def __init__(self, roots: list[Path] | None = None) -> None:
        # Each root is a ``~/.gemini/tmp`` dir; chat logs live under
        # ``<projectHash>/chats/``. Multiple roots support WSL reading the
        # Windows-side profile too.
        self.roots = roots if roots is not None else find_tool_dirs(".gemini", "tmp")

    def is_available(self) -> bool:
        """Return True when at least one chat log exists under any root."""
        # ``_chat_files`` is a generator function, so the object it returns is
        # always truthy even when it would yield nothing. We have to actually
        # pull an item from it; ``any`` stops at the first file found.
        return any(True for root in self.roots for _ in self._chat_files(root))

    def search_paths(self) -> list[Path]:
        """Return a copy of the roots this collector searches."""
        return list(self.roots)

    def count_files(self) -> int:
        """Return the number of chat log files found across all roots."""
        return sum(1 for root in self.roots for _ in self._chat_files(root))

    def collect(self) -> Iterator[Interaction]:
        """Yield every interaction parsed from every chat log."""
        for root in self.roots:
            for chat in self._chat_files(root):
                yield from self._parse_file(chat)

    @staticmethod
    def _chat_files(root: Path) -> Iterator[Path]:
        """Yield the chat logs under *root* in both container formats."""
        # ``<projectHash>/chats/session-*.json{,l}`` — glob both extensions.
        yield from root.glob("*/chats/session-*.json")
        yield from root.glob("*/chats/session-*.jsonl")

    @staticmethod
    def _token_count(tokens: dict[str, Any], key: str) -> int:
        """Return ``tokens[key]`` as an int, or 0 when missing or malformed.

        A single bad value (``null``, a list, ``NaN``...) must not abort the
        whole collection run, so anything ``int()`` rejects counts as zero.
        """
        try:
            return int(tokens.get(key) or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    def _parse_file(self, path: Path) -> Iterator[Interaction]:
        """Yield one ``Interaction`` per token-bearing Gemini message in *path*.

        Unreadable or undecodable files are skipped silently: collection is
        best-effort and one corrupt log should not stop the others.
        """
        try:
            raw = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # UnicodeDecodeError is a ValueError, not an OSError, so it has to
            # be named explicitly to keep a non-UTF-8 file from crashing us.
            return
        cwd = _project_root_for(path)
        project = project_for(cwd)
        session_id = ""
        last_user = ""
        seq = 0
        for record in _records(raw):
            if not isinstance(record, dict):
                continue
            # Header / metadata: remember the most recent session id seen.
            sid = record.get("sessionId")
            if isinstance(sid, str) and sid:
                session_id = sid
            rtype = record.get("type")
            if rtype == "user":
                # Keep the latest prompt so it can be paired with the reply.
                text = _content_text(record.get("content"))
                if text:
                    last_user = text
                continue
            if rtype != "gemini":
                continue
            tokens = record.get("tokens")
            if not isinstance(tokens, dict):
                continue
            seq += 1
            # Fall back to the file name when no session id was recorded.
            sess = session_id or path.stem
            assistant_text = _content_text(record.get("content"))
            yield Interaction(
                id=f"gemini:{sess}:{seq}",
                tool=self.tool,
                session_id=sess,
                timestamp=_parse_ts(record.get("timestamp")),
                model=str(record.get("model") or "gemini"),
                input_tokens=self._token_count(tokens, "input"),
                # Reasoning ("thoughts") tokens are billed like output.
                output_tokens=(
                    self._token_count(tokens, "output")
                    + self._token_count(tokens, "thoughts")
                ),
                cache_read_tokens=self._token_count(tokens, "cached"),
                cwd=cwd,
                project=project,
                summary=_combine(last_user, assistant_text),
            )
def _records(raw: str) -> Iterator[Any]:
    """Yield message records from either container shape.

    A ``.json`` file is one object with a ``messages`` list; a ``.jsonl`` file
    is one record per line. For the single-object shape the top-level object
    is yielded first so callers can read ``sessionId`` from the header.

    The ``"messages"`` substring test is only a hint: a ``.jsonl`` file can
    contain that text too. If the text does not parse as one JSON document we
    therefore fall back to line-by-line parsing instead of dropping the file.
    Lines that are blank or not valid JSON are skipped.
    """
    stripped = raw.lstrip()
    if stripped.startswith("{") and '"messages"' in raw:
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            obj = None  # not a single document; try the per-line shape below
        if isinstance(obj, dict):
            yield obj  # header (sessionId, projectHash, ...)
            messages = obj.get("messages")
            # ``messages`` may be null or malformed; only iterate a real list.
            if isinstance(messages, list):
                yield from messages
            return
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            yield json.loads(line)
        except json.JSONDecodeError:
            continue
def _project_root_for(chat_file: Path) -> str:
    """Recover the real cwd for a chat file.

    Layout is ``<root>/<projectHash>/chats/<file>``. We try, in order:

    1. the ``.project_root`` marker in the ``<projectHash>`` dir (authoritative,
       written by newer Gemini CLI), then
    2. a reverse lookup of ``<projectHash>`` (which is ``sha256(cwd)``) against
       the cwds Gemini recorded in ``~/.gemini/projects.json`` — this recovers a
       real path for older sessions that predate the marker file.

    Returns an empty string when neither source yields a path.
    """
    project_dir = chat_file.parent.parent  # .../<projectHash>
    marker = project_dir / ".project_root"
    try:
        text = marker.read_text(encoding="utf-8").strip()
    except (OSError, UnicodeDecodeError):
        # A missing, unreadable or non-UTF-8 marker is not fatal; fall through
        # to the hash lookup rather than aborting the whole collection.
        text = ""
    if text:
        return text
    gemini_home = project_dir.parent.parent  # .../.gemini
    return _hash_to_cwd(gemini_home).get(project_dir.name, "")
@lru_cache(maxsize=8)
def _hash_to_cwd(gemini_home: Path) -> dict[str, str]:
    """Map ``sha256(cwd) -> cwd`` for every project in ``projects.json``.

    Reads ``<gemini_home>/projects.json`` and hashes each key of its
    ``projects`` object. Returns an empty mapping when the file is missing,
    unreadable, not valid UTF-8 JSON, or not shaped as expected. Results are
    cached per ``gemini_home``, so callers should treat the dict as read-only.
    """
    try:
        data = json.loads((gemini_home / "projects.json").read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}
    projects = data.get("projects") if isinstance(data, dict) else None
    if not isinstance(projects, dict):
        return {}
    return {hashlib.sha256(cwd.encode()).hexdigest(): cwd for cwd in projects}
def _content_text(content: object) -> str:
    """Flatten a Gemini message ``content`` value into plain text.

    A string is returned stripped. A list contributes each bare string item
    and each dict item's string ``text`` value; non-empty pieces are joined
    with single spaces and the result is stripped. Anything else gives ``""``.
    """
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return ""

    def _piece(block: object) -> str:
        # Empty string means "nothing usable in this block".
        if isinstance(block, str):
            return block
        if isinstance(block, dict):
            candidate = block.get("text")
            if isinstance(candidate, str):
                return candidate
        return ""

    return " ".join(filter(None, map(_piece, content))).strip()
def _combine(user_text: str, assistant_text: str) -> str:
    """Join the non-empty texts with a space, capped at ``_SUMMARY_MAX`` chars."""
    joined = " ".join(filter(None, (user_text, assistant_text)))
    return joined[:_SUMMARY_MAX]
def _parse_ts(raw: object) -> datetime:
    """Parse an ISO-8601 timestamp, falling back to the current time.

    A trailing ``Z`` is rewritten to ``+00:00`` so ``fromisoformat`` accepts
    it, which makes the parsed value timezone-aware. When *raw* is not a
    string or cannot be parsed, the current time is returned as an aware UTC
    datetime, so the fallback can be compared and sorted with parsed ``...Z``
    values (a naive fallback would raise ``TypeError`` on comparison).
    """
    # Local import: this module only imports ``datetime`` at file level.
    from datetime import timezone

    if isinstance(raw, str):
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            pass
    return datetime.now(timezone.utc)

agent_roi/core/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Core domain: models, config, pricing, and the orchestration service."""

agent_roi/core/config.py ADDED Viewed

@@ -0,0 +1,58 @@
+"""Configuration loading for Agent-ROI.
+Config is read from ``~/.config/agent-roi/config.toml`` (override with the
+``AGENT_ROI_CONFIG`` env var). Every field has a sensible default so the tool
+works out of the box with zero configuration.
+"""
+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
+from platformdirs import user_config_dir, user_data_dir
+from pydantic import BaseModel
+if sys.version_info >= (3, 11):
+    import tomllib
+else:  # pragma: no cover
+    import tomli as tomllib  # type: ignore
# Application name passed to platformdirs to derive the per-user config and
# data directories.
APP_NAME = "agent-roi"
class ClassifierConfig(BaseModel):
    """Settings for the topic classifier; every field has a default."""
    # Only "semantic" exists: model-free, offline topic discovery.
    provider: str = "semantic"
    # Cosine similarity at/above which two sessions are grouped into the same
    # topic. Higher = stricter (more, smaller topics); lower = broader topics.
    similarity_threshold: float = 0.18
    # Number of distinctive terms used to name each discovered topic.
    label_terms: int = 3
class CollectorsConfig(BaseModel):
    """Which tool collectors are enabled."""
    # Collector names enabled by default.
    # NOTE(review): presumably matched against each collector's ``name`` —
    # confirm against the collector registry.
    enabled: list[str] = ["claude_code", "codex", "copilot", "gemini"]
class Config(BaseModel):
    """Top-level Agent-ROI configuration.

    Every field has a default, so a missing config file simply yields the
    defaults.
    """

    classifier: ClassifierConfig = ClassifierConfig()
    collectors: CollectorsConfig = CollectorsConfig()
    # Database file, placed in the per-user data directory by default.
    db_path: Path = Path(user_data_dir(APP_NAME)) / "agent_roi.db"

    @classmethod
    def load(cls) -> Config:
        """Build the configuration from the TOML config file, if present.

        Returns the all-defaults configuration when the file does not exist;
        otherwise the parsed TOML is validated into a ``Config``.
        """
        config_file = _config_path()
        if config_file.exists():
            # tomllib requires a binary file handle.
            with config_file.open("rb") as handle:
                parsed = tomllib.load(handle)
            return cls.model_validate(parsed)
        return cls()
def _config_path() -> Path:
    """Return the config file location.

    A non-empty ``AGENT_ROI_CONFIG`` environment variable wins; otherwise the
    file is ``config.toml`` inside the per-user config directory.
    """
    explicit = os.environ.get("AGENT_ROI_CONFIG")
    if not explicit:
        return Path(user_config_dir(APP_NAME)) / "config.toml"
    return Path(explicit)

agent_roi/core/models.py ADDED Viewed

@@ -0,0 +1,241 @@
+"""Core domain models shared across collectors, classifier, and storage."""
+from __future__ import annotations
+from datetime import datetime
+from enum import Enum
+from pydantic import BaseModel, Field
class Tool(str, Enum):
    """Supported AI coding tools.

    Mixes in ``str``, so each member compares equal to its plain string value.
    """
    CLAUDE_CODE = "claude_code"
    CODEX = "codex"
    COPILOT = "copilot"
    GEMINI = "gemini"
    CURSOR = "cursor"
    UNKNOWN = "unknown"
class Interaction(BaseModel):
    """One normalized request/response turn parsed from a tool's logs.

    Collectors translate each tool's native log format into this shape; it is
    the canonical unit they produce and the database stores.
    """

    id: str = Field(..., description="Stable unique id (usually tool's own message id).")
    tool: Tool
    session_id: str = Field(..., description="Groups interactions from one agent session.")
    timestamp: datetime
    model: str = Field(..., description="Model name reported by the tool, e.g. 'claude-opus-4-8'.")
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0
    # Directory the agent was running in, if the tool logs it. Feeds
    # ``project`` and serves as a hint for topic classification.
    cwd: str = ""
    # Coarse grouping derived from ``cwd`` (git root / folder name). This is
    # not the final topic: the classifier assigns a semantic ``topic`` too.
    project: str = ""
    # Short free text the classifier reads to derive a topic; it never holds
    # full prompt bodies. Classification is local and offline, so this text
    # stays on the machine.
    summary: str = ""
    # Set by the classifier; stays None until classification has run.
    topic: str | None = None
    # True when the token counts are estimates (e.g. from a tokenizer) rather
    # than figures reported by the tool. Copilot exposes no real usage, so its
    # interactions carry this flag and reports show them as "estimated".
    estimated: bool = False

    @property
    def total_tokens(self) -> int:
        """Sum of the input, output, cache-read and cache-write counts."""
        return sum(
            (
                self.input_tokens,
                self.output_tokens,
                self.cache_read_tokens,
                self.cache_write_tokens,
            )
        )
class Rollup(BaseModel):
    """Token usage and cost aggregated for one value of a grouping dimension.

    ``key`` holds the dimension value: a topic, a tool, or a model, depending
    on how the rollup was requested. ``estimated`` is true when *any*
    interaction in the group has estimated rather than tool-reported tokens,
    letting the UI mark the figure as approximate.
    """

    key: str
    interactions: int
    input_tokens: int
    output_tokens: int
    cache_read_tokens: int
    cache_write_tokens: int
    cost_usd: float
    estimated: bool = False

    @property
    def total_tokens(self) -> int:
        """Sum of the input, output, cache-read and cache-write counts."""
        return sum(
            (
                self.input_tokens,
                self.output_tokens,
                self.cache_read_tokens,
                self.cache_write_tokens,
            )
        )
class TopicBreakdown(BaseModel):
    """A topic's total, plus how it splits across tools and models.
    This is what lets a user answer "this topic's tokens came from which tools,
    at what price?" — the drill-down behind a single topic row.
    """
    topic: str
    # Aggregate for the whole topic.
    total: Rollup
    # The same usage split per tool and per model respectively.
    by_tool: list[Rollup]
    by_model: list[Rollup]
class ModelPricing(BaseModel):
    """Per-model unit prices (USD per 1M tokens), exposed so users can verify
    that cost = usage x these numbers."""
    model: str
    # One price per token category.
    input: float
    output: float
    cache_read: float
    cache_write: float
class SessionSummary(BaseModel):
    """Aggregate for one agent session, the unit topics are built from.

    Topics group many sessions and sessions group many interactions, which
    makes this the middle layer of the topic -> session -> interaction
    drill-down.
    """

    session_id: str
    topic: str
    project: str
    tools: list[str]
    models: list[str]
    started: datetime
    ended: datetime
    interactions: int
    input_tokens: int
    output_tokens: int
    cache_read_tokens: int
    cache_write_tokens: int
    cost_usd: float
    estimated: bool = False

    @property
    def total_tokens(self) -> int:
        """Sum of the input, output, cache-read and cache-write counts."""
        return sum(
            (
                self.input_tokens,
                self.output_tokens,
                self.cache_read_tokens,
                self.cache_write_tokens,
            )
        )
class InteractionView(BaseModel):
    """One interaction as presented when a user drills into a session."""

    id: str
    tool: str
    model: str
    timestamp: datetime
    input_tokens: int
    output_tokens: int
    cache_read_tokens: int
    cache_write_tokens: int
    cost_usd: float
    estimated: bool
    summary: str

    @property
    def total_tokens(self) -> int:
        """Sum of the input, output, cache-read and cache-write counts."""
        return sum(
            (
                self.input_tokens,
                self.output_tokens,
                self.cache_read_tokens,
                self.cache_write_tokens,
            )
        )
class SessionDetail(BaseModel):
    """A session's aggregate plus the interactions (conversation turns) in it."""
    # Aggregate figures for the session.
    session: SessionSummary
    # The individual turns that make up the session.
    interactions: list[InteractionView]
class TimeSeriesPoint(BaseModel):
    """One day's usage bucket, used for trend charts."""

    date: str
    interactions: int
    input_tokens: int
    output_tokens: int
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0
    cost_usd: float

    @property
    def total_tokens(self) -> int:
        """Sum of the input, output, cache-read and cache-write counts."""
        return sum(
            (
                self.input_tokens,
                self.output_tokens,
                self.cache_read_tokens,
                self.cache_write_tokens,
            )
        )
class TimeSeriesSplitRow(BaseModel):
    """One day of token usage split across a dimension (tool, model, …)."""
    date: str
    # Token count per dimension value.
    # NOTE(review): keys presumably match ``tool_keys`` / ``model_keys`` of
    # the enclosing bundle — confirm against the producer.
    values: dict[str, int]
    cost_usd: float
    interactions: int
class TimeSeriesBundle(BaseModel):
    """Everything the trends dashboard needs in one round trip."""
    # Daily totals.
    totals: list[TimeSeriesPoint]
    # Daily rows split per tool and per model respectively.
    by_tool: list[TimeSeriesSplitRow]
    by_model: list[TimeSeriesSplitRow]
    # NOTE(review): presumably the dimension values that appear as keys in the
    # split rows' ``values`` — confirm against the producer.
    tool_keys: list[str]
    model_keys: list[str]
class CollectorStatus(BaseModel):
    """Diagnostics for one tool collector: where it looked and what it found.
    This powers the `doctor` command and the dashboard's "data sources" panel so
    users can see *why* a tool was or wasn't picked up, instead of guessing.
    """
    name: str
    tool: str
    available: bool
    # Directories the collector searched, as strings.
    search_paths: list[str]
    # Number of log files found.
    log_files: int
    # Usage figures default to zero / empty when nothing was collected.
    interactions: int = 0
    tokens: int = 0
    cost_usd: float = 0.0
    note: str = ""

agent_roi/core/platform.py ADDED Viewed

@@ -0,0 +1,113 @@
+"""Cross-platform helpers for locating tool log directories.
+Agent-ROI runs on Windows, macOS, and Linux. The tricky case is WSL: a user may
+run their AI tools from Windows (logs under ``C:\\Users\\<name>``, visible from
+WSL at ``/mnt/c/Users/<name>``) while running Agent-ROI from inside the Linux
+distro. We therefore search several candidate roots and use whichever exist.
+"""
+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
def is_wsl() -> bool:
    """Report whether we are running under Windows Subsystem for Linux.

    True only on a Linux platform whose kernel release string contains
    ``microsoft``.
    """
    on_linux = sys.platform.startswith("linux")
    return on_linux and "microsoft" in _osrelease()
def platform_label() -> str:
    """Return a short human-readable OS label for diagnostics.

    Known platforms map to a friendly name; unknown ones fall back to the raw
    ``sys.platform`` value. ``" (WSL)"`` is appended when running under WSL.
    """
    friendly = {"darwin": "macOS", "win32": "Windows", "linux": "Linux"}.get(
        sys.platform, sys.platform
    )
    if is_wsl():
        return f"{friendly} (WSL)"
    return friendly
def _osrelease() -> str:
    """Return the lower-cased kernel release string, or ``""`` if unreadable."""
    release_file = Path("/proc/sys/kernel/osrelease")
    try:
        contents = release_file.read_text()
    except OSError:
        return ""
    return contents.lower()
def home_candidates() -> list[Path]:
    """List the home directories worth searching, most specific first.

    The native home always comes first. Under WSL the mounted Windows user
    profile(s) follow, because tools are commonly run from the Windows side.
    Duplicates are dropped while keeping first-seen order.
    """
    homes: list[Path] = [Path.home()]
    if is_wsl():
        homes += _windows_homes_from_wsl()
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(homes))
def _windows_homes_from_wsl() -> list[Path]:
    """Best-effort discovery of Windows user homes from inside WSL.

    Lists the directories under ``/mnt/c/Users``, skipping the well-known
    system entries. The profile whose name matches ``WIN_USER`` (or ``USER``),
    compared case-insensitively, is placed first; the rest follow in sorted
    order so results are stable between runs.

    Being best-effort, any ``OSError`` while listing or inspecting entries
    (missing mount, permission denied, ...) yields fewer results, not a crash.
    """
    users_dir = Path("/mnt/c/Users")
    try:
        # Sorting makes the fallback order deterministic; iterdir() order is
        # arbitrary. A missing or unreadable directory raises OSError here.
        entries = sorted(users_dir.iterdir())
    except OSError:
        return []
    # Prefer the matching username, but include all real profiles as a fallback.
    win_user = os.environ.get("WIN_USER") or os.environ.get("USER")
    skip = {"Default", "Default User", "Public", "All Users", "desktop.ini"}
    homes: list[Path] = []
    for entry in entries:
        if entry.name in skip:
            continue
        try:
            if not entry.is_dir():
                continue
        except OSError:
            # is_dir() re-raises some errors (e.g. permission denied).
            continue
        if win_user and entry.name.lower() == win_user.lower():
            homes.insert(0, entry)
        else:
            homes.append(entry)
    return homes
def find_tool_dirs(*relative_parts: str) -> list[Path]:
    """Return every existing ``<home>/<relative_parts>`` directory.

    Each candidate home (native plus WSL-mounted Windows) is checked in
    order; only paths that are directories are kept.
    """
    located = (home.joinpath(*relative_parts) for home in home_candidates())
    return [path for path in located if path.is_dir()]
def vscode_user_dirs() -> list[Path]:
    """Find existing VS Code ``User`` directories (including forks/insiders).

    VS Code keeps per-user state, chat sessions included, at a different path
    on each OS. Every existing match is returned so collectors can search
    them all; results are ordered by home, then path layout, then application.
    """
    # Location of the "User" dir relative to a home directory; "{app}" is
    # replaced by each application name below.
    windows_layout = ("AppData", "Roaming", "{app}", "User")
    layouts: dict[str, list[tuple[str, ...]]] = {
        "darwin": [("Library", "Application Support", "{app}", "User")],
        "win32": [windows_layout],
        "linux": [(".config", "{app}", "User")],
    }
    # Under WSL the Windows-side VS Code (under AppData) is wanted as well.
    if is_wsl():
        layouts["linux"].append(windows_layout)
    editors = ["Code", "Code - Insiders", "VSCodium", "Cursor"]
    # Platforms not listed fall back to the Linux layout.
    active = layouts.get(sys.platform, layouts["linux"])
    candidates = (
        home.joinpath(*(segment.replace("{app}", editor) for segment in layout))
        for home in home_candidates()
        for layout in active
        for editor in editors
    )
    return [candidate for candidate in candidates if candidate.is_dir()]