cost-intel 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. cost_intel-0.1.0/.dogfood-data/cost-intel.db +0 -0
  2. cost_intel-0.1.0/.dogfood.db +0 -0
  3. cost_intel-0.1.0/.env.example +18 -0
  4. cost_intel-0.1.0/.github/workflows/ci.yml +39 -0
  5. cost_intel-0.1.0/.github/workflows/publish.yml +34 -0
  6. cost_intel-0.1.0/.gitignore +38 -0
  7. cost_intel-0.1.0/AGENTS.md +153 -0
  8. cost_intel-0.1.0/AGENTS.md.project +97 -0
  9. cost_intel-0.1.0/DOGFOOD_RETROSPECTIVE.md +165 -0
  10. cost_intel-0.1.0/PHASE1_COMPLETE.md +103 -0
  11. cost_intel-0.1.0/PHASE3_COMPLETE.md +92 -0
  12. cost_intel-0.1.0/PHASE4_COMPLETE.md +103 -0
  13. cost_intel-0.1.0/PKG-INFO +133 -0
  14. cost_intel-0.1.0/README.md +100 -0
  15. cost_intel-0.1.0/SESSION_HANDOFF.md +152 -0
  16. cost_intel-0.1.0/cost-intel.db +0 -0
  17. cost_intel-0.1.0/examples/github-actions-cost-gate.yml +45 -0
  18. cost_intel-0.1.0/mission-phase1.md +66 -0
  19. cost_intel-0.1.0/mission-phase2.md +151 -0
  20. cost_intel-0.1.0/plan.md +4297 -0
  21. cost_intel-0.1.0/pyproject.toml +63 -0
  22. cost_intel-0.1.0/scripts/bootstrap.sh +32 -0
  23. cost_intel-0.1.0/scripts/dogfood.sh +34 -0
  24. cost_intel-0.1.0/src/cost_intel/__init__.py +3 -0
  25. cost_intel-0.1.0/src/cost_intel/__main__.py +6 -0
  26. cost_intel-0.1.0/src/cost_intel/adapters/__init__.py +1 -0
  27. cost_intel-0.1.0/src/cost_intel/adapters/braintrust.py +67 -0
  28. cost_intel-0.1.0/src/cost_intel/adapters/eval_harness.py +53 -0
  29. cost_intel-0.1.0/src/cost_intel/alerts.py +134 -0
  30. cost_intel-0.1.0/src/cost_intel/budget.py +74 -0
  31. cost_intel-0.1.0/src/cost_intel/cli.py +950 -0
  32. cost_intel-0.1.0/src/cost_intel/compare.py +72 -0
  33. cost_intel-0.1.0/src/cost_intel/config.py +47 -0
  34. cost_intel-0.1.0/src/cost_intel/db.py +65 -0
  35. cost_intel-0.1.0/src/cost_intel/duration.py +43 -0
  36. cost_intel-0.1.0/src/cost_intel/estimate.py +58 -0
  37. cost_intel-0.1.0/src/cost_intel/gate.py +81 -0
  38. cost_intel-0.1.0/src/cost_intel/guard.py +43 -0
  39. cost_intel-0.1.0/src/cost_intel/ingest.py +332 -0
  40. cost_intel-0.1.0/src/cost_intel/migration_runner.py +100 -0
  41. cost_intel-0.1.0/src/cost_intel/migrations/001_initial.sql +68 -0
  42. cost_intel-0.1.0/src/cost_intel/migrations/002_add_quality.sql +58 -0
  43. cost_intel-0.1.0/src/cost_intel/migrations/003_add_trace_ids.sql +10 -0
  44. cost_intel-0.1.0/src/cost_intel/optimize.py +126 -0
  45. cost_intel-0.1.0/src/cost_intel/otel.py +154 -0
  46. cost_intel-0.1.0/src/cost_intel/pricing.py +222 -0
  47. cost_intel-0.1.0/src/cost_intel/prompt_opt.py +107 -0
  48. cost_intel-0.1.0/src/cost_intel/quality.py +205 -0
  49. cost_intel-0.1.0/src/cost_intel/record.py +171 -0
  50. cost_intel-0.1.0/src/cost_intel/report.py +137 -0
  51. cost_intel-0.1.0/src/cost_intel/trends.py +62 -0
  52. cost_intel-0.1.0/src/cost_intel/utils.py +40 -0
  53. cost_intel-0.1.0/tests/conftest.py +49 -0
  54. cost_intel-0.1.0/tests/test_adapters.py +77 -0
  55. cost_intel-0.1.0/tests/test_alerts.py +164 -0
  56. cost_intel-0.1.0/tests/test_cli_cpqp.py +67 -0
  57. cost_intel-0.1.0/tests/test_compare.py +75 -0
  58. cost_intel-0.1.0/tests/test_config.py +66 -0
  59. cost_intel-0.1.0/tests/test_db.py +172 -0
  60. cost_intel-0.1.0/tests/test_duration.py +61 -0
  61. cost_intel-0.1.0/tests/test_estimate.py +59 -0
  62. cost_intel-0.1.0/tests/test_gate.py +127 -0
  63. cost_intel-0.1.0/tests/test_guard.py +42 -0
  64. cost_intel-0.1.0/tests/test_ingest.py +154 -0
  65. cost_intel-0.1.0/tests/test_ingest_csv.py +158 -0
  66. cost_intel-0.1.0/tests/test_migrations.py +148 -0
  67. cost_intel-0.1.0/tests/test_optimize.py +115 -0
  68. cost_intel-0.1.0/tests/test_otel.py +139 -0
  69. cost_intel-0.1.0/tests/test_pricing.py +234 -0
  70. cost_intel-0.1.0/tests/test_prompt_opt.py +91 -0
  71. cost_intel-0.1.0/tests/test_quality.py +145 -0
  72. cost_intel-0.1.0/tests/test_record.py +186 -0
  73. cost_intel-0.1.0/tests/test_report.py +208 -0
  74. cost_intel-0.1.0/tests/test_trends.py +59 -0
  75. cost_intel-0.1.0/tests/test_utils.py +46 -0
File without changes
@@ -0,0 +1,18 @@
1
+ # Cost Intelligence — Environment Variables
2
+ # Copy to .env and fill in values
3
+
4
+ # OpenRouter — for pricing API + cost-per-model data
5
+ OPENROUTER_API_KEY=
6
+
7
+
8
+ # Linear — for task updates
9
+ LINEAR_API_KEY=
10
+
11
+ # Factory.ai — for Droid orchestration
12
+ FACTORY_API_KEY=
13
+
14
+ # GitHub — for issue/PR creation
15
+ GITHUB_TOKEN=
16
+
17
+ # Cost Intel home directory (default: ~/.cost-intel)
18
+ # COST_INTEL_HOME=
@@ -0,0 +1,39 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+ pip install pytest ruff pytest-cov
29
+
30
+ - name: Lint with ruff
31
+ run: |
32
+ ruff check src/ tests/
33
+ ruff format --check src/ tests/
34
+
35
+ - name: Test with pytest
36
+ run: |
37
+ pytest tests/ -v --cov=src --cov-report=term-missing
38
+ env:
39
+ COST_INTEL_HOME: /tmp/.cost-intel-test
@@ -0,0 +1,34 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ environment:
12
+ name: pypi
13
+ url: https://pypi.org/p/cost-intel
14
+ permissions:
15
+ id-token: write # Required for trusted publishing
16
+
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - name: Set up Python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: "3.12"
24
+
25
+ - name: Install build dependencies
26
+ run: |
27
+ python -m pip install --upgrade pip
28
+ pip install build
29
+
30
+ - name: Build package
31
+ run: python -m build
32
+
33
+ - name: Publish to PyPI
34
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,38 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+ .eggs/
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ ENV/
15
+
16
+ # IDE
17
+ .idea/
18
+ .vscode/
19
+ *.swp
20
+ *.swo
21
+
22
+ # Testing
23
+ .pytest_cache/
24
+ .coverage
25
+ htmlcov/
26
+
27
+ # Ruff
28
+ .ruff_cache/
29
+
30
+ # Environment
31
+ .env
32
+
33
+ # OS
34
+ .DS_Store
35
+ Thumbs.db
36
+
37
+ # Cost Intel data
38
+ .cost-intel/
@@ -0,0 +1,153 @@
1
+ # AGENTS.md — Cost Intelligence
2
+
3
+ ## Session Startup (MANDATORY)
4
+ 1. Read SOUL.md (in profile root)
5
+ 2. Read this file
6
+ 3. Read SESSION_HANDOFF.md (in workspace root) — **start here for current state**
7
+ 4. Read PHASE1_COMPLETE.md (in workspace root) — Phase 1 details
8
+ 5. Read mission-phase2.md (in workspace root) — Phase 2 task specs
9
+ 6. Check Linear for active tasks (project: Cost Intelligence, ONI-43..ONI-71)
10
+
11
+ ## Project Status (June 3 2026)
12
+ **ALL 4 PHASES COMPLETE** — 26 tasks, 164 tests, pushed to GitHub.
13
+ See PHASE4_COMPLETE.md for full details.
14
+
15
+ **Project is DONE.** All phases implemented per the audit-approved plan.
16
+ - Migration 002: quality_scores table + cost_run_cpqp view with PERCENT_RANK()
17
+ - Quality score import adapters (Eval Harness, Braintrust, CSV)
18
+ - CPQP report, waste detection, model comparison, optimization
19
+
20
+ ## Project
21
+ `cost-intel` — a standalone Python CLI for AI cost tracking and quality correlation.
22
+
23
+ ## Mission
24
+ Build a CLI tool that tracks AI spending at the task level, correlates with quality scores, and produces cost-efficiency metrics. No tool currently bridges cost tracking and quality evaluation in a CLI-native package.
25
+
26
+ ## Tech Stack
27
+ - Python 3.11+
28
+ - Typer + Rich (CLI + terminal output)
29
+ - sqlite3 stdlib (WAL mode, busy_timeout=5000)
30
+ - httpx (async HTTP for pricing API)
31
+ - Pydantic v2 (data validation)
32
+ - pyyaml (config loading)
33
+ - tiktoken (token estimation)
34
+ - hatchling (build backend)
35
+ - ruff (lint + format), pytest (test)
36
+
37
+ ## Build Rules (NON-NEGOTIABLE)
38
+ - **TDD**: write failing test → run → implement → run → commit
39
+ - **Type hints everywhere** (mypy-compatible)
40
+ - **Google-style docstrings** for all public functions
41
+ - **ruff check + ruff format** before every commit
42
+ - **Commit after every task** (git add -A && git commit -m "type: description")
43
+ - **No API keys in source** — read from .env
44
+ - **`from typing import Optional`** in all modules using Optional
45
+
46
+ ## Data Directory
47
+ `~/.cost-intel/` (override: `COST_INTEL_HOME`)
48
+ DB: `~/.cost-intel/cost-intel.db`
49
+ Config: `~/.cost-intel/config.yaml`
50
+
51
+ ## File Organization
52
+ ```
53
+ workspace/cost-intel/
54
+ ├── src/cost_intel/ # Source package
55
+ │ ├── __init__.py # __version__ = "0.1.0"
56
+ │ ├── __main__.py # Entry point
57
+ │ ├── cli.py # Typer app + sub-apps (ALL Phase 1 commands)
58
+ │ ├── config.py # Config loader (reads ~/.cost-intel/config.yaml)
59
+ │ ├── db.py # Connection manager + migration runner
60
+ │ ├── migration_runner.py # Numbered SQL migration runner
61
+ │ ├── migrations/ # Numbered SQL files
62
+ │ │ └── 001_initial.sql # Phase 1 schema (COMPLETE)
63
+ │ ├── pricing.py # OpenRouter fetch + historical store
64
+ │ ├── record.py # Cost run recording (cache tokens + raw_response)
65
+ │ ├── report.py # Aggregate views + time-window filtering
66
+ │ ├── budget.py # Budget set/status subcommands
67
+ │ ├── estimate.py # tiktoken pre-call estimation
68
+ │ ├── ingest.py # JSONL ingestion with provider cache extraction
69
+ │ ├── duration.py # parse_window("7d") → 7 (CANONICAL location)
70
+ │ ├── utils.py # Shared utilities (retry, now_iso)
71
+ │ ├── quality.py # [Phase 2] Score import + CPQP + waste detection
72
+ │ ├── optimize.py # [Phase 2] Model routing + target CPQP
73
+ │ ├── compare.py # [Phase 2] Model comparison with efficiency delta
74
+ │ ├── trends.py # [Phase 2] CPQP trend analysis
75
+ │ ├── gate.py # [Phase 3] CI/CD cost gates
76
+ │ ├── alerts.py # [Phase 3] Slack webhook + SMTP email alerts
77
+ │ ├── otel.py # [Phase 4] OpenTelemetry span ingestion + trace cost
78
+ │ ├── enforce.py # [Phase 4] Budget enforcement / hard-stop
79
+ │ ├── prompt_opt.py # [Phase 4] High-cost pattern analysis
80
+ │ └── adapters/ # [Phase 2] Quality score import adapters
81
+ │ ├── eval_harness.py # [Phase 2] Eval Harness DB adapter
82
+ │ └── braintrust.py # [Phase 2] Braintrust API adapter
83
+ ├── tests/
84
+ │ ├── conftest.py # Shared fixtures (tmp_db, tmp_cost_intel_home)
85
+ │ ├── test_*.py # 10 test files, 77 tests (Phase 1 COMPLETE)
86
+ │ └── integration/ # Integration tests (empty — Phase 2+)
87
+ ├── pyproject.toml # hatchling build, dependencies, ruff config
88
+ ├── .env.example # Required env vars (no real values)
89
+ ├── .github/workflows/ci.yml # CI pipeline (ruff + pytest, Phase 1 COMPLETE)
90
+ ├── scripts/
91
+ │ ├── bootstrap.sh # One-command dev setup (executable)
92
+ │ └── dogfood.sh # Dogfood: ingest + report (executable)
93
+ ├── SESSION_HANDOFF.md # **READ FIRST** — current state + next steps
94
+ ├── PHASE1_COMPLETE.md # Phase 1 implementation details
95
+ ├── mission-phase2.md # Factory Droid Phase 2 prompt
96
+ └── plan.md # Full 4-phase plan (4297 lines, audit-approved)
97
+ ```
98
+
99
+ ## Database Conventions
100
+ - Composite PK `(model_id, effective_date)` for historical pricing
101
+ - `is_current` flag on pricing rows
102
+ - Numbered SQL migrations: `001_initial.sql`, `002_add_quality.sql`, `003_add_traces.sql`
103
+ - `schema_version` table for migration tracking
104
+ - Views use `DROP VIEW IF EXISTS` + `CREATE VIEW` in migrations (not `IF NOT EXISTS`)
105
+ - `PRAGMA busy_timeout=5000` on all connections
106
+ - `PRAGMA journal_mode=WAL`
107
+ - Use `with connect() as conn:` contextmanager pattern
108
+ - **Standalone** — zero foreign keys to any other product
109
+
110
+ ## Testing
111
+ - `pytest tests/ -v --cov=src --cov-report=term-missing`
112
+ - Coverage target: >90%
113
+ - Integration tests in `tests/integration/`
114
+ - No real API calls in CI (mock HTTP with pytest-httpx)
115
+ - Test file: `test_<module>.py` for each module
116
+ - `conftest.py` — shared fixtures (tmp_db, tmp_cost_intel_home)
117
+ - Each task: write failing test FIRST, then implement
118
+
119
+ ## CLI Conventions
120
+ - Typer sub-apps for command groups (`budget`, `pricing`)
121
+ - Rich tables for reports
122
+ - Duration parser: `parse_window("7d")` → 7 (days). **Canonical location: `src/cost_intel/duration.py`**
123
+ - Standard flags: `--last/-l`, `--days/-d`, `--window/-w`
124
+ - `--version` flag via Typer `is_eager` callback
125
+
126
+ ## Phase Gates
127
+ Each phase must pass validation before next phase starts:
128
+ - **Phase 1**: `pip install cost-intel` works, record + report work end-to-end, costs match invoices
129
+ - **Phase 2**: CPQP ordering matches intuition, division-by-zero guard works, percentile ratings displayed
130
+ - **Phase 3**: Gate exits 0/1 correctly, alerts trigger at right threshold
131
+ - **Phase 4**: OTel trace cost breakdown works, budget enforcement blocks when exceeded
132
+
133
+ ## Credentials (in ~/.hermes/profiles/cost-intel/.env)
134
+ - `OPENROUTER_API_KEY`
135
+ - `LINEAR_API_KEY`
136
+ - `GITHUB_TOKEN`
137
+ - `FACTORY_API_KEY`
138
+
139
+ ## Git
140
+ - Repo: https://github.com/onicarps/cost-intel
141
+ - Branch: main
142
+ - Branch naming: `cost-intel/ONI-XX-description`
143
+ - Commit style: `type: description` (feat:, fix:, test:, etc.)
144
+
145
+ ## Linear Project
146
+ Cost Intelligence: ONI-43 through ONI-71 (29 issues, 4 phases)
147
+ Project ID: 55e43d66-e6a2-4108-9abe-fd97600aa79a
148
+
149
+ ## Notion
150
+ https://www.notion.so/Cost-Intelligence-373e2527f3178147957ad4e1705278db
151
+
152
+ ## Implementation Plan
153
+ See `plan.md` in this directory for full 4-phase implementation plan (4297 lines, audit-approved, 0 open gaps).
@@ -0,0 +1,97 @@
1
+ # AGENTS.md — Cost Intelligence
2
+
3
+ ## Session Startup (MANDATORY)
4
+ 1. Read SOUL.md
5
+ 2. Read workspace/cost-intel/plan.md (if exists)
6
+ 3. Check Linear for active tasks (project: Cost Intelligence)
7
+ 4. Check Notion for recent design decisions
8
+
9
+ ## Build Rules
10
+ - **TDD**: write failing test → run → implement → run → commit
11
+ - **Type hints everywhere** (mypy-compatible)
12
+ - **Google-style docstrings** for all public functions
13
+ - **ruff check + ruff format** before every commit
14
+ - **Commit after every task** (git add -A && git commit -m "type: description")
15
+ - **No API keys in source** — read from .env
16
+
17
+ ## Project Context
18
+ Python CLI tool (`cost-intel`) that tracks AI spending at the task level and correlates with quality scores. Standalone — zero foreign keys to any other product.
19
+
20
+ **Tech Stack:** Python 3.11+, Typer + Rich, sqlite3 (WAL), httpx, Pydantic v2, pyyaml, tiktoken, hatchling, ruff, pytest
21
+
22
+ **Data directory:** `~/.cost-intel/` (env override: `COST_INTEL_HOME`)
23
+ **DB:** `~/.cost-intel/cost-intel.db`
24
+ **Config:** `~/.cost-intel/config.yaml`
25
+
26
+ ## File Organization
27
+ - Source: `workspace/cost-intel/src/cost_intel/`
28
+ - Tests: `workspace/cost-intel/tests/`
29
+ - Migrations: `workspace/cost-intel/src/cost_intel/migrations/`
30
+ - Docs: `workspace/cost-intel/docs/`
31
+
32
+ ## Database Conventions
33
+ - Composite PK `(model_id, effective_date)` for historical pricing
34
+ - `is_current` flag on pricing rows
35
+ - Numbered SQL migrations: `001_initial.sql`, `002_add_quality.sql`, `003_add_traces.sql`
36
+ - `schema_version` table for migration tracking
37
+ - Views use `DROP VIEW IF EXISTS` + `CREATE VIEW` in migrations (not `IF NOT EXISTS`)
38
+ - `PRAGMA busy_timeout=5000` on all connections
39
+ - Use `with connect() as conn:` contextmanager pattern
40
+
41
+ ## Testing
42
+ - `pytest tests/ -v --cov=src --cov-report=term-missing`
43
+ - Integration tests in `tests/integration/`
44
+ - No real API calls in CI (mock HTTP with pytest-httpx)
45
+ - Coverage target: >90%
46
+
47
+ ## CLI Conventions
48
+ - Typer sub-apps for command groups (`budget`, `pricing`)
49
+ - Rich tables for reports
50
+ - Duration parser: `parse_window("7d")` → 7 (days). Canonical location: `src/cost_intel/duration.py`
51
+ - Standard flags: `--last/-l`, `--days/-d`, `--window/-w`
52
+
53
+ ## Code Conventions
54
+ - `src/cost_intel/__init__.py` — `__version__ = "0.1.0"`
55
+ - `src/cost_intel/db.py` — connection manager + migration runner
56
+ - `src/cost_intel/migrations/` — numbered SQL files
57
+ - `src/cost_intel/models.py` — Pydantic models
58
+ - `src/cost_intel/pricing.py` — OpenRouter fetch + historical store
59
+ - `src/cost_intel/record.py` — cost run recording (cache tokens + raw_response)
60
+ - `src/cost_intel/report.py` — aggregate views + time-window filtering
61
+ - `src/cost_intel/budget.py` — budget set/status subcommands
62
+ - `src/cost_intel/estimate.py` — tiktoken pre-call estimation
63
+ - `src/cost_intel/ingest.py` — JSONL ingestion with provider-specific cache extraction
64
+ - `src/cost_intel/quality.py` — score import + CPQP + waste detection
65
+ - `src/cost_intel/optimize.py` — model routing + target CPQP
66
+ - `src/cost_intel/compare.py` — model comparison with efficiency delta
67
+ - `src/cost_intel/gate.py` — CI/CD cost gates
68
+ - `src/cost_intel/alerts.py` — Slack webhook + SMTP email alerts
69
+ - `src/cost_intel/otel.py` — OpenTelemetry span ingestion + trace cost
70
+ - `src/cost_intel/enforce.py` — budget enforcement / hard-stop
71
+ - `src/cost_intel/prompt_optimize.py` — high-cost pattern analysis
72
+ - `src/cost_intel/adapters/` — quality score import adapters
73
+ - `None` imports: always `from typing import Optional`
74
+
75
+ ## Testing Conventions
76
+ - Test file: `test_<module>.py` for each module
77
+ - `conftest.py` — shared fixtures (tmp_db, tmp_cost_intel_home)
78
+ - Each task: write failing test FIRST, then implement
79
+
80
+ ## Phase Gates
81
+ Each phase must pass validation before next phase starts:
82
+ - **Phase 1**: `pip install cost-intel` works, record + report work end-to-end, costs match invoices
83
+ - **Phase 2**: CPQP ordering matches intuition, division-by-zero guard works, percentile ratings displayed
84
+ - **Phase 3**: Gate exits 0/1 correctly, alerts trigger at right threshold
85
+ - **Phase 4**: OTel trace cost breakdown works, budget enforcement blocks when exceeded
86
+
87
+ ## Credentials (in ~/.hermes/profiles/cost-intel/.env)
88
+ - `OPENROUTER_API_KEY`
89
+ - `LINEAR_API_KEY`
90
+ - `GITHUB_TOKEN`
91
+ - `FACTORY_API_KEY`
92
+
93
+ ## Git
94
+ - Repo: (to be created)
95
+ - Branch: main
96
+ - Branch naming: `cost-intel/ONI-XX-description`
97
+ - Commit style: `type: description` (feat:, fix:, test:, etc.)
@@ -0,0 +1,165 @@
1
+ # Cost Intelligence — Dogfood Retrospective & Pricing Research
2
+
3
+ > **Date:** June 3, 2026
4
+ > **Purpose:** Real-world testing with Hermes session data + pricing system analysis
5
+
6
+ ---
7
+
8
+ ## What We Found
9
+
10
+ ### 1. Hermes Real Usage (Last 30 Days)
11
+
12
+ | Metric | Value |
13
+ |--------|-------|
14
+ | Sessions | 153 |
15
+ | Total tokens | 6.5M in, 221K out |
16
+ | **Total cost** | **$0.18** |
17
+ | Free sessions | 138 (90%) |
18
+ | Paid sessions | 15 (10%) |
19
+
20
+ Cost breakdown by model:
21
+ - `llama-4-scout`: 7 runs, $0.17 (94% of spend)
22
+ - `granite4.1:3b`: 2 runs, $0.003
23
+ - `qwen-3-235b`: 1 run, $0.002
24
+ - `gpt-oss-120b`: 1 run, $0.001
25
+ - All free models (owl-alpha, nemotron-free): $0.00
26
+
27
+ ### 2. Critical Bug: `* 1_000_000` in `refresh_all_pricing`
28
+
29
+ **File:** `src/cost_intel/pricing.py`, lines 185-186
30
+
31
+ ```python
32
+ # Comment says: "OpenRouter returns per-million-token pricing"
33
+ # Reality: OpenRouter returns per-token pricing (e.g., 0.000000039)
34
+ input_price = float(pricing.get("prompt", 0)) * 1_000_000
35
+ output_price = float(pricing.get("completion", 0)) * 1_000_000
36
+ ```
37
+
38
+ For `gpt-oss-120b`:
39
+ - OpenRouter returns: `prompt = "0.000000039"` (per-token)
40
+ - After `* 1_000_000`: `0.039` stored as "per-1K tokens"
41
+ - But `0.039` is actually the **per-1M** price, not per-1K
42
+ - `_compute_cost` then does `(tokens / 1000) * 0.039` = **1000x too high**
43
+
44
+ The comment is wrong. OpenRouter returns per-token pricing. The multiplication by 1M converts it to per-1M pricing. But the column is named `per_1k_tokens`. The `_compute_cost` function divides by 1000, expecting per-1K pricing. So the final result is 1000x inflated.
45
+
46
+ **Actual pricing for reference (per 1M tokens):**
47
+ | Model | Input | Output |
48
+ |-------|-------|--------|
49
+ | gpt-oss-120b | $0.039 | $0.18 |
50
+ | llama-4-scout | $0.08 | $0.30 |
51
+ | qwen-3-235b | $0.071 | $0.099 |
52
+ | granite-4.0-h-micro | $0.017 | $0.112 |
53
+ | owl-alpha | FREE | FREE |
54
+
55
+ ### 3. Model Name Mismatch Problem
56
+
57
+ Hermes state.db uses different names than OpenRouter API:
58
+ - Hermes: `granite4.1:3b` → OpenRouter: `ibm-granite/granite-4.1-8b`
59
+ - Hermes: `gpt-oss-120b` → OpenRouter: `openai/gpt-oss-120b`
60
+ - Hermes: `openrouter/owl-alpha` → OpenRouter: `openrouter/owl-alpha` (matches)
61
+
62
+ When `get_pricing()` can't find a match, it returns None and cost = $0.
63
+ This silently zeros out costs for unmatched models.
64
+
65
+ ### 4. Cost Is Dynamic — Three Moving Parts
66
+
67
+ ```
68
+ cost = tokens × price_per_token
69
+ ```
70
+
71
+ **All three variables change:**
72
+
73
+ 1. **Tokens per session**: 2K to 1.5M input tokens (3 orders of magnitude)
74
+ 2. **Model used**: 8+ models, $0 to $150/1M tokens (5 orders of magnitude)
75
+ 3. **Pricing itself**: Models change price over time
76
+
77
+ ### 5. Model Pricing Evolution (OpenRouter data, June 2026)
78
+
79
+ From the 343 models on OpenRouter:
80
+ - 25 models are free (7.3%)
81
+ - 318 models are paid (92.7%)
82
+ - Price range: $0.01 to $150 per 1M input tokens
83
+ - Median: $0.40 per 1M input tokens
84
+
85
+ **Key observation:** Every major model family has BOTH free and paid variants:
86
+ - `nvidia/nemotron-3-super-120b-a12b:free` → FREE
87
+ - `nvidia/nemotron-3-super-120b-a12b` → $0.09/1M in, $0.45/1M out
88
+ - `openai/gpt-oss-120b:free` → FREE
89
+ - `openai/gpt-oss-120b` → $0.039/1M in, $0.18/1M out
90
+
91
+ This suggests OpenRouter (and providers) use a strategy of offering free tiers that can be upgraded. The `:free` suffix models are often rate-limited or lower-priority versions.
92
+
93
+ ### 6. Historical Pricing — The Real Challenge
94
+
95
+ Model pricing changes over time. Examples from OpenRouter:
96
+ - Models start as free during beta, then become paid
97
+ - Prices decrease as models get more efficient (e.g., llama-3 vs llama-4)
98
+ - Prices increase when demand spikes
99
+ - Free tiers get discontinued
100
+
101
+ **The current schema is designed for this:**
102
+ - `model_pricing` has `(model_id, effective_date)` composite PK
103
+ - `is_current` flag for quick lookups
104
+ - Old pricing rows are preserved when prices change
105
+
106
+ **But the cost computation bakes the price:**
107
+ - `cost_run_calls.call_cost` is computed at record time using current pricing
108
+ - If pricing changes later, historical costs don't update
109
+ - This is a **design tension**: do we store the price at time of use, or recompute?
110
+
111
+ For the Hermes dogfood, this doesn't matter much because:
112
+ - Most sessions use free models (price = $0, won't change)
113
+ - The paid sessions are tiny ($0.18 total)
114
+ - Pricing for stable models changes slowly
115
+
116
+ But at scale (thousands of $, enterprise usage), pricing history matters.
117
+
118
+ ---
119
+
120
+ ## Architectural Implications
121
+
122
+ ### Cost Tracking Needs a Reconciliation Layer
123
+
124
+ The current flow:
125
+ ```
126
+ Hermes session → model name → get_pricing(model_name) → cost
127
+ ```
128
+
129
+ This breaks when:
130
+ 1. Model names don't match between systems (naming drift)
131
+ 2. Pricing is missing for a model (new model, API lag)
132
+ 3. Pricing changed retroactively (provider refunds, corrections)
133
+
134
+ **Needed:**
135
+ 1. Model name mapping table (Hermes name ↔ OpenRouter ID)
136
+ 2. Pricing fallback chain: exact match → prefix match → fuzzy match → default
137
+ 3. Pricing freshness tracking with alerts
138
+ 4. Cost recomputation capability for historical data
139
+
140
+ ### Cost Is Effectively Free at Small Scale
141
+
142
+ 90% of sessions use free models. The 10% paid sessions average $0.012 each.
143
+ At this scale, cost tracking is about:
144
+ - **Anomaly detection**: alert when a paid model is used unexpectedly
145
+ - **Trend tracking**: monitor the ratio of free vs paid over time
146
+ - **Audit trail**: know which tasks required paid models
147
+
148
+ ### The Value Proposition Shifts with Scale
149
+
150
+ - **Small scale (<$10/month)**: Cost tracking = anomaly detection
151
+ - **Medium scale ($10-100/month)**: Cost tracking = optimization opportunities
152
+ - **Large scale ($100+/month)**: Cost tracking = cost avoidance, budget enforcement
153
+
154
+ Hermes is currently at the small scale. The OWL value prop ("unified cost-quality metric") is most valuable at medium-to-large scale.
155
+
156
+ ---
157
+
158
+ ## Action Items
159
+
160
+ 1. [ ] Fix `* 1_000_000` bug in `refresh_all_pricing` — the comment is wrong, OpenRouter returns per-token pricing
161
+ 2. [ ] Add model name mapping/normalization layer
162
+ 3. [ ] Add pricing freshness tracking and alerts
163
+ 4. [ ] Add cost recomputation for historical data
164
+ 5. [ ] Add cost anomaly detection (alert when cost > Nσ for model)
165
+ 6. [ ] Determine pricing unit convention and make it consistent (per-1K vs per-1M vs per-token)
@@ -0,0 +1,103 @@
1
+ # Cost Intelligence — Phase 1 Implementation Complete
2
+
3
+ > **Date:** June 2-3, 2026
4
+ > **Status:** Phase 1 (Cost-Only Foundation) — ALL 12 tasks complete
5
+ > **Tests:** 77 passing, ruff clean, 0 lint errors
6
+ > **GitHub:** https://github.com/onicarps/cost-intel (7 commits on main)
7
+
8
+ ---
9
+
10
+ ## What Was Built
11
+
12
+ A standalone Python CLI (`cost-intel`) that tracks AI spending from the command line.
13
+ No quality data needed — purely cost-only layer (Phase 1).
14
+
15
+ ### Source Modules Created
16
+
17
+ | Module | Purpose |
18
+ |--------|---------|
19
+ | `src/cost_intel/__init__.py` | `__version__ = "0.1.0"` |
20
+ | `src/cost_intel/__main__.py` | `python -m cost_intel` entry |
21
+ | `src/cost_intel/cli.py` | Typer app with all CLI commands |
22
+ | `src/cost_intel/config.py` | YAML config loader with caching |
23
+ | `src/cost_intel/utils.py` | `now_iso()`, `retry()` with exponential backoff |
24
+ | `src/cost_intel/duration.py` | `parse_window("7d")` → 7 (CANONICAL location) |
25
+ | `src/cost_intel/db.py` | Connection manager + `connect()` contextmanager + `init_db()` |
26
+ | `src/cost_intel/migration_runner.py` | Numbered SQL migration runner |
27
+ | `src/cost_intel/migrations/001_initial.sql` | Full Phase 1 schema |
28
+ | `src/cost_intel/pricing.py` | OpenRouter fetch, upsert (same-day update vs cross-date insert), get, manual |
29
+ | `src/cost_intel/record.py` | `record_run()`, `get_run()`, `get_run_calls()` |
30
+ | `src/cost_intel/report.py` | `report_summary()`, `report_by_model()`, `report_by_label()`, `report_by_day()` |
31
+ | `src/cost_intel/budget.py` | `set_budget()`, `get_budget_status()` |
32
+ | `src/cost_intel/estimate.py` | `estimate_tokens()`, `estimate_cost()` (tiktoken) |
33
+ | `src/cost_intel/ingest.py` | `ingest_jsonl()` with provider token extraction |
34
+
35
+ ### CLI Commands Working
36
+
37
+ ```
38
+ cost-intel --version → cost-intel 0.1.0
39
+ cost-intel record --model M -i 100 -o 50 → record a cost run
40
+ cost-intel report --last 7d --by-model → cost report with tables
41
+ cost-intel trends --last 30d → daily spending trends
42
+ cost-intel export --format csv --last 7d → CSV export
43
+ cost-intel budget set --monthly 500 → set budget
44
+ cost-intel budget status → show budget status
45
+ cost-intel refresh-pricing → fetch from OpenRouter API
46
+ cost-intel pricing set/show --model M → manual pricing
47
+ cost-intel estimate "hello" --model gpt-4 → token/cost estimation
48
+ cost-intel ingest-api-responses file.jsonl → ingest JSONL
49
+ ```
50
+
51
+ ---
52
+
53
+ ## Known Issues / Bugs Found During Implementation
54
+
55
+ 1. **OpenRouter pricing math**: API returns per-million-token pricing. Must multiply by `1_000_000` to get per-1K-token pricing (not `1_000`). Fixed in `refresh_all_pricing()`.
56
+
57
+ 2. **Same-day upsert DELETE+INSERT**: Using `INSERT OR REPLACE` with composite PK `(model_id, effective_date)` deleted the old row entirely, so `is_current=0` historical rows were lost. Fixed with conditional logic: same-day → UPDATE in place, different day → mark old `is_current=0` + INSERT new.
58
+
59
+ 3. **SQLite datetime comparison**: ISO timestamps with timezone offsets (`2025-01-01T00:00:00+00:00`) don't compare correctly with SQLite's `datetime('now', '-N days')` which returns `'YYYY-MM-DD HH:MM:SS'` format. Fixed test to use `'YYYY-MM-DD HH:MM:SS'` format for manually inserted timestamps.
60
+
61
+ 4. **`dict` params vs positional `?`**: Report `_days_filter()` passed a dict to `conn.execute()` but SQL used `?` positional placeholders. Fixed to return `list` instead of `dict`.
62
+
63
+ ---
64
+
65
+ ## Test Coverage (77 tests)
66
+
67
+ | Test File | Tests | What's Covered |
68
+ |-----------|-------|----------------|
69
+ | `test_config.py` | 5 | Config loader (no file, reads YAML, caches, eval weights) |
70
+ | `test_utils.py` | 3 | now_iso, retry (success, retries, raises) |
71
+ | `test_duration.py` | 12 | parse_window (d/h/w/bare int/whitespace/case/invalid) |
72
+ | `test_db.py` | 12 | Schema creation, migrations, composite PK, WAL, busy_timeout, foreign_keys, contextmanager commit/rollback |
73
+ | `test_pricing.py` | 10 | Upsert, update preserves old row, same-day update, noop, cache pricing, historical pricing, manual pricing, refresh insert/skip |
74
+ | `test_record.py` | 11 | Basic record, cost computation, unknown model, cache tokens, raw_response truncation, run_type, label, latency, get_run, get_run_calls |
75
+ | `test_report.py` | 9 | Summary empty/with-runs/time-window, by-model, by-label, by-day, budget set/status/spending |
76
+ | `test_estimate.py` | 5 | Token estimation (basic, empty, longer=text, cost with pricing/unknown) |
77
+ | `test_ingest.py` | 9 | Token extraction (OpenRouter/Anthropic/OpenAI/unknown), JSONL ingest (basic, skip invalid, label, nonexistent file) |
78
+
79
+ ---
80
+
81
+ ## Remaining Phases
82
+
83
+ ### Phase 2: Quality Correlation (Weeks 4-6)
84
+ Tasks 2.0-2.5 — Quality scores, CPQP metric, waste detection, model comparison
85
+
86
+ Key migration needed: `002_add_quality.sql` — adds `quality_scores` table + `cost_run_cpqp` view with `PERCENT_RANK()` for A/B/C/D/F ratings.
87
+
88
+ ### Phase 3: CI/CD + Alerts (Weeks 7-9)
89
+ Tasks 3.1-3.3 — Cost gate, GitHub Actions example, Slack/email alerts
90
+
91
+ ### Phase 4: Multi-Agent + Advanced (Weeks 10-12)
92
+ Tasks 4.0-4.4 — OTel span ingestion, trace cost breakdown, prompt optimization, budget enforcement
93
+
94
+ Migration needed: `003_add_traces.sql` — adds `trace_id`, `span_id`, `parent_span_id` to `cost_runs`.
95
+
96
+ ---
97
+
98
+ ## Key Files for Next Session
99
+
100
+ - **Plan:** `research/cost-intelligence/plan.md` (4297 lines, audit-approved)
101
+ - **AGENTS.md:** `workspace/cost-intel/AGENTS.md` (full project spec + file tree)
102
+ - **Mission prompt:** `workspace/cost-intel/mission-phase1.md`
103
+ - **This doc:** `workspace/cost-intel/PHASE1_COMPLETE.md`