cost-intel 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cost_intel-0.1.0/.dogfood-data/cost-intel.db +0 -0
- cost_intel-0.1.0/.dogfood.db +0 -0
- cost_intel-0.1.0/.env.example +18 -0
- cost_intel-0.1.0/.github/workflows/ci.yml +39 -0
- cost_intel-0.1.0/.github/workflows/publish.yml +34 -0
- cost_intel-0.1.0/.gitignore +38 -0
- cost_intel-0.1.0/AGENTS.md +153 -0
- cost_intel-0.1.0/AGENTS.md.project +97 -0
- cost_intel-0.1.0/DOGFOOD_RETROSPECTIVE.md +165 -0
- cost_intel-0.1.0/PHASE1_COMPLETE.md +103 -0
- cost_intel-0.1.0/PHASE3_COMPLETE.md +92 -0
- cost_intel-0.1.0/PHASE4_COMPLETE.md +103 -0
- cost_intel-0.1.0/PKG-INFO +133 -0
- cost_intel-0.1.0/README.md +100 -0
- cost_intel-0.1.0/SESSION_HANDOFF.md +152 -0
- cost_intel-0.1.0/cost-intel.db +0 -0
- cost_intel-0.1.0/examples/github-actions-cost-gate.yml +45 -0
- cost_intel-0.1.0/mission-phase1.md +66 -0
- cost_intel-0.1.0/mission-phase2.md +151 -0
- cost_intel-0.1.0/plan.md +4297 -0
- cost_intel-0.1.0/pyproject.toml +63 -0
- cost_intel-0.1.0/scripts/bootstrap.sh +32 -0
- cost_intel-0.1.0/scripts/dogfood.sh +34 -0
- cost_intel-0.1.0/src/cost_intel/__init__.py +3 -0
- cost_intel-0.1.0/src/cost_intel/__main__.py +6 -0
- cost_intel-0.1.0/src/cost_intel/adapters/__init__.py +1 -0
- cost_intel-0.1.0/src/cost_intel/adapters/braintrust.py +67 -0
- cost_intel-0.1.0/src/cost_intel/adapters/eval_harness.py +53 -0
- cost_intel-0.1.0/src/cost_intel/alerts.py +134 -0
- cost_intel-0.1.0/src/cost_intel/budget.py +74 -0
- cost_intel-0.1.0/src/cost_intel/cli.py +950 -0
- cost_intel-0.1.0/src/cost_intel/compare.py +72 -0
- cost_intel-0.1.0/src/cost_intel/config.py +47 -0
- cost_intel-0.1.0/src/cost_intel/db.py +65 -0
- cost_intel-0.1.0/src/cost_intel/duration.py +43 -0
- cost_intel-0.1.0/src/cost_intel/estimate.py +58 -0
- cost_intel-0.1.0/src/cost_intel/gate.py +81 -0
- cost_intel-0.1.0/src/cost_intel/guard.py +43 -0
- cost_intel-0.1.0/src/cost_intel/ingest.py +332 -0
- cost_intel-0.1.0/src/cost_intel/migration_runner.py +100 -0
- cost_intel-0.1.0/src/cost_intel/migrations/001_initial.sql +68 -0
- cost_intel-0.1.0/src/cost_intel/migrations/002_add_quality.sql +58 -0
- cost_intel-0.1.0/src/cost_intel/migrations/003_add_trace_ids.sql +10 -0
- cost_intel-0.1.0/src/cost_intel/optimize.py +126 -0
- cost_intel-0.1.0/src/cost_intel/otel.py +154 -0
- cost_intel-0.1.0/src/cost_intel/pricing.py +222 -0
- cost_intel-0.1.0/src/cost_intel/prompt_opt.py +107 -0
- cost_intel-0.1.0/src/cost_intel/quality.py +205 -0
- cost_intel-0.1.0/src/cost_intel/record.py +171 -0
- cost_intel-0.1.0/src/cost_intel/report.py +137 -0
- cost_intel-0.1.0/src/cost_intel/trends.py +62 -0
- cost_intel-0.1.0/src/cost_intel/utils.py +40 -0
- cost_intel-0.1.0/tests/conftest.py +49 -0
- cost_intel-0.1.0/tests/test_adapters.py +77 -0
- cost_intel-0.1.0/tests/test_alerts.py +164 -0
- cost_intel-0.1.0/tests/test_cli_cpqp.py +67 -0
- cost_intel-0.1.0/tests/test_compare.py +75 -0
- cost_intel-0.1.0/tests/test_config.py +66 -0
- cost_intel-0.1.0/tests/test_db.py +172 -0
- cost_intel-0.1.0/tests/test_duration.py +61 -0
- cost_intel-0.1.0/tests/test_estimate.py +59 -0
- cost_intel-0.1.0/tests/test_gate.py +127 -0
- cost_intel-0.1.0/tests/test_guard.py +42 -0
- cost_intel-0.1.0/tests/test_ingest.py +154 -0
- cost_intel-0.1.0/tests/test_ingest_csv.py +158 -0
- cost_intel-0.1.0/tests/test_migrations.py +148 -0
- cost_intel-0.1.0/tests/test_optimize.py +115 -0
- cost_intel-0.1.0/tests/test_otel.py +139 -0
- cost_intel-0.1.0/tests/test_pricing.py +234 -0
- cost_intel-0.1.0/tests/test_prompt_opt.py +91 -0
- cost_intel-0.1.0/tests/test_quality.py +145 -0
- cost_intel-0.1.0/tests/test_record.py +186 -0
- cost_intel-0.1.0/tests/test_report.py +208 -0
- cost_intel-0.1.0/tests/test_trends.py +59 -0
- cost_intel-0.1.0/tests/test_utils.py +46 -0
|
Binary file
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Cost Intelligence — Environment Variables
|
|
2
|
+
# Copy to .env and fill in values
|
|
3
|
+
|
|
4
|
+
# OpenRouter — for pricing API + cost-per-model data
|
|
5
|
+
OPENROUTER_API_KEY=
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Linear — for task updates
|
|
9
|
+
LINEAR_API_KEY=
|
|
10
|
+
|
|
11
|
+
# Factory.ai — for Droid orchestration
|
|
12
|
+
FACTORY_API_KEY=
|
|
13
|
+
|
|
14
|
+
# GitHub — for issue/PR creation
|
|
15
|
+
GITHUB_TOKEN=
|
|
16
|
+
|
|
17
|
+
# Cost Intel home directory (default: ~/.cost-intel)
|
|
18
|
+
# COST_INTEL_HOME=
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
pip install pytest ruff pytest-cov
|
|
29
|
+
|
|
30
|
+
- name: Lint with ruff
|
|
31
|
+
run: |
|
|
32
|
+
ruff check src/ tests/
|
|
33
|
+
ruff format --check src/ tests/
|
|
34
|
+
|
|
35
|
+
- name: Test with pytest
|
|
36
|
+
run: |
|
|
37
|
+
pytest tests/ -v --cov=src --cov-report=term-missing
|
|
38
|
+
env:
|
|
39
|
+
COST_INTEL_HOME: /tmp/.cost-intel-test
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*'
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment:
|
|
12
|
+
name: pypi
|
|
13
|
+
url: https://pypi.org/p/cost-intel
|
|
14
|
+
permissions:
|
|
15
|
+
id-token: write # Required for trusted publishing
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: "3.12"
|
|
24
|
+
|
|
25
|
+
- name: Install build dependencies
|
|
26
|
+
run: |
|
|
27
|
+
python -m pip install --upgrade pip
|
|
28
|
+
pip install build
|
|
29
|
+
|
|
30
|
+
- name: Build package
|
|
31
|
+
run: python -m build
|
|
32
|
+
|
|
33
|
+
- name: Publish to PyPI
|
|
34
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
.eggs/
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
ENV/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
|
|
22
|
+
# Testing
|
|
23
|
+
.pytest_cache/
|
|
24
|
+
.coverage
|
|
25
|
+
htmlcov/
|
|
26
|
+
|
|
27
|
+
# Ruff
|
|
28
|
+
.ruff_cache/
|
|
29
|
+
|
|
30
|
+
# Environment
|
|
31
|
+
.env
|
|
32
|
+
|
|
33
|
+
# OS
|
|
34
|
+
.DS_Store
|
|
35
|
+
Thumbs.db
|
|
36
|
+
|
|
37
|
+
# Cost Intel data
|
|
38
|
+
.cost-intel/
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# AGENTS.md — Cost Intelligence
|
|
2
|
+
|
|
3
|
+
## Session Startup (MANDATORY)
|
|
4
|
+
1. Read SOUL.md (in profile root)
|
|
5
|
+
2. Read this file
|
|
6
|
+
3. Read SESSION_HANDOFF.md (in workspace root) — **start here for current state**
|
|
7
|
+
4. Read PHASE1_COMPLETE.md (in workspace root) — Phase 1 details
|
|
8
|
+
5. Read mission-phase2.md (in workspace root) — Phase 2 task specs
|
|
9
|
+
6. Check Linear for active tasks (project: Cost Intelligence, ONI-43..ONI-71)
|
|
10
|
+
|
|
11
|
+
## Project Status (June 3 2026)
|
|
12
|
+
**ALL 4 PHASES COMPLETE** — 26 tasks, 164 tests, pushed to GitHub.
|
|
13
|
+
See PHASE4_COMPLETE.md for full details.
|
|
14
|
+
|
|
15
|
+
**Project is DONE.** All phases implemented per the audit-approved plan.
|
|
16
|
+
- Migration 002: quality_scores table + cost_run_cpqp view with PERCENT_RANK()
|
|
17
|
+
- Quality score import adapters (Eval Harness, Braintrust, CSV)
|
|
18
|
+
- CPQP report, waste detection, model comparison, optimization
|
|
19
|
+
|
|
20
|
+
## Project
|
|
21
|
+
`cost-intel` — a standalone Python CLI for AI cost tracking and quality correlation.
|
|
22
|
+
|
|
23
|
+
## Mission
|
|
24
|
+
Build a CLI tool that tracks AI spending at the task level, correlates with quality scores, and produces cost-efficiency metrics. No tool currently bridges cost tracking and quality evaluation in a CLI-native package.
|
|
25
|
+
|
|
26
|
+
## Tech Stack
|
|
27
|
+
- Python 3.11+
|
|
28
|
+
- Typer + Rich (CLI + terminal output)
|
|
29
|
+
- sqlite3 stdlib (WAL mode, busy_timeout=5000)
|
|
30
|
+
- httpx (async HTTP for pricing API)
|
|
31
|
+
- Pydantic v2 (data validation)
|
|
32
|
+
- pyyaml (config loading)
|
|
33
|
+
- tiktoken (token estimation)
|
|
34
|
+
- hatchling (build backend)
|
|
35
|
+
- ruff (lint + format), pytest (test)
|
|
36
|
+
|
|
37
|
+
## Build Rules (NON-NEGOTIABLE)
|
|
38
|
+
- **TDD**: write failing test → run → implement → run → commit
|
|
39
|
+
- **Type hints everywhere** (mypy-compatible)
|
|
40
|
+
- **Google-style docstrings** for all public functions
|
|
41
|
+
- **ruff check + ruff format** before every commit
|
|
42
|
+
- **Commit after every task** (git add -A && git commit -m "type: description")
|
|
43
|
+
- **No API keys in source** — read from .env
|
|
44
|
+
- **`from typing import Optional`** in all modules using Optional
|
|
45
|
+
|
|
46
|
+
## Data Directory
|
|
47
|
+
`~/.cost-intel/` (override: `COST_INTEL_HOME`)
|
|
48
|
+
DB: `~/.cost-intel/cost-intel.db`
|
|
49
|
+
Config: `~/.cost-intel/config.yaml`
|
|
50
|
+
|
|
51
|
+
## File Organization
|
|
52
|
+
```
|
|
53
|
+
workspace/cost-intel/
|
|
54
|
+
├── src/cost_intel/ # Source package
|
|
55
|
+
│ ├── __init__.py # __version__ = "0.1.0"
|
|
56
|
+
│ ├── __main__.py # Entry point
|
|
57
|
+
│ ├── cli.py # Typer app + sub-apps (ALL Phase 1 commands)
|
|
58
|
+
│ ├── config.py # Config loader (reads ~/.cost-intel/config.yaml)
|
|
59
|
+
│ ├── db.py # Connection manager + migration runner
|
|
60
|
+
│ ├── migration_runner.py # Numbered SQL migration runner
|
|
61
|
+
│ ├── migrations/ # Numbered SQL files
|
|
62
|
+
│ │ └── 001_initial.sql # Phase 1 schema (COMPLETE)
|
|
63
|
+
│ ├── pricing.py # OpenRouter fetch + historical store
|
|
64
|
+
│ ├── record.py # Cost run recording (cache tokens + raw_response)
|
|
65
|
+
│ ├── report.py # Aggregate views + time-window filtering
|
|
66
|
+
│ ├── budget.py # Budget set/status subcommands
|
|
67
|
+
│ ├── estimate.py # tiktoken pre-call estimation
|
|
68
|
+
│ ├── ingest.py # JSONL ingestion with provider cache extraction
|
|
69
|
+
│ ├── duration.py # parse_window("7d") → 7 (CANONICAL location)
|
|
70
|
+
│ ├── utils.py # Shared utilities (retry, now_iso)
|
|
71
|
+
│ ├── quality.py # [Phase 2] Score import + CPQP + waste detection
|
|
72
|
+
│ ├── optimize.py # [Phase 2] Model routing + target CPQP
|
|
73
|
+
│ ├── compare.py # [Phase 2] Model comparison with efficiency delta
|
|
74
|
+
│ ├── trends.py # [Phase 2] CPQP trend analysis
|
|
75
|
+
│ ├── gate.py # [Phase 3] CI/CD cost gates
|
|
76
|
+
│ ├── alerts.py # [Phase 3] Slack webhook + SMTP email alerts
|
|
77
|
+
│ ├── otel.py # [Phase 4] OpenTelemetry span ingestion + trace cost
|
|
78
|
+
│ ├── enforce.py # [Phase 4] Budget enforcement / hard-stop
|
|
79
|
+
│ ├── prompt_opt.py # [Phase 4] High-cost pattern analysis
|
|
80
|
+
│ └── adapters/ # [Phase 2] Quality score import adapters
|
|
81
|
+
│ ├── eval_harness.py # [Phase 2] Eval Harness DB adapter
|
|
82
|
+
│ └── braintrust.py # [Phase 2] Braintrust API adapter
|
|
83
|
+
├── tests/
|
|
84
|
+
│ ├── conftest.py # Shared fixtures (tmp_db, tmp_cost_intel_home)
|
|
85
|
+
│ ├── test_*.py # 10 test files, 77 tests (Phase 1 COMPLETE)
|
|
86
|
+
│ └── integration/ # Integration tests (empty — Phase 2+)
|
|
87
|
+
├── pyproject.toml # hatchling build, dependencies, ruff config
|
|
88
|
+
├── .env.example # Required env vars (no real values)
|
|
89
|
+
├── .github/workflows/ci.yml # CI pipeline (ruff + pytest, Phase 1 COMPLETE)
|
|
90
|
+
├── scripts/
|
|
91
|
+
│ ├── bootstrap.sh # One-command dev setup (executable)
|
|
92
|
+
│ └── dogfood.sh # Dogfood: ingest + report (executable)
|
|
93
|
+
├── SESSION_HANDOFF.md # **READ FIRST** — current state + next steps
|
|
94
|
+
├── PHASE1_COMPLETE.md # Phase 1 implementation details
|
|
95
|
+
├── mission-phase2.md # Factory Droid Phase 2 prompt
|
|
96
|
+
└── plan.md # Full 4-phase plan (4297 lines, audit-approved)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Database Conventions
|
|
100
|
+
- Composite PK `(model_id, effective_date)` for historical pricing
|
|
101
|
+
- `is_current` flag on pricing rows
|
|
102
|
+
- Numbered SQL migrations: `001_initial.sql`, `002_add_quality.sql`, `003_add_trace_ids.sql`
|
|
103
|
+
- `schema_version` table for migration tracking
|
|
104
|
+
- Views use `DROP VIEW IF EXISTS` + `CREATE VIEW` in migrations (not `IF NOT EXISTS`)
|
|
105
|
+
- `PRAGMA busy_timeout=5000` on all connections
|
|
106
|
+
- `PRAGMA journal_mode=WAL`
|
|
107
|
+
- Use `with connect() as conn:` contextmanager pattern
|
|
108
|
+
- **Standalone** — zero foreign keys to any other product
|
|
109
|
+
|
|
110
|
+
## Testing
|
|
111
|
+
- `pytest tests/ -v --cov=src --cov-report=term-missing`
|
|
112
|
+
- Coverage target: >90%
|
|
113
|
+
- Integration tests in `tests/integration/`
|
|
114
|
+
- No real API calls in CI (mock HTTP with pytest-httpx)
|
|
115
|
+
- Test file: `test_<module>.py` for each module
|
|
116
|
+
- `conftest.py` — shared fixtures (tmp_db, tmp_cost_intel_home)
|
|
117
|
+
- Each task: write failing test FIRST, then implement
|
|
118
|
+
|
|
119
|
+
## CLI Conventions
|
|
120
|
+
- Typer sub-apps for command groups (`budget`, `pricing`)
|
|
121
|
+
- Rich tables for reports
|
|
122
|
+
- Duration parser: `parse_window("7d")` → 7 (days). **Canonical location: `src/cost_intel/duration.py`**
|
|
123
|
+
- Standard flags: `--last/-l`, `--days/-d`, `--window/-w`
|
|
124
|
+
- `--version` flag via Typer `is_eager` callback
|
|
125
|
+
|
|
126
|
+
## Phase Gates
|
|
127
|
+
Each phase must pass validation before next phase starts:
|
|
128
|
+
- **Phase 1**: `pip install cost-intel` works, record + report work end-to-end, costs match invoices
|
|
129
|
+
- **Phase 2**: CPQP ordering matches intuition, division-by-zero guard works, percentile ratings displayed
|
|
130
|
+
- **Phase 3**: Gate exits 0/1 correctly, alerts trigger at right threshold
|
|
131
|
+
- **Phase 4**: OTel trace cost breakdown works, budget enforcement blocks when exceeded
|
|
132
|
+
|
|
133
|
+
## Credentials (in ~/.hermes/profiles/cost-intel/.env)
|
|
134
|
+
- `OPENROUTER_API_KEY`
|
|
135
|
+
- `LINEAR_API_KEY`
|
|
136
|
+
- `GITHUB_TOKEN`
|
|
137
|
+
- `FACTORY_API_KEY`
|
|
138
|
+
|
|
139
|
+
## Git
|
|
140
|
+
- Repo: https://github.com/onicarps/cost-intel
|
|
141
|
+
- Branch: main
|
|
142
|
+
- Branch naming: `cost-intel/ONI-XX-description`
|
|
143
|
+
- Commit style: `type: description` (feat:, fix:, test:, etc.)
|
|
144
|
+
|
|
145
|
+
## Linear Project
|
|
146
|
+
Cost Intelligence: ONI-43 through ONI-71 (29 issues, 4 phases)
|
|
147
|
+
Project ID: 55e43d66-e6a2-4108-9abe-fd97600aa79a
|
|
148
|
+
|
|
149
|
+
## Notion
|
|
150
|
+
https://www.notion.so/Cost-Intelligence-373e2527f3178147957ad4e1705278db
|
|
151
|
+
|
|
152
|
+
## Implementation Plan
|
|
153
|
+
See `plan.md` in this directory for full 4-phase implementation plan (4297 lines, audit-approved, 0 open gaps).
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# AGENTS.md — Cost Intelligence
|
|
2
|
+
|
|
3
|
+
## Session Startup (MANDATORY)
|
|
4
|
+
1. Read SOUL.md
|
|
5
|
+
2. Read workspace/cost-intel/plan.md (if exists)
|
|
6
|
+
3. Check Linear for active tasks (project: Cost Intelligence)
|
|
7
|
+
4. Check Notion for recent design decisions
|
|
8
|
+
|
|
9
|
+
## Build Rules
|
|
10
|
+
- **TDD**: write failing test → run → implement → run → commit
|
|
11
|
+
- **Type hints everywhere** (mypy-compatible)
|
|
12
|
+
- **Google-style docstrings** for all public functions
|
|
13
|
+
- **ruff check + ruff format** before every commit
|
|
14
|
+
- **Commit after every task** (git add -A && git commit -m "type: description")
|
|
15
|
+
- **No API keys in source** — read from .env
|
|
16
|
+
|
|
17
|
+
## Project Context
|
|
18
|
+
Python CLI tool (`cost-intel`) that tracks AI spending at the task level and correlates with quality scores. Standalone — zero foreign keys to any other product.
|
|
19
|
+
|
|
20
|
+
**Tech Stack:** Python 3.11+, Typer + Rich, sqlite3 (WAL), httpx, Pydantic v2, pyyaml, tiktoken, hatchling, ruff, pytest
|
|
21
|
+
|
|
22
|
+
**Data directory:** `~/.cost-intel/` (env override: `COST_INTEL_HOME`)
|
|
23
|
+
**DB:** `~/.cost-intel/cost-intel.db`
|
|
24
|
+
**Config:** `~/.cost-intel/config.yaml`
|
|
25
|
+
|
|
26
|
+
## File Organization
|
|
27
|
+
- Source: `workspace/cost-intel/src/cost_intel/`
|
|
28
|
+
- Tests: `workspace/cost-intel/tests/`
|
|
29
|
+
- Migrations: `workspace/cost-intel/src/cost_intel/migrations/`
|
|
30
|
+
- Docs: `workspace/cost-intel/docs/`
|
|
31
|
+
|
|
32
|
+
## Database Conventions
|
|
33
|
+
- Composite PK `(model_id, effective_date)` for historical pricing
|
|
34
|
+
- `is_current` flag on pricing rows
|
|
35
|
+
- Numbered SQL migrations: `001_initial.sql`, `002_add_quality.sql`, `003_add_trace_ids.sql`
|
|
36
|
+
- `schema_version` table for migration tracking
|
|
37
|
+
- Views use `DROP VIEW IF EXISTS` + `CREATE VIEW` in migrations (not `IF NOT EXISTS`)
|
|
38
|
+
- `PRAGMA busy_timeout=5000` on all connections
|
|
39
|
+
- Use `with connect() as conn:` contextmanager pattern
|
|
40
|
+
|
|
41
|
+
## Testing
|
|
42
|
+
- `pytest tests/ -v --cov=src --cov-report=term-missing`
|
|
43
|
+
- Integration tests in `tests/integration/`
|
|
44
|
+
- No real API calls in CI (mock HTTP with pytest-httpx)
|
|
45
|
+
- Coverage target: >90%
|
|
46
|
+
|
|
47
|
+
## CLI Conventions
|
|
48
|
+
- Typer sub-apps for command groups (`budget`, `pricing`)
|
|
49
|
+
- Rich tables for reports
|
|
50
|
+
- Duration parser: `parse_window("7d")` → 7 (days). Canonical location: `src/cost_intel/duration.py`
|
|
51
|
+
- Standard flags: `--last/-l`, `--days/-d`, `--window/-w`
|
|
52
|
+
|
|
53
|
+
## Code Conventions
|
|
54
|
+
- `src/cost_intel/__init__.py` — `__version__ = "0.1.0"`
|
|
55
|
+
- `src/cost_intel/db.py` — connection manager + migration runner
|
|
56
|
+
- `src/cost_intel/migrations/` — numbered SQL files
|
|
57
|
+
- `src/cost_intel/models.py` — Pydantic models
|
|
58
|
+
- `src/cost_intel/pricing.py` — OpenRouter fetch + historical store
|
|
59
|
+
- `src/cost_intel/record.py` — cost run recording (cache tokens + raw_response)
|
|
60
|
+
- `src/cost_intel/report.py` — aggregate views + time-window filtering
|
|
61
|
+
- `src/cost_intel/budget.py` — budget set/status subcommands
|
|
62
|
+
- `src/cost_intel/estimate.py` — tiktoken pre-call estimation
|
|
63
|
+
- `src/cost_intel/ingest.py` — JSONL ingestion with provider-specific cache extraction
|
|
64
|
+
- `src/cost_intel/quality.py` — score import + CPQP + waste detection
|
|
65
|
+
- `src/cost_intel/optimize.py` — model routing + target CPQP
|
|
66
|
+
- `src/cost_intel/compare.py` — model comparison with efficiency delta
|
|
67
|
+
- `src/cost_intel/gate.py` — CI/CD cost gates
|
|
68
|
+
- `src/cost_intel/alerts.py` — Slack webhook + SMTP email alerts
|
|
69
|
+
- `src/cost_intel/otel.py` — OpenTelemetry span ingestion + trace cost
|
|
70
|
+
- `src/cost_intel/enforce.py` — budget enforcement / hard-stop
|
|
71
|
+
- `src/cost_intel/prompt_opt.py` — high-cost pattern analysis
|
|
72
|
+
- `src/cost_intel/adapters/` — quality score import adapters
|
|
73
|
+
- `Optional` usage: always add `from typing import Optional` in modules that use `Optional`
|
|
74
|
+
|
|
75
|
+
## Testing Conventions
|
|
76
|
+
- Test file: `test_<module>.py` for each module
|
|
77
|
+
- `conftest.py` — shared fixtures (tmp_db, tmp_cost_intel_home)
|
|
78
|
+
- Each task: write failing test FIRST, then implement
|
|
79
|
+
|
|
80
|
+
## Phase Gates
|
|
81
|
+
Each phase must pass validation before next phase starts:
|
|
82
|
+
- **Phase 1**: `pip install cost-intel` works, record + report work end-to-end, costs match invoices
|
|
83
|
+
- **Phase 2**: CPQP ordering matches intuition, division-by-zero guard works, percentile ratings displayed
|
|
84
|
+
- **Phase 3**: Gate exits 0/1 correctly, alerts trigger at right threshold
|
|
85
|
+
- **Phase 4**: OTel trace cost breakdown works, budget enforcement blocks when exceeded
|
|
86
|
+
|
|
87
|
+
## Credentials (in ~/.hermes/profiles/cost-intel/.env)
|
|
88
|
+
- `OPENROUTER_API_KEY`
|
|
89
|
+
- `LINEAR_API_KEY`
|
|
90
|
+
- `GITHUB_TOKEN`
|
|
91
|
+
- `FACTORY_API_KEY`
|
|
92
|
+
|
|
93
|
+
## Git
|
|
94
|
+
- Repo: (to be created)
|
|
95
|
+
- Branch: main
|
|
96
|
+
- Branch naming: `cost-intel/ONI-XX-description`
|
|
97
|
+
- Commit style: `type: description` (feat:, fix:, test:, etc.)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# Cost Intelligence — Dogfood Retrospective & Pricing Research
|
|
2
|
+
|
|
3
|
+
> **Date:** June 3, 2026
|
|
4
|
+
> **Purpose:** Real-world testing with Hermes session data + pricing system analysis
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## What We Found
|
|
9
|
+
|
|
10
|
+
### 1. Hermes Real Usage (Last 30 Days)
|
|
11
|
+
|
|
12
|
+
| Metric | Value |
|
|
13
|
+
|--------|-------|
|
|
14
|
+
| Sessions | 153 |
|
|
15
|
+
| Total tokens | 6.5M in, 221K out |
|
|
16
|
+
| **Total cost** | **$0.18** |
|
|
17
|
+
| Free sessions | 138 (90%) |
|
|
18
|
+
| Paid sessions | 15 (10%) |
|
|
19
|
+
|
|
20
|
+
Cost breakdown by model:
|
|
21
|
+
- `llama-4-scout`: 7 runs, $0.17 (94% of spend)
|
|
22
|
+
- `granite4.1:3b`: 2 runs, $0.003
|
|
23
|
+
- `qwen-3-235b`: 1 run, $0.002
|
|
24
|
+
- `gpt-oss-120b`: 1 run, $0.001
|
|
25
|
+
- All free models (owl-alpha, nemotron-free): $0.00
|
|
26
|
+
|
|
27
|
+
### 2. Critical Bug: `* 1_000_000` in `refresh_all_pricing`
|
|
28
|
+
|
|
29
|
+
**File:** `src/cost_intel/pricing.py`, lines 185-186
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
# Comment says: "OpenRouter returns per-million-token pricing"
|
|
33
|
+
# Reality: OpenRouter returns per-token pricing (e.g., 0.000000039)
|
|
34
|
+
input_price = float(pricing.get("prompt", 0)) * 1_000_000
|
|
35
|
+
output_price = float(pricing.get("completion", 0)) * 1_000_000
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For `gpt-oss-120b`:
|
|
39
|
+
- OpenRouter returns: `prompt = "0.000000039"` (per-token)
|
|
40
|
+
- After `* 1_000_000`: `0.039` stored as "per-1K tokens"
|
|
41
|
+
- But `0.039` is actually the **per-1M** price, not per-1K
|
|
42
|
+
- `_compute_cost` then does `(tokens / 1000) * 0.039` = **1000x too high**
|
|
43
|
+
|
|
44
|
+
The comment is wrong. OpenRouter returns per-token pricing. The multiplication by 1M converts it to per-1M pricing. But the column is named `per_1k_tokens`. The `_compute_cost` function divides by 1000, expecting per-1K pricing. So the final result is 1000x inflated.
|
|
45
|
+
|
|
46
|
+
**Actual pricing for reference (per 1M tokens):**
|
|
47
|
+
| Model | Input | Output |
|
|
48
|
+
|-------|-------|--------|
|
|
49
|
+
| gpt-oss-120b | $0.039 | $0.18 |
|
|
50
|
+
| llama-4-scout | $0.08 | $0.30 |
|
|
51
|
+
| qwen-3-235b | $0.071 | $0.099 |
|
|
52
|
+
| granite-4.0-h-micro | $0.017 | $0.112 |
|
|
53
|
+
| owl-alpha | FREE | FREE |
|
|
54
|
+
|
|
55
|
+
### 3. Model Name Mismatch Problem
|
|
56
|
+
|
|
57
|
+
Hermes state.db uses different names than OpenRouter API:
|
|
58
|
+
- Hermes: `granite4.1:3b` → OpenRouter: `ibm-granite/granite-4.1-8b`
|
|
59
|
+
- Hermes: `gpt-oss-120b` → OpenRouter: `openai/gpt-oss-120b`
|
|
60
|
+
- Hermes: `openrouter/owl-alpha` → OpenRouter: `openrouter/owl-alpha` (matches)
|
|
61
|
+
|
|
62
|
+
When `get_pricing()` can't find a match, it returns None and cost = $0.
|
|
63
|
+
This silently zeros out costs for unmatched models.
|
|
64
|
+
|
|
65
|
+
### 4. Cost Is Dynamic — Three Moving Parts
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
cost = tokens × price_per_token
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**Three things behind this formula change — the token count, which model's price applies, and that price over time:**
|
|
72
|
+
|
|
73
|
+
1. **Tokens per session**: 2K to 1.5M input tokens (3 orders of magnitude)
|
|
74
|
+
2. **Model used**: 8+ models, $0 to $150/1M tokens (5 orders of magnitude)
|
|
75
|
+
3. **Pricing itself**: Models change price over time
|
|
76
|
+
|
|
77
|
+
### 5. Model Pricing Evolution (OpenRouter data, June 2026)
|
|
78
|
+
|
|
79
|
+
From the 343 models on OpenRouter:
|
|
80
|
+
- 25 models are free (7.3%)
|
|
81
|
+
- 318 models are paid (92.7%)
|
|
82
|
+
- Price range: $0.01 to $150 per 1M input tokens
|
|
83
|
+
- Median: $0.40 per 1M input tokens
|
|
84
|
+
|
|
85
|
+
**Key observation:** Every major model family has BOTH free and paid variants:
|
|
86
|
+
- `nvidia/nemotron-3-super-120b-a12b:free` → FREE
|
|
87
|
+
- `nvidia/nemotron-3-super-120b-a12b` → $0.09/1M in, $0.45/1M out
|
|
88
|
+
- `openai/gpt-oss-120b:free` → FREE
|
|
89
|
+
- `openai/gpt-oss-120b` → $0.039/1M in, $0.18/1M out
|
|
90
|
+
|
|
91
|
+
This suggests OpenRouter (and providers) use a strategy of offering free tiers that can be upgraded. The `:free` suffix models are often rate-limited or lower-priority versions.
|
|
92
|
+
|
|
93
|
+
### 6. Historical Pricing — The Real Challenge
|
|
94
|
+
|
|
95
|
+
Model pricing changes over time. Examples from OpenRouter:
|
|
96
|
+
- Models start as free during beta, then become paid
|
|
97
|
+
- Prices decrease as models get more efficient (e.g., llama-3 vs llama-4)
|
|
98
|
+
- Prices increase when demand spikes
|
|
99
|
+
- Free tiers get discontinued
|
|
100
|
+
|
|
101
|
+
**The current schema is designed for this:**
|
|
102
|
+
- `model_pricing` has `(model_id, effective_date)` composite PK
|
|
103
|
+
- `is_current` flag for quick lookups
|
|
104
|
+
- Old pricing rows are preserved when prices change
|
|
105
|
+
|
|
106
|
+
**But the cost computation bakes in the price at record time:**
|
|
107
|
+
- `cost_run_calls.call_cost` is computed at record time using current pricing
|
|
108
|
+
- If pricing changes later, historical costs don't update
|
|
109
|
+
- This is a **design tension**: do we store the price at time of use, or recompute?
|
|
110
|
+
|
|
111
|
+
For the Hermes dogfood, this doesn't matter much because:
|
|
112
|
+
- Most sessions use free models (price = $0, won't change)
|
|
113
|
+
- The paid sessions are tiny ($0.18 total)
|
|
114
|
+
- Pricing for stable models changes slowly
|
|
115
|
+
|
|
116
|
+
But at scale (thousands of dollars, enterprise usage), pricing history matters.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Architectural Implications
|
|
121
|
+
|
|
122
|
+
### Cost Tracking Needs a Reconciliation Layer
|
|
123
|
+
|
|
124
|
+
The current flow:
|
|
125
|
+
```
|
|
126
|
+
Hermes session → model name → get_pricing(model_name) → cost
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
This breaks when:
|
|
130
|
+
1. Model names don't match between systems (naming drift)
|
|
131
|
+
2. Pricing is missing for a model (new model, API lag)
|
|
132
|
+
3. Pricing changed retroactively (provider refunds, corrections)
|
|
133
|
+
|
|
134
|
+
**Needed:**
|
|
135
|
+
1. Model name mapping table (Hermes name ↔ OpenRouter ID)
|
|
136
|
+
2. Pricing fallback chain: exact match → prefix match → fuzzy match → default
|
|
137
|
+
3. Pricing freshness tracking with alerts
|
|
138
|
+
4. Cost recomputation capability for historical data
|
|
139
|
+
|
|
140
|
+
### Cost Is Effectively Free at Small Scale
|
|
141
|
+
|
|
142
|
+
90% of sessions use free models. The 10% paid sessions average $0.012 each.
|
|
143
|
+
At this scale, cost tracking is about:
|
|
144
|
+
- **Anomaly detection**: alert when a paid model is used unexpectedly
|
|
145
|
+
- **Trend tracking**: monitor the ratio of free vs paid over time
|
|
146
|
+
- **Audit trail**: know which tasks required paid models
|
|
147
|
+
|
|
148
|
+
### The Value Proposition Shifts with Scale
|
|
149
|
+
|
|
150
|
+
- **Small scale (<$10/month)**: Cost tracking = anomaly detection
|
|
151
|
+
- **Medium scale ($10-100/month)**: Cost tracking = optimization opportunities
|
|
152
|
+
- **Large scale ($100+/month)**: Cost tracking = cost avoidance, budget enforcement
|
|
153
|
+
|
|
154
|
+
Hermes is currently at the small scale. The OWL value prop ("unified cost-quality metric") is most valuable at medium-to-large scale.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Action Items
|
|
159
|
+
|
|
160
|
+
1. [ ] Fix `* 1_000_000` bug in `refresh_all_pricing` — the comment is wrong, OpenRouter returns per-token pricing
|
|
161
|
+
2. [ ] Add model name mapping/normalization layer
|
|
162
|
+
3. [ ] Add pricing freshness tracking and alerts
|
|
163
|
+
4. [ ] Add cost recomputation for historical data
|
|
164
|
+
5. [ ] Add cost anomaly detection (alert when cost > Nσ for model)
|
|
165
|
+
6. [ ] Determine pricing unit convention and make it consistent (per-1K vs per-1M vs per-token)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Cost Intelligence — Phase 1 Implementation Complete
|
|
2
|
+
|
|
3
|
+
> **Date:** June 2-3, 2026
|
|
4
|
+
> **Status:** Phase 1 (Cost-Only Foundation) — ALL 12 tasks complete
|
|
5
|
+
> **Tests:** 77 passing, ruff clean, 0 lint errors
|
|
6
|
+
> **GitHub:** https://github.com/onicarps/cost-intel (7 commits on main)
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## What Was Built
|
|
11
|
+
|
|
12
|
+
A standalone Python CLI (`cost-intel`) that tracks AI spending from the command line.
|
|
13
|
+
No quality data needed — purely cost-only layer (Phase 1).
|
|
14
|
+
|
|
15
|
+
### Source Modules Created
|
|
16
|
+
|
|
17
|
+
| Module | Purpose |
|
|
18
|
+
|--------|---------|
|
|
19
|
+
| `src/cost_intel/__init__.py` | `__version__ = "0.1.0"` |
|
|
20
|
+
| `src/cost_intel/__main__.py` | `python -m cost_intel` entry |
|
|
21
|
+
| `src/cost_intel/cli.py` | Typer app with all CLI commands |
|
|
22
|
+
| `src/cost_intel/config.py` | YAML config loader with caching |
|
|
23
|
+
| `src/cost_intel/utils.py` | `now_iso()`, `retry()` with exponential backoff |
|
|
24
|
+
| `src/cost_intel/duration.py` | `parse_window("7d")` → 7 (CANONICAL location) |
|
|
25
|
+
| `src/cost_intel/db.py` | Connection manager + `connect()` contextmanager + `init_db()` |
|
|
26
|
+
| `src/cost_intel/migration_runner.py` | Numbered SQL migration runner |
|
|
27
|
+
| `src/cost_intel/migrations/001_initial.sql` | Full Phase 1 schema |
|
|
28
|
+
| `src/cost_intel/pricing.py` | OpenRouter fetch, upsert (same-day update vs cross-date insert), get, manual |
|
|
29
|
+
| `src/cost_intel/record.py` | `record_run()`, `get_run()`, `get_run_calls()` |
|
|
30
|
+
| `src/cost_intel/report.py` | `report_summary()`, `report_by_model()`, `report_by_label()`, `report_by_day()` |
|
|
31
|
+
| `src/cost_intel/budget.py` | `set_budget()`, `get_budget_status()` |
|
|
32
|
+
| `src/cost_intel/estimate.py` | `estimate_tokens()`, `estimate_cost()` (tiktoken) |
|
|
33
|
+
| `src/cost_intel/ingest.py` | `ingest_jsonl()` with provider token extraction |
|
|
34
|
+
|
|
35
|
+
### CLI Commands Working
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
cost-intel --version → cost-intel 0.1.0
|
|
39
|
+
cost-intel record --model M -i 100 -o 50 → record a cost run
|
|
40
|
+
cost-intel report --last 7d --by-model → cost report with tables
|
|
41
|
+
cost-intel trends --last 30d → daily spending trends
|
|
42
|
+
cost-intel export --format csv --last 7d → CSV export
|
|
43
|
+
cost-intel budget set --monthly 500 → set budget
|
|
44
|
+
cost-intel budget status → show budget status
|
|
45
|
+
cost-intel refresh-pricing → fetch from OpenRouter API
|
|
46
|
+
cost-intel pricing set/show --model M → manual pricing
|
|
47
|
+
cost-intel estimate "hello" --model gpt-4 → token/cost estimation
|
|
48
|
+
cost-intel ingest-api-responses file.jsonl → ingest JSONL
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Known Issues / Bugs Found During Implementation
|
|
54
|
+
|
|
55
|
+
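1. **OpenRouter pricing math**: The OpenRouter API returns per-token pricing (USD per token), not per-million-token pricing. Multiplying by `1_000_000` therefore yields per-1M-token pricing; per-1K-token pricing would require multiplying by `1_000`. `refresh_all_pricing()` was changed to multiply by `1_000_000` (not `1_000`) — confirm that this matches the unit the pricing table is meant to store.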
|
|
56
|
+
|
|
57
|
+
2. **Same-day upsert DELETE+INSERT**: Using `INSERT OR REPLACE` with composite PK `(model_id, effective_date)` deleted the old row entirely, so `is_current=0` historical rows were lost. Fixed with conditional logic: same-day → UPDATE in place, different day → mark old `is_current=0` + INSERT new.
|
|
58
|
+
|
|
59
|
+
3. **SQLite datetime comparison**: ISO timestamps with timezone offsets (`2025-01-01T00:00:00+00:00`) don't compare correctly with SQLite's `datetime('now', '-N days')` which returns `'YYYY-MM-DD HH:MM:SS'` format. Fixed test to use `'YYYY-MM-DD HH:MM:SS'` format for manually inserted timestamps.
|
|
60
|
+
|
|
61
|
+
4. **`dict` params vs positional `?`**: Report `_days_filter()` passed a dict to `conn.execute()` but SQL used `?` positional placeholders. Fixed to return `list` instead of `dict`.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Test Coverage (77 tests)
|
|
66
|
+
|
|
67
|
+
| Test File | Tests | What's Covered |
|
|
68
|
+
|-----------|-------|----------------|
|
|
69
|
+
| `test_config.py` | 5 | Config loader (no file, reads YAML, caches, eval weights) |
|
|
70
|
+
| `test_utils.py` | 4 | now_iso, retry (success, retries, raises) |
|
|
71
|
+
| `test_duration.py` | 12 | parse_window (d/h/w/bare int/whitespace/case/invalid) |
|
|
72
|
+
| `test_db.py` | 12 | Schema creation, migrations, composite PK, WAL, busy_timeout, foreign_keys, contextmanager commit/rollback |
|
|
73
|
+
| `test_pricing.py` | 10 | Upsert, update preserves old row, same-day update, noop, cache pricing, historical pricing, manual pricing, refresh insert/skip |
|
|
74
|
+
| `test_record.py` | 11 | Basic record, cost computation, unknown model, cache tokens, raw_response truncation, run_type, label, latency, get_run, get_run_calls |
|
|
75
|
+
| `test_report.py` | 9 | Summary empty/with-runs/time-window, by-model, by-label, by-day, budget set/status/spending |
|
|
76
|
+
| `test_estimate.py` | 5 | Token estimation (basic, empty, longer text), cost estimation (with pricing, unknown model) |
|
|
77
|
+
| `test_ingest.py` | 9 | Token extraction (OpenRouter/Anthropic/OpenAI/unknown), JSONL ingest (basic, skip invalid, label, nonexistent file) |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Remaining Phases
|
|
82
|
+
|
|
83
|
+
### Phase 2: Quality Correlation (Weeks 4-6)
|
|
84
|
+
Tasks 2.0-2.5 — Quality scores, CPQP metric, waste detection, model comparison
|
|
85
|
+
|
|
86
|
+
Key migration needed: `002_add_quality.sql` — adds `quality_scores` table + `cost_run_cpqp` view with `PERCENT_RANK()` for A/B/C/D/F ratings.
|
|
87
|
+
|
|
88
|
+
### Phase 3: CI/CD + Alerts (Weeks 7-9)
|
|
89
|
+
Tasks 3.1-3.3 — Cost gate, GitHub Actions example, Slack/email alerts
|
|
90
|
+
|
|
91
|
+
### Phase 4: Multi-Agent + Advanced (Weeks 10-12)
|
|
92
|
+
Tasks 4.0-4.4 — OTel span ingestion, trace cost breakdown, prompt optimization, budget enforcement
|
|
93
|
+
|
|
94
|
+
Migration needed: `003_add_traces.sql` — adds `trace_id`, `span_id`, `parent_span_id` to `cost_runs`.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Key Files for Next Session
|
|
99
|
+
|
|
100
|
+
- **Plan:** `research/cost-intelligence/plan.md` (4297 lines, audit-approved)
|
|
101
|
+
- **AGENTS.md:** `workspace/cost-intel/AGENTS.md` (full project spec + file tree)
|
|
102
|
+
- **Mission prompt:** `workspace/cost-intel/mission-phase1.md`
|
|
103
|
+
- **This doc:** `workspace/cost-intel/PHASE1_COMPLETE.md`
|