copeca 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. copeca-0.1.0/LICENSE +21 -0
  2. copeca-0.1.0/PKG-INFO +271 -0
  3. copeca-0.1.0/README.md +239 -0
  4. copeca-0.1.0/pyproject.toml +78 -0
  5. copeca-0.1.0/setup.cfg +4 -0
  6. copeca-0.1.0/src/copeca/__init__.py +3 -0
  7. copeca-0.1.0/src/copeca/__main__.py +10 -0
  8. copeca-0.1.0/src/copeca/agnosticism.py +75 -0
  9. copeca-0.1.0/src/copeca/analysis/__init__.py +0 -0
  10. copeca-0.1.0/src/copeca/analysis/compare.py +126 -0
  11. copeca-0.1.0/src/copeca/analysis/report.py +457 -0
  12. copeca-0.1.0/src/copeca/analysis/stats.py +189 -0
  13. copeca-0.1.0/src/copeca/cli.py +795 -0
  14. copeca-0.1.0/src/copeca/config/__init__.py +0 -0
  15. copeca-0.1.0/src/copeca/config/loader.py +273 -0
  16. copeca-0.1.0/src/copeca/config/models.py +296 -0
  17. copeca-0.1.0/src/copeca/config/resources.py +16 -0
  18. copeca-0.1.0/src/copeca/contamination.py +75 -0
  19. copeca-0.1.0/src/copeca/data/contamination_blocklist.txt +24 -0
  20. copeca-0.1.0/src/copeca/data/defaults/modes/baseline.yaml +9 -0
  21. copeca-0.1.0/src/copeca/data/defaults/modes/hook.yaml +14 -0
  22. copeca-0.1.0/src/copeca/data/defaults/modes/indexed.yaml +12 -0
  23. copeca-0.1.0/src/copeca/data/defaults/modes/proxy.yaml +14 -0
  24. copeca-0.1.0/src/copeca/data/defaults/modes/wrapper.yaml +16 -0
  25. copeca-0.1.0/src/copeca/data/defaults/runners/claude.yaml +43 -0
  26. copeca-0.1.0/src/copeca/data/defaults/runners/codex.yaml +48 -0
  27. copeca-0.1.0/src/copeca/data/repos.yaml +44 -0
  28. copeca-0.1.0/src/copeca/data/schemas/scenario.schema.json +97 -0
  29. copeca-0.1.0/src/copeca/data/schemas/task.schema.json +155 -0
  30. copeca-0.1.0/src/copeca/data/tasks/express/express_app_init.yaml +21 -0
  31. copeca-0.1.0/src/copeca/data/tasks/express/express_app_render.yaml +20 -0
  32. copeca-0.1.0/src/copeca/data/tasks/express/express_diff_multi_mutation.yaml +47 -0
  33. copeca-0.1.0/src/copeca/data/tasks/express/express_edit_cookie_prefix.yaml +36 -0
  34. copeca-0.1.0/src/copeca/data/tasks/express/express_edit_json_type.yaml +35 -0
  35. copeca-0.1.0/src/copeca/data/tasks/express/express_edit_send_type.yaml +36 -0
  36. copeca-0.1.0/src/copeca/data/tasks/express/express_json_send.yaml +21 -0
  37. copeca-0.1.0/src/copeca/data/tasks/express/express_render_chain.yaml +21 -0
  38. copeca-0.1.0/src/copeca/data/tasks/express/express_res_send.yaml +18 -0
  39. copeca-0.1.0/src/copeca/data/tasks/express/swebenchlive_fix_middleware.yaml +34 -0
  40. copeca-0.1.0/src/copeca/data/tasks/express/t004_express_routing.yaml +28 -0
  41. copeca-0.1.0/src/copeca/data/tasks/express/t008_express_fix_route.yaml +31 -0
  42. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_dependency_resolution.yaml +21 -0
  43. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_depends_callers.yaml +19 -0
  44. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_depends_function.yaml +17 -0
  45. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_depends_internals.yaml +19 -0
  46. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_depends_processing.yaml +21 -0
  47. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_diff_which_commit.yaml +45 -0
  48. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_edit_dep_cache.yaml +33 -0
  49. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_edit_response_filter.yaml +33 -0
  50. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_edit_scope_cache.yaml +34 -0
  51. copeca-0.1.0/src/copeca/data/tasks/fastapi/fastapi_request_validation.yaml +20 -0
  52. copeca-0.1.0/src/copeca/data/tasks/fastapi/lca_bug_localization.yaml +26 -0
  53. copeca-0.1.0/src/copeca/data/tasks/fastapi/t002_fastapi_routing.yaml +28 -0
  54. copeca-0.1.0/src/copeca/data/tasks/fastapi/t006_fastapi_fix_status.yaml +29 -0
  55. copeca-0.1.0/src/copeca/data/tasks/fastapi/t010_fastapi_fix_validation.yaml +32 -0
  56. copeca-0.1.0/src/copeca/data/tasks/gin/crosscode_discovery.yaml +31 -0
  57. copeca-0.1.0/src/copeca/data/tasks/gin/gin_client_ip.yaml +20 -0
  58. copeca-0.1.0/src/copeca/data/tasks/gin/gin_context_next_peers.yaml +20 -0
  59. copeca-0.1.0/src/copeca/data/tasks/gin/gin_diff_comprehension.yaml +29 -0
  60. copeca-0.1.0/src/copeca/data/tasks/gin/gin_edit_abort_check.yaml +30 -0
  61. copeca-0.1.0/src/copeca/data/tasks/gin/gin_edit_context_reset.yaml +31 -0
  62. copeca-0.1.0/src/copeca/data/tasks/gin/gin_edit_middleware_skip.yaml +31 -0
  63. copeca-0.1.0/src/copeca/data/tasks/gin/gin_new_constructor.yaml +21 -0
  64. copeca-0.1.0/src/copeca/data/tasks/gin/gin_radix_tree.yaml +20 -0
  65. copeca-0.1.0/src/copeca/data/tasks/gin/gin_servehttp_flow.yaml +20 -0
  66. copeca-0.1.0/src/copeca/data/tasks/gin/t003_gin_middleware.yaml +28 -0
  67. copeca-0.1.0/src/copeca/data/tasks/gin/t007_gin_fix_binding.yaml +33 -0
  68. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_diff_misdirected_error.yaml +32 -0
  69. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_edit_line_count.yaml +31 -0
  70. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_edit_line_locate.yaml +32 -0
  71. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_edit_preceding.yaml +33 -0
  72. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_flag_definition.yaml +19 -0
  73. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_lineiter_definition.yaml +17 -0
  74. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_lineiter_usage.yaml +19 -0
  75. copeca-0.1.0/src/copeca/data/tasks/ripgrep/rg_walker_parallel.yaml +18 -0
  76. copeca-0.1.0/src/copeca/data/tasks/ripgrep/scbench_find_function.yaml +25 -0
  77. copeca-0.1.0/src/copeca/data/tasks/ripgrep/t001_find_matcher_trait.yaml +25 -0
  78. copeca-0.1.0/src/copeca/data/tasks/ripgrep/t005_ripgrep_search_flow.yaml +28 -0
  79. copeca-0.1.0/src/copeca/data/tasks/ripgrep/t009_ripgrep_fix_pattern.yaml +33 -0
  80. copeca-0.1.0/src/copeca/data/tasks/ripgrep/terminal_cli_task.yaml +34 -0
  81. copeca-0.1.0/src/copeca/data/tasks/ripgrep/trait_implementors.yaml +24 -0
  82. copeca-0.1.0/src/copeca/orchestration/__init__.py +0 -0
  83. copeca-0.1.0/src/copeca/orchestration/check.py +124 -0
  84. copeca-0.1.0/src/copeca/orchestration/run.py +537 -0
  85. copeca-0.1.0/src/copeca/orchestration/state.py +128 -0
  86. copeca-0.1.0/src/copeca/orchestration/validation.py +213 -0
  87. copeca-0.1.0/src/copeca/repos/__init__.py +0 -0
  88. copeca-0.1.0/src/copeca/repos/manager.py +338 -0
  89. copeca-0.1.0/src/copeca/results/__init__.py +0 -0
  90. copeca-0.1.0/src/copeca/results/artifact.py +156 -0
  91. copeca-0.1.0/src/copeca/results/signing.py +137 -0
  92. copeca-0.1.0/src/copeca/results/verification.py +412 -0
  93. copeca-0.1.0/src/copeca/results/writer.py +20 -0
  94. copeca-0.1.0/src/copeca/runners/__init__.py +0 -0
  95. copeca-0.1.0/src/copeca/runners/base.py +172 -0
  96. copeca-0.1.0/src/copeca/runners/cost.py +31 -0
  97. copeca-0.1.0/src/copeca/runners/parsers/__init__.py +47 -0
  98. copeca-0.1.0/src/copeca/runners/parsers/base.py +79 -0
  99. copeca-0.1.0/src/copeca/runners/parsers/codex_json.py +155 -0
  100. copeca-0.1.0/src/copeca/runners/parsers/stream_json.py +116 -0
  101. copeca-0.1.0/src/copeca/runners/subprocess.py +152 -0
  102. copeca-0.1.0/src/copeca/tasks/__init__.py +0 -0
  103. copeca-0.1.0/src/copeca/tasks/mutations.py +93 -0
  104. copeca-0.1.0/src/copeca/tasks/validator.py +114 -0
  105. copeca-0.1.0/src/copeca.egg-info/PKG-INFO +271 -0
  106. copeca-0.1.0/src/copeca.egg-info/SOURCES.txt +110 -0
  107. copeca-0.1.0/src/copeca.egg-info/dependency_links.txt +1 -0
  108. copeca-0.1.0/src/copeca.egg-info/entry_points.txt +2 -0
  109. copeca-0.1.0/src/copeca.egg-info/requires.txt +11 -0
  110. copeca-0.1.0/src/copeca.egg-info/top_level.txt +1 -0
  111. copeca-0.1.0/tests/test_agnosticism.py +72 -0
  112. copeca-0.1.0/tests/test_orchestrator.py +558 -0
copeca-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jan Hallvard Larsen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
copeca-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.4
2
+ Name: copeca
3
+ Version: 0.1.0
4
+ Summary: Cost per correct answer — a neutral, reproducible, verifiable benchmark for CLI-based coding agents
5
+ Author-email: Jan Hallvard Larsen <jan@plotplot.ai>
6
+ License: MIT
7
+ Project-URL: Homepage, https://jahala.github.io/copeca/
8
+ Project-URL: Repository, https://github.com/jahala/copeca
9
+ Project-URL: Issues, https://github.com/jahala/copeca/issues
10
+ Project-URL: Funding, https://buymeacoffee.com/jahala
11
+ Keywords: benchmark,ai-coding,agent,mcp,cost-efficiency,evaluation
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Testing
18
+ Requires-Python: >=3.11
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: typer>=0.9
22
+ Requires-Dist: pyyaml>=6.0
23
+ Requires-Dist: jsonschema>=4.20
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: cryptography>=42.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0; extra == "dev"
28
+ Requires-Dist: ruff>=0.1; extra == "dev"
29
+ Requires-Dist: mypy>=1.0; extra == "dev"
30
+ Requires-Dist: build>=1.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # copeca &middot; cost per correct answer
34
+
35
+ [![Live site](https://img.shields.io/badge/live_site-1F8A7B?logo=githubpages&logoColor=white)](https://jahala.github.io/copeca/)
36
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
37
+ [![Build](https://img.shields.io/github/actions/workflow/status/jahala/copeca/ci.yml?branch=master)](https://github.com/jahala/copeca/actions)
38
+
39
+ 🌱 **[What is copeca? →](https://jahala.github.io/copeca/)** &nbsp;·&nbsp; the visual overview
40
+
41
+ A neutral, reproducible, verifiable benchmark for CLI-based coding agents.
42
+ Copeca measures **cost per correct answer** — the expected dollar cost before
43
+ getting a right answer — to A/B-compare MCP servers, context compressors,
44
+ hooks, and harness improvements against a clean baseline.
45
+
46
+ ```
47
+ cost_per_correct = total_spend / correct_count
48
+ ```
49
+
50
+ **Why this metric.** "-90% tokens removed!" is a marketing number if it ignores
51
+ whether the answer was *right*. A tool that saves 90% of tokens but makes
52
+ 20% more mistakes saves less per correct answer than claimed. Copeca adjusts every savings
53
+ claim for accuracy, so the number you get is the number that actually matters.
54
+
55
+ **Why a separate benchmark.** Every tool in the ~45-tool agent-efficiency space
56
+ reports savings on its own methodology against its own baseline — the numbers
57
+ are literally incomparable. Copeca holds the agent and model fixed and varies
58
+ *one tool*, answering "did my tool help, and what did it cost?" No existing
59
+ benchmark occupies that lane.
60
+
61
+ ---
62
+
63
+ ## Quick start
64
+
65
+ ```bash
66
+ git clone https://github.com/jahala/copeca && cd copeca
67
+ pip install -e .
68
+ copeca init ./my-benchmark
69
+ copeca run --task scenarios/my-scenario.yaml --runner claude
70
+ copeca analyze results/bench.jsonl
71
+ ```
72
+
73
+ A scenario file defines what to measure:
74
+
75
+ ```yaml
76
+ name: my-tool-vs-baseline
77
+ tasks:
78
+ include: ["rg_*", "fastapi_*"]
79
+ modes: [baseline, my-tool]
80
+ models: [claude-sonnet-4-6]
81
+ model_runner_map:
82
+ claude-sonnet-4-6: claude
83
+ repetitions: 5
84
+ budget_usd: 1.00
85
+ ```
86
+
87
+ The report leads with the cost-per-correct delta between your tool and the
88
+ baseline, with 95% bootstrapped confidence intervals, per-task and
89
+ **per-capability** breakdowns (locate / trace / fix / debug — *where* the tool
90
+ helps, not just an overall number), and adversarial flags that catch token
91
+ snowballing and expensive failures.
92
+
93
+ **Current corpus: 52 tasks** across four real repos (ripgrep, gin, express,
94
+ fastapi — Rust, Go, JavaScript, Python), each tagged by capability so the report
95
+ shows where a tool helps. Broader coverage is on the roadmap; small N still means
96
+ wide confidence intervals — see
97
+ [docs/known-limitations.md](docs/known-limitations.md).
98
+
99
+ ---
100
+
101
+ ## What copeca measures
102
+
103
+ | Dimension | How |
104
+ |---|---|
105
+ | **Cost** | The vendor's billed cost when the runner reports it (the real bill — reflects cache TTL/tier/discounts; frozen into the artifact at run time). copeca also records a reproducible, provider-neutral cross-check: `computed_cost_usd = Σ tokens × runner.pricing[model]`. Token counts are read from the agent CLI and not re-tokenized — see known-limitations. |
106
+ | **Correctness** | Case-insensitive substring matching for comprehension tasks (gameable on single tasks; see known-limitations) or test-command exit codes for edit tasks |
107
+ | **Completeness** | `all_of` field verifies the agent listed *everything* — not just *something* |
108
+ | **Futility** | Adversarial flags: token snowball, talkative failure, tool storm, budget exhaustion, timeout |
109
+ | **Integrity** | Each result is packaged with an integrity manifest — a SHA-256 hash of every file in the artifact. `copeca verify ARTIFACT` recomputes these to detect accidental corruption. The manifest alone is **not tamper-proof**: anyone who rewrites the zip can recompute it. For real tamper-evidence, sign artifacts with `copeca run … --artifacts --sign-key <private.pem>` — this writes a detached **Ed25519** signature over the content hash, and `copeca verify ARTIFACT --pubkey <public.pem>` rejects any artifact a holder of the private key did not sign (so a tampered-and-recomputed artifact fails). Unsigned artifacts get corruption detection only and are reported as unsigned. External transparency-log anchoring is a further planned option. |
110
+
111
+ ---
112
+
113
+ ## Who copeca is for
114
+
115
+ **Tool builders** — MCP/server authors, context compressor developers, code-search
116
+ tool maintainers. You ship a tool and need a number that isn't marketing. Copeca
117
+ gives you cost-per-correct with a delta and CI, and a `.copeca` zip anyone can
118
+ verify.
119
+
120
+ **Platform builders** — CLI agent authors (Codex, OpenCode, Gemini CLI style).
121
+ You need to validate that your pricing model is accurate before customers depend
122
+ on it. Copeca normalizes cost across providers and warns when pricing data is
123
+ stale.
124
+
125
+ **Skeptical evaluators** — Researchers, reviewers, procurement leads. You've
126
+ been burned by contaminated benchmarks and selectively reported results. Copeca's
127
+ artifact model lets you verify any individual result; batch completeness verification
128
+ (`copeca verify --batch --scenario <path>`) confirms all expected runs are present
129
+ and names any specific missing runs.
130
+
131
+ ---
132
+
133
+ ## How copeca works
134
+
135
+ Copeca launches a CLI coding agent as a subprocess against a real open-source
136
+ repo pinned at a known commit. The agent answers a question or fixes a bug.
137
+ Copeca parses the agent's output, checks correctness, computes cost from token
138
+ counts, and writes a JSONL record. A scenario runs the matrix of tasks × modes
139
+ × models × repetitions with parallel git-worktree-isolated workers.
140
+
141
+ **Modes** express the *one variable* that changes between baseline and
142
+ experimental. They cover all five integration types real tools use:
143
+
144
+ | Integration | Mode field | Example |
145
+ |---|---|---|
146
+ | MCP server | `mcp_config` | any MCP server |
147
+ | API proxy (env) | `env` | `ANTHROPIC_BASE_URL` proxy |
148
+ | Config-dir hook | `agent_config` | PreToolUse hook via settings overlay |
149
+ | Process wrapper | `wrapper` | `["your-wrapper-tool", "wrap"]` |
150
+ | Pre-run index | `setup` | per-worktree indexing command |
151
+
152
+ Copeca provisions each arm with its own config directory and an allow-listed
153
+ environment. The baseline arm receives only a minimal set of host vars (infra,
154
+ locale, and provider credentials); all ambient hooks, `CLAUDE_*` vars, and
155
+ `MCP_*` vars are excluded. Experimental modes may declare additional vars via
156
+ `mode.env`, which are merged on top.
157
+
158
+ ---
159
+
160
+ ## Task corpus
161
+
162
+ Tasks are YAML data — no embedded code, no Docker per task. They target real
163
+ open-source repos pinned at exact commits (per task, so one repo can serve
164
+ several code states). The corpus is **52 tasks** across ripgrep, gin, express,
165
+ and fastapi — drawn from six public source families plus a set migrated from the
166
+ tilth benchmark (MIT); each carries a `source:` field with provenance and a
167
+ `category` (locate / trace / fix / debug). Tasks are **tool-agnostic** — they name
168
+ the information required, never the method, so no tool is privileged; `copeca
169
+ validate` lints for it. Every edit task is verified by `copeca check-task`: the
170
+ test must pass on clean code and fail on mutated code, proving the mutation
171
+ actually bites. See [docs/task-taxonomy.md](docs/task-taxonomy.md).
172
+
173
+ **Contamination defense:** `copeca validate` checks every task's `source:`
174
+ field against a blocklist of known-contaminated source benchmarks (SWE-bench
175
+ Verified, RepoBench, ClassEval, DevEval, CoderEval). A task from any of
176
+ these sources is rejected before it can enter the corpus. This is a static
177
+ provenance check — no model calls, no network. A planned authoring-time
178
+ option (requires an API key) will also probe a live model with the task ID
179
+ and exclude it if the model reproduces the gold solution from memory; that
180
+ feature is not shipped yet.
181
+
182
+ ---
183
+
184
+ ## Runners
185
+
186
+ The runner interface is **config-driven**: a runner is a YAML file in
187
+ `defaults/runners/` declaring the CLI binary, its argument mapping, its config-dir
188
+ env var, and which output parser to use — plus a pricing table. Copeca builds the
189
+ subprocess invocation from that YAML, so adding an agent CLI means writing a YAML,
190
+ not editing copeca's code. See
191
+ [docs/runner-configuration.md](docs/runner-configuration.md).
192
+
193
+ To compute cost, copeca requires the *minimum* from the agent's output: token
194
+ counts. From those it derives `computed_cost_usd` — a reproducible, provider-neutral
195
+ cross-check; when the runner also reports its own billed cost, that vendor figure is
196
+ the headline. Duration and completion are derived from the output too.
197
+
198
+ ```jsonl
199
+ {"type": "turn", "input_tokens": 5000, "output_tokens": 200,
200
+ "cache_creation_tokens": 3500, "cache_read_tokens": 3000}
201
+ {"type": "assistant_message", "text": "...", "turn": 2}
202
+ {"type": "result", "total_cost_usd": 0.0734, "duration_ms": 45230}
203
+ ```
204
+
205
+ Two runners ship today: **Claude Code** (`stream_json` parser) and **OpenAI
206
+ Codex** (`codex_json` parser) — each added as a YAML plus a parser, with no
207
+ changes to copeca's core. A CLI with a different output format needs a matching
208
+ parser, and a runner YAML naming an unbuilt parser fails loudly rather than
209
+ silently miscounting.
210
+
211
+ ---
212
+
213
+ ## Install
214
+
215
+ A built wheel bundles its runtime data (`schemas/`, `tasks/`, `defaults/`, and
216
+ `repos.yaml`), so a pip install is fully functional — `copeca init`, `validate`,
217
+ and `run` work off the packaged corpus. Copeca is **not** published on PyPI yet,
218
+ so install from git or a source checkout:
219
+
220
+ ```bash
221
+ pip install git+https://github.com/jahala/copeca
222
+ ```
223
+
224
+ Or from a clone (use `-e` for development):
225
+
226
+ ```bash
227
+ git clone https://github.com/jahala/copeca
228
+ cd copeca
229
+ pip install .
230
+ ```
231
+
232
+ Requires Python ≥ 3.11. The Claude Code and Codex runners ship ready to use; the
233
+ runner interface is config-driven, so other CLIs are added by writing a YAML (and,
234
+ if their output format differs, a parser). See
235
+ [docs/runner-configuration.md](docs/runner-configuration.md).
236
+
237
+ ---
238
+
239
+ ## Documentation
240
+
241
+ - [Task authoring guide](docs/task-authoring.md) — write comprehension and edit tasks
242
+ - [Runner configuration](docs/runner-configuration.md) — output contract, pricing
243
+ - [Metrics & methodology](docs/metrics.md) — cost-per-correct math, delta-not-absolute
244
+ - [Known limitations](docs/known-limitations.md) — string matching, bootstrap CIs, modeled cost
245
+
246
+ ---
247
+
248
+ ## Support
249
+
250
+ [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://buymeacoffee.com/jahala)
251
+
252
+ ## License
253
+
254
+ MIT — see [LICENSE](LICENSE).
255
+
256
+ Copeca's bundled task corpus is derived from independent benchmark sources
257
+ under permissive licenses (Apache-2.0, MIT, CC BY 4.0). Each task carries a
258
+ `source:` field with provenance. Tasks from NonCommercial, ShareAlike, or
259
+ no-license sources are explicitly excluded.
260
+
261
+ ---
262
+
263
+ ## Related
264
+
265
+ Copeca is part of the [plotplot](https://github.com/plotplot-ai) garden of small,
266
+ sharp tools for building with AI. Siblings:
267
+ [tilth](https://github.com/jahala/tilth) (AST-aware code intelligence),
268
+ [umbel](https://github.com/jahala/umbel) (drive many agent CLIs from one session),
269
+ [pleach](https://github.com/jahala/pleach) (conduct agent work in isolated worktrees),
270
+ [petals](https://github.com/jahala/petals) (brand intelligence),
271
+ [tend](https://github.com/jahala/tend) (feature mapping across sessions).
copeca-0.1.0/README.md ADDED
@@ -0,0 +1,239 @@
1
+ # copeca &middot; cost per correct answer
2
+
3
+ [![Live site](https://img.shields.io/badge/live_site-1F8A7B?logo=githubpages&logoColor=white)](https://jahala.github.io/copeca/)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
5
+ [![Build](https://img.shields.io/github/actions/workflow/status/jahala/copeca/ci.yml?branch=master)](https://github.com/jahala/copeca/actions)
6
+
7
+ 🌱 **[What is copeca? →](https://jahala.github.io/copeca/)** &nbsp;·&nbsp; the visual overview
8
+
9
+ A neutral, reproducible, verifiable benchmark for CLI-based coding agents.
10
+ Copeca measures **cost per correct answer** — the expected dollar cost before
11
+ getting a right answer — to A/B-compare MCP servers, context compressors,
12
+ hooks, and harness improvements against a clean baseline.
13
+
14
+ ```
15
+ cost_per_correct = total_spend / correct_count
16
+ ```
17
+
18
+ **Why this metric.** "-90% tokens removed!" is a marketing number if it ignores
19
+ whether the answer was *right*. A tool that saves 90% of tokens but makes
20
+ 20% more mistakes saves less per correct answer than claimed. Copeca adjusts every savings
21
+ claim for accuracy, so the number you get is the number that actually matters.
22
+
23
+ **Why a separate benchmark.** Every tool in the ~45-tool agent-efficiency space
24
+ reports savings on its own methodology against its own baseline — the numbers
25
+ are literally incomparable. Copeca holds the agent and model fixed and varies
26
+ *one tool*, answering "did my tool help, and what did it cost?" No existing
27
+ benchmark occupies that lane.
28
+
29
+ ---
30
+
31
+ ## Quick start
32
+
33
+ ```bash
34
+ git clone https://github.com/jahala/copeca && cd copeca
35
+ pip install -e .
36
+ copeca init ./my-benchmark
37
+ copeca run --task scenarios/my-scenario.yaml --runner claude
38
+ copeca analyze results/bench.jsonl
39
+ ```
40
+
41
+ A scenario file defines what to measure:
42
+
43
+ ```yaml
44
+ name: my-tool-vs-baseline
45
+ tasks:
46
+ include: ["rg_*", "fastapi_*"]
47
+ modes: [baseline, my-tool]
48
+ models: [claude-sonnet-4-6]
49
+ model_runner_map:
50
+ claude-sonnet-4-6: claude
51
+ repetitions: 5
52
+ budget_usd: 1.00
53
+ ```
54
+
55
+ The report leads with the cost-per-correct delta between your tool and the
56
+ baseline, with 95% bootstrapped confidence intervals, per-task and
57
+ **per-capability** breakdowns (locate / trace / fix / debug — *where* the tool
58
+ helps, not just an overall number), and adversarial flags that catch token
59
+ snowballing and expensive failures.
60
+
61
+ **Current corpus: 52 tasks** across four real repos (ripgrep, gin, express,
62
+ fastapi — Rust, Go, JavaScript, Python), each tagged by capability so the report
63
+ shows where a tool helps. Broader coverage is on the roadmap; small N still means
64
+ wide confidence intervals — see
65
+ [docs/known-limitations.md](docs/known-limitations.md).
66
+
67
+ ---
68
+
69
+ ## What copeca measures
70
+
71
+ | Dimension | How |
72
+ |---|---|
73
+ | **Cost** | The vendor's billed cost when the runner reports it (the real bill — reflects cache TTL/tier/discounts; frozen into the artifact at run time). copeca also records a reproducible, provider-neutral cross-check: `computed_cost_usd = Σ tokens × runner.pricing[model]`. Token counts are read from the agent CLI and not re-tokenized — see known-limitations. |
74
+ | **Correctness** | Case-insensitive substring matching for comprehension tasks (gameable on single tasks; see known-limitations) or test-command exit codes for edit tasks |
75
+ | **Completeness** | `all_of` field verifies the agent listed *everything* — not just *something* |
76
+ | **Futility** | Adversarial flags: token snowball, talkative failure, tool storm, budget exhaustion, timeout |
77
+ | **Integrity** | Each result is packaged with an integrity manifest — a SHA-256 hash of every file in the artifact. `copeca verify ARTIFACT` recomputes these to detect accidental corruption. The manifest alone is **not tamper-proof**: anyone who rewrites the zip can recompute it. For real tamper-evidence, sign artifacts with `copeca run … --artifacts --sign-key <private.pem>` — this writes a detached **Ed25519** signature over the content hash, and `copeca verify ARTIFACT --pubkey <public.pem>` rejects any artifact a holder of the private key did not sign (so a tampered-and-recomputed artifact fails). Unsigned artifacts get corruption detection only and are reported as unsigned. External transparency-log anchoring is a further planned option. |
78
+
79
+ ---
80
+
81
+ ## Who copeca is for
82
+
83
+ **Tool builders** — MCP/server authors, context compressor developers, code-search
84
+ tool maintainers. You ship a tool and need a number that isn't marketing. Copeca
85
+ gives you cost-per-correct with a delta and CI, and a `.copeca` zip anyone can
86
+ verify.
87
+
88
+ **Platform builders** — CLI agent authors (Codex, OpenCode, Gemini CLI style).
89
+ You need to validate that your pricing model is accurate before customers depend
90
+ on it. Copeca normalizes cost across providers and warns when pricing data is
91
+ stale.
92
+
93
+ **Skeptical evaluators** — Researchers, reviewers, procurement leads. You've
94
+ been burned by contaminated benchmarks and selectively reported results. Copeca's
95
+ artifact model lets you verify any individual result; batch completeness verification
96
+ (`copeca verify --batch --scenario <path>`) confirms all expected runs are present
97
+ and names any specific missing runs.
98
+
99
+ ---
100
+
101
+ ## How copeca works
102
+
103
+ Copeca launches a CLI coding agent as a subprocess against a real open-source
104
+ repo pinned at a known commit. The agent answers a question or fixes a bug.
105
+ Copeca parses the agent's output, checks correctness, computes cost from token
106
+ counts, and writes a JSONL record. A scenario runs the matrix of tasks × modes
107
+ × models × repetitions with parallel git-worktree-isolated workers.
108
+
109
+ **Modes** express the *one variable* that changes between baseline and
110
+ experimental. They cover all five integration types real tools use:
111
+
112
+ | Integration | Mode field | Example |
113
+ |---|---|---|
114
+ | MCP server | `mcp_config` | any MCP server |
115
+ | API proxy (env) | `env` | `ANTHROPIC_BASE_URL` proxy |
116
+ | Config-dir hook | `agent_config` | PreToolUse hook via settings overlay |
117
+ | Process wrapper | `wrapper` | `["your-wrapper-tool", "wrap"]` |
118
+ | Pre-run index | `setup` | per-worktree indexing command |
119
+
120
+ Copeca provisions each arm with its own config directory and an allow-listed
121
+ environment. The baseline arm receives only a minimal set of host vars (infra,
122
+ locale, and provider credentials); all ambient hooks, `CLAUDE_*` vars, and
123
+ `MCP_*` vars are excluded. Experimental modes may declare additional vars via
124
+ `mode.env`, which are merged on top.
125
+
126
+ ---
127
+
128
+ ## Task corpus
129
+
130
+ Tasks are YAML data — no embedded code, no Docker per task. They target real
131
+ open-source repos pinned at exact commits (per task, so one repo can serve
132
+ several code states). The corpus is **52 tasks** across ripgrep, gin, express,
133
+ and fastapi — drawn from six public source families plus a set migrated from the
134
+ tilth benchmark (MIT); each carries a `source:` field with provenance and a
135
+ `category` (locate / trace / fix / debug). Tasks are **tool-agnostic** — they name
136
+ the information required, never the method, so no tool is privileged; `copeca
137
+ validate` lints for it. Every edit task is verified by `copeca check-task`: the
138
+ test must pass on clean code and fail on mutated code, proving the mutation
139
+ actually bites. See [docs/task-taxonomy.md](docs/task-taxonomy.md).
140
+
141
+ **Contamination defense:** `copeca validate` checks every task's `source:`
142
+ field against a blocklist of known-contaminated source benchmarks (SWE-bench
143
+ Verified, RepoBench, ClassEval, DevEval, CoderEval). A task from any of
144
+ these sources is rejected before it can enter the corpus. This is a static
145
+ provenance check — no model calls, no network. A planned authoring-time
146
+ option (requires an API key) will also probe a live model with the task ID
147
+ and exclude it if the model reproduces the gold solution from memory; that
148
+ feature is not shipped yet.
149
+
150
+ ---
151
+
152
+ ## Runners
153
+
154
+ The runner interface is **config-driven**: a runner is a YAML file in
155
+ `defaults/runners/` declaring the CLI binary, its argument mapping, its config-dir
156
+ env var, and which output parser to use — plus a pricing table. Copeca builds the
157
+ subprocess invocation from that YAML, so adding an agent CLI means writing a YAML,
158
+ not editing copeca's code. See
159
+ [docs/runner-configuration.md](docs/runner-configuration.md).
160
+
161
+ To compute cost, copeca requires the *minimum* from the agent's output: token
162
+ counts. From those it derives `computed_cost_usd` — a reproducible, provider-neutral
163
+ cross-check; when the runner also reports its own billed cost, that vendor figure is
164
+ the headline. Duration and completion are derived from the output too.
165
+
166
+ ```jsonl
167
+ {"type": "turn", "input_tokens": 5000, "output_tokens": 200,
168
+ "cache_creation_tokens": 3500, "cache_read_tokens": 3000}
169
+ {"type": "assistant_message", "text": "...", "turn": 2}
170
+ {"type": "result", "total_cost_usd": 0.0734, "duration_ms": 45230}
171
+ ```
172
+
173
+ Two runners ship today: **Claude Code** (`stream_json` parser) and **OpenAI
174
+ Codex** (`codex_json` parser) — each added as a YAML plus a parser, with no
175
+ changes to copeca's core. A CLI with a different output format needs a matching
176
+ parser, and a runner YAML naming an unbuilt parser fails loudly rather than
177
+ silently miscounting.
178
+
179
+ ---
180
+
181
+ ## Install
182
+
183
+ A built wheel bundles its runtime data (`schemas/`, `tasks/`, `defaults/`, and
184
+ `repos.yaml`), so a pip install is fully functional — `copeca init`, `validate`,
185
+ and `run` work off the packaged corpus. Copeca is **not** published on PyPI yet,
186
+ so install from git or a source checkout:
187
+
188
+ ```bash
189
+ pip install git+https://github.com/jahala/copeca
190
+ ```
191
+
192
+ Or from a clone (use `-e` for development):
193
+
194
+ ```bash
195
+ git clone https://github.com/jahala/copeca
196
+ cd copeca
197
+ pip install .
198
+ ```
199
+
200
+ Requires Python ≥ 3.11. The Claude Code and Codex runners ship ready to use; the
201
+ runner interface is config-driven, so other CLIs are added by writing a YAML (and,
202
+ if their output format differs, a parser). See
203
+ [docs/runner-configuration.md](docs/runner-configuration.md).
204
+
205
+ ---
206
+
207
+ ## Documentation
208
+
209
+ - [Task authoring guide](docs/task-authoring.md) — write comprehension and edit tasks
210
+ - [Runner configuration](docs/runner-configuration.md) — output contract, pricing
211
+ - [Metrics & methodology](docs/metrics.md) — cost-per-correct math, delta-not-absolute
212
+ - [Known limitations](docs/known-limitations.md) — string matching, bootstrap CIs, modeled cost
213
+
214
+ ---
215
+
216
+ ## Support
217
+
218
+ [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://buymeacoffee.com/jahala)
219
+
220
+ ## License
221
+
222
+ MIT — see [LICENSE](LICENSE).
223
+
224
+ Copeca's bundled task corpus is derived from independent benchmark sources
225
+ under permissive licenses (Apache-2.0, MIT, CC BY 4.0). Each task carries a
226
+ `source:` field with provenance. Tasks from NonCommercial, ShareAlike, or
227
+ no-license sources are explicitly excluded.
228
+
229
+ ---
230
+
231
+ ## Related
232
+
233
+ Copeca is part of the [plotplot](https://github.com/plotplot-ai) garden of small,
234
+ sharp tools for building with AI. Siblings:
235
+ [tilth](https://github.com/jahala/tilth) (AST-aware code intelligence),
236
+ [umbel](https://github.com/jahala/umbel) (drive many agent CLIs from one session),
237
+ [pleach](https://github.com/jahala/pleach) (conduct agent work in isolated worktrees),
238
+ [petals](https://github.com/jahala/petals) (brand intelligence),
239
+ [tend](https://github.com/jahala/tend) (feature mapping across sessions).
@@ -0,0 +1,78 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "copeca"
7
+ version = "0.1.0"
8
+ description = "Cost per correct answer — a neutral, reproducible, verifiable benchmark for CLI-based coding agents"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Jan Hallvard Larsen", email = "jan@plotplot.ai" }]
13
+ keywords = ["benchmark", "ai-coding", "agent", "mcp", "cost-efficiency", "evaluation"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Software Development :: Testing",
21
+ ]
22
+ dependencies = [
23
+ "typer>=0.9",
24
+ "pyyaml>=6.0",
25
+ "jsonschema>=4.20",
26
+ "pydantic>=2.0",
27
+ "cryptography>=42.0",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://jahala.github.io/copeca/"
32
+ Repository = "https://github.com/jahala/copeca"
33
+ Issues = "https://github.com/jahala/copeca/issues"
34
+ Funding = "https://buymeacoffee.com/jahala"
35
+
36
+ [project.optional-dependencies]
37
+ dev = [
38
+ "pytest>=7.0",
39
+ "ruff>=0.1",
40
+ "mypy>=1.0",
41
+ "build>=1.0",
42
+ ]
43
+
44
+ [project.scripts]
45
+ copeca = "copeca.cli:app"
46
+
47
+ [tool.setuptools.packages.find]
48
+ where = ["src"]
49
+
50
+ [tool.setuptools.package-data]
51
+ copeca = ["data/**/*"]
52
+
53
+ [tool.ruff]
54
+ line-length = 100
55
+ target-version = "py311"
56
+ [tool.ruff.lint]
57
+ select = ["E", "F", "I", "N", "W", "UP", "B", "SIM"]
58
+ ignore = [
59
+ # B008: typer.Argument/Option() as function defaults is the canonical Typer idiom.
60
+ "B008",
61
+ # SIM102: nested-if collapses validator guard structure; keep separate for readability.
62
+ "SIM102",
63
+ # SIM108: ternary replaces a legible if/else in parser result dispatch; keep readable.
64
+ "SIM108",
65
+ # UP042: str+Enum gives string-serialisable enums without breaking JSON round-trips.
66
+ "UP042",
67
+ ]
68
+
69
+ [tool.mypy]
70
+ strict = true
71
+ python_version = "3.11"
72
+
73
+ [tool.pytest.ini_options]
74
+ testpaths = ["tests"]
75
+ python_files = ["test_*.py"]
76
+ markers = [
77
+ "e2e: end-to-end hermetic pipeline tests (no network, no API, no real LLM)",
78
+ ]
copeca-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """copeca — cost per correct answer. A neutral, reproducible benchmark for CLI coding agents."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,10 @@
1
+ """Module entry point so `python -m copeca` runs the CLI.
2
+
3
+ Tests and tooling invoke the CLI via ``sys.executable -m copeca`` for a portable
4
+ entry point that does not depend on a .venv/bin path or on PATH resolution.
5
+ """
6
+
7
+ from copeca.cli import app
8
+
9
+ if __name__ == "__main__":
10
+ app()