copeca 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. copeca/__init__.py +3 -0
  2. copeca/__main__.py +10 -0
  3. copeca/agnosticism.py +75 -0
  4. copeca/analysis/__init__.py +0 -0
  5. copeca/analysis/compare.py +126 -0
  6. copeca/analysis/report.py +457 -0
  7. copeca/analysis/stats.py +189 -0
  8. copeca/cli.py +795 -0
  9. copeca/config/__init__.py +0 -0
  10. copeca/config/loader.py +273 -0
  11. copeca/config/models.py +296 -0
  12. copeca/config/resources.py +16 -0
  13. copeca/contamination.py +75 -0
  14. copeca/data/contamination_blocklist.txt +24 -0
  15. copeca/data/defaults/modes/baseline.yaml +9 -0
  16. copeca/data/defaults/modes/hook.yaml +14 -0
  17. copeca/data/defaults/modes/indexed.yaml +12 -0
  18. copeca/data/defaults/modes/proxy.yaml +14 -0
  19. copeca/data/defaults/modes/wrapper.yaml +16 -0
  20. copeca/data/defaults/runners/claude.yaml +43 -0
  21. copeca/data/defaults/runners/codex.yaml +48 -0
  22. copeca/data/repos.yaml +44 -0
  23. copeca/data/schemas/scenario.schema.json +97 -0
  24. copeca/data/schemas/task.schema.json +155 -0
  25. copeca/data/tasks/express/express_app_init.yaml +21 -0
  26. copeca/data/tasks/express/express_app_render.yaml +20 -0
  27. copeca/data/tasks/express/express_diff_multi_mutation.yaml +47 -0
  28. copeca/data/tasks/express/express_edit_cookie_prefix.yaml +36 -0
  29. copeca/data/tasks/express/express_edit_json_type.yaml +35 -0
  30. copeca/data/tasks/express/express_edit_send_type.yaml +36 -0
  31. copeca/data/tasks/express/express_json_send.yaml +21 -0
  32. copeca/data/tasks/express/express_render_chain.yaml +21 -0
  33. copeca/data/tasks/express/express_res_send.yaml +18 -0
  34. copeca/data/tasks/express/swebenchlive_fix_middleware.yaml +34 -0
  35. copeca/data/tasks/express/t004_express_routing.yaml +28 -0
  36. copeca/data/tasks/express/t008_express_fix_route.yaml +31 -0
  37. copeca/data/tasks/fastapi/fastapi_dependency_resolution.yaml +21 -0
  38. copeca/data/tasks/fastapi/fastapi_depends_callers.yaml +19 -0
  39. copeca/data/tasks/fastapi/fastapi_depends_function.yaml +17 -0
  40. copeca/data/tasks/fastapi/fastapi_depends_internals.yaml +19 -0
  41. copeca/data/tasks/fastapi/fastapi_depends_processing.yaml +21 -0
  42. copeca/data/tasks/fastapi/fastapi_diff_which_commit.yaml +45 -0
  43. copeca/data/tasks/fastapi/fastapi_edit_dep_cache.yaml +33 -0
  44. copeca/data/tasks/fastapi/fastapi_edit_response_filter.yaml +33 -0
  45. copeca/data/tasks/fastapi/fastapi_edit_scope_cache.yaml +34 -0
  46. copeca/data/tasks/fastapi/fastapi_request_validation.yaml +20 -0
  47. copeca/data/tasks/fastapi/lca_bug_localization.yaml +26 -0
  48. copeca/data/tasks/fastapi/t002_fastapi_routing.yaml +28 -0
  49. copeca/data/tasks/fastapi/t006_fastapi_fix_status.yaml +29 -0
  50. copeca/data/tasks/fastapi/t010_fastapi_fix_validation.yaml +32 -0
  51. copeca/data/tasks/gin/crosscode_discovery.yaml +31 -0
  52. copeca/data/tasks/gin/gin_client_ip.yaml +20 -0
  53. copeca/data/tasks/gin/gin_context_next_peers.yaml +20 -0
  54. copeca/data/tasks/gin/gin_diff_comprehension.yaml +29 -0
  55. copeca/data/tasks/gin/gin_edit_abort_check.yaml +30 -0
  56. copeca/data/tasks/gin/gin_edit_context_reset.yaml +31 -0
  57. copeca/data/tasks/gin/gin_edit_middleware_skip.yaml +31 -0
  58. copeca/data/tasks/gin/gin_new_constructor.yaml +21 -0
  59. copeca/data/tasks/gin/gin_radix_tree.yaml +20 -0
  60. copeca/data/tasks/gin/gin_servehttp_flow.yaml +20 -0
  61. copeca/data/tasks/gin/t003_gin_middleware.yaml +28 -0
  62. copeca/data/tasks/gin/t007_gin_fix_binding.yaml +33 -0
  63. copeca/data/tasks/ripgrep/rg_diff_misdirected_error.yaml +32 -0
  64. copeca/data/tasks/ripgrep/rg_edit_line_count.yaml +31 -0
  65. copeca/data/tasks/ripgrep/rg_edit_line_locate.yaml +32 -0
  66. copeca/data/tasks/ripgrep/rg_edit_preceding.yaml +33 -0
  67. copeca/data/tasks/ripgrep/rg_flag_definition.yaml +19 -0
  68. copeca/data/tasks/ripgrep/rg_lineiter_definition.yaml +17 -0
  69. copeca/data/tasks/ripgrep/rg_lineiter_usage.yaml +19 -0
  70. copeca/data/tasks/ripgrep/rg_walker_parallel.yaml +18 -0
  71. copeca/data/tasks/ripgrep/scbench_find_function.yaml +25 -0
  72. copeca/data/tasks/ripgrep/t001_find_matcher_trait.yaml +25 -0
  73. copeca/data/tasks/ripgrep/t005_ripgrep_search_flow.yaml +28 -0
  74. copeca/data/tasks/ripgrep/t009_ripgrep_fix_pattern.yaml +33 -0
  75. copeca/data/tasks/ripgrep/terminal_cli_task.yaml +34 -0
  76. copeca/data/tasks/ripgrep/trait_implementors.yaml +24 -0
  77. copeca/orchestration/__init__.py +0 -0
  78. copeca/orchestration/check.py +124 -0
  79. copeca/orchestration/run.py +537 -0
  80. copeca/orchestration/state.py +128 -0
  81. copeca/orchestration/validation.py +213 -0
  82. copeca/repos/__init__.py +0 -0
  83. copeca/repos/manager.py +338 -0
  84. copeca/results/__init__.py +0 -0
  85. copeca/results/artifact.py +156 -0
  86. copeca/results/signing.py +137 -0
  87. copeca/results/verification.py +412 -0
  88. copeca/results/writer.py +20 -0
  89. copeca/runners/__init__.py +0 -0
  90. copeca/runners/base.py +172 -0
  91. copeca/runners/cost.py +31 -0
  92. copeca/runners/parsers/__init__.py +47 -0
  93. copeca/runners/parsers/base.py +79 -0
  94. copeca/runners/parsers/codex_json.py +155 -0
  95. copeca/runners/parsers/stream_json.py +116 -0
  96. copeca/runners/subprocess.py +152 -0
  97. copeca/tasks/__init__.py +0 -0
  98. copeca/tasks/mutations.py +93 -0
  99. copeca/tasks/validator.py +114 -0
  100. copeca-0.1.0.dist-info/METADATA +271 -0
  101. copeca-0.1.0.dist-info/RECORD +105 -0
  102. copeca-0.1.0.dist-info/WHEEL +5 -0
  103. copeca-0.1.0.dist-info/entry_points.txt +2 -0
  104. copeca-0.1.0.dist-info/licenses/LICENSE +21 -0
  105. copeca-0.1.0.dist-info/top_level.txt +1 -0
copeca/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """copeca — cost per correct answer. A neutral, reproducible benchmark for CLI coding agents."""
2
+
3
+ __version__ = "0.1.0"
copeca/__main__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Module entry point so `python -m copeca` runs the CLI.
2
+
3
+ Tests and tooling invoke the CLI via ``sys.executable -m copeca`` for a portable
4
+ entry point that does not depend on a .venv/bin path or on PATH resolution.
5
+ """
6
+
7
+ from copeca.cli import app
8
+
9
# Start the CLI only when this module is executed (e.g. ``python -m copeca``);
# merely importing ``copeca.__main__`` must not run it.
if __name__ == "__main__":
    app()
copeca/agnosticism.py ADDED
@@ -0,0 +1,75 @@
1
+ """Tool-agnosticism lint for task text — pure functions, no I/O.
2
+
3
+ A copeca task must name the INFORMATION or OUTCOME it requires, never the METHOD
4
+ the agent should use to get it: no tool names, no "search for / grep", and no cue
5
+ that rewards one tool's output shape ("one structured answer", "not piecemeal
6
+ searches"). This keeps the A/B fair — the retrieval method is the variable under
7
+ test, so a task that prescribes it pre-judges the experiment. It is the same
8
+ neutrality rule the corpus applies to prompts, made checkable.
9
+
10
+ Architecture (S.U.P.E.R.): pure functions, text in / violations out. The repo name
11
+ (ripgrep, gin, express, fastapi) is the task's SUBJECT and is never flagged; only
12
+ tool naming, single-shot-aggregator priming, and explicit method prescription are.
13
+ Patterns are curated for precision — domain words like "search" (ripgrep's whole
14
+ purpose) are not flagged unless used as a prescribed method ("search the codebase").
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import re
20
+
21
def _tool_name(words: str) -> re.Pattern[str]:
    """Compile a case-insensitive pattern for a tool / product name.

    A plain ``\\b`` treats ``_`` as a word character, so it would miss
    identifiers such as ``tilth_search`` or ``ctags_lookup``. Excluding only
    letters on both sides catches those while still ignoring words that merely
    contain the name (e.g. "grokking").

    Args:
        words: Regex alternation of the name(s) to match, e.g. ``"lsp"``.

    Returns:
        The compiled pattern.
    """
    return re.compile(rf"(?<![a-z])(?:{words})(?![a-z])", re.I)


# (compiled pattern, human-readable reason). All case-insensitive.
_FORBIDDEN: list[tuple[re.Pattern[str], str]] = [
    # Explicit tool / product names (the experimental tool or any named retriever).
    # All of them share the underscore-aware boundary from _tool_name so that a
    # name embedded in an identifier (`tilth_search`, `ctags_lookup`) is caught.
    (_tool_name("tilth"), "names the tilth tool"),
    (_tool_name("grok"), "names the grok tool"),
    (_tool_name("ctags"), "names a specific tool (ctags)"),
    (_tool_name("language server|lsp"), "names a specific tool (LSP)"),
    # Single-shot-aggregator priming — rewards a tool that returns one bundled answer.
    (
        re.compile(r"structured answer", re.I),
        "primes a single-shot aggregator ('structured answer')",
    ),
    (
        re.compile(r"consolidated view", re.I),
        "primes a single-shot aggregator ('consolidated view')",
    ),
    (re.compile(r"piecemeal", re.I), "disparages multi-step retrieval ('piecemeal')"),
    (re.compile(r"in (?:a )?single call|in one call", re.I), "primes a single-call tool"),
    (
        re.compile(r"(?:several|multiple|partial) (?:searches|calls|answers)", re.I),
        "frames retrieval as a count of searches/calls",
    ),
    # Explicit method prescription (how to retrieve, not what to retrieve).
    (
        re.compile(r"grep for|run a search|do a search|search the codebase", re.I),
        "prescribes a search method",
    ),
    (re.compile(r"use (?:your |the )?[\w-]+ tool", re.I), "prescribes using a specific tool"),
]


def check_tool_agnostic(name: str, prompt: str, description: str = "") -> list[str]:
    """Return tool-coupling violations in a task's text (empty list = clean).

    Scans ``name`` + ``prompt`` + ``description`` (case-insensitive) for forbidden
    tool names, single-shot-aggregator priming, and explicit method prescription.
    Repo names are NOT flagged — they are the task's subject, not a method.

    Args:
        name: The task's ``name`` field.
        prompt: The prompt sent to the agent (the main surface to keep neutral).
        description: The human-readable ``description`` field (optional).

    Returns:
        A list of distinct human-readable violation reasons, in the order the
        rules are declared in ``_FORBIDDEN``; empty when clean.
    """
    # Newline-joined so a pattern can never match across two separate fields.
    text = f"{name}\n{prompt}\n{description}"
    # dict.fromkeys keeps first-seen order while dropping any repeated reason.
    return list(
        dict.fromkeys(reason for pattern, reason in _FORBIDDEN if pattern.search(text))
    )
File without changes
@@ -0,0 +1,126 @@
1
+ """Run comparison — pairwise comparison of two JSONL result sets.
2
+
3
+ Architecture: domain. Pure computation. No I/O, no imports from
4
+ runners/repos/results/orchestration.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ from copeca.analysis.stats import cost_per_correct
10
+
11
+
12
+ def _fmt_cpc(cpc: float | None) -> str:
13
+ """Format a cost-per-correct value for display."""
14
+ return f"${cpc:.4f}" if cpc is not None else "n/a (0 correct)"
15
+
16
+
17
def _records_by_task(records: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
    """Bucket records by their ``task`` value, keeping the original record order."""
    buckets: dict[str, list[dict[str, Any]]] = {}
    for rec in records:
        buckets.setdefault(rec["task"], []).append(rec)
    return buckets


def compare_runs(before: list[dict[str, Any]], after: list[dict[str, Any]]) -> str:
    """Compare two JSONL result sets, produce markdown with per-task deltas.

    Flags tasks where cost-per-correct changed by more than 10%. A task present
    in only one run is listed with ``*missing*`` on the other side. When either
    side's cost-per-correct is undefined (``None``) the delta is shown as N/A.
    A zero "before" cost with a positive "after" cost counts as an infinite
    increase.

    NOTE(review): unlike ``generate_report``, records carrying an ``error`` field
    are not filtered out here — confirm whether that is intended.

    Args:
        before: List of records from the baseline / before run. Every record
            must have a ``task`` key.
        after: List of records from the experimental / after run. Every record
            must have a ``task`` key.

    Returns:
        Markdown string with comparison report.

    Raises:
        KeyError: If a record has no ``task`` key.
    """
    if not before and not after:
        return "# Run Comparison\n\n*No results in either run.*\n"

    # Group each run once instead of re-scanning both record lists for every task.
    before_by_task = _records_by_task(before)
    after_by_task = _records_by_task(after)

    before_tasks: set[str] = set(before_by_task)
    after_tasks: set[str] = set(after_by_task)
    all_tasks = sorted(before_tasks | after_tasks)

    only_before = before_tasks - after_tasks
    only_after = after_tasks - before_tasks

    # ── Per-task table ─────────────────────────────────────────────────────
    lines: list[str] = [
        "# Run Comparison",
        "",
        "## Per-Task Deltas",
        "",
        "| Task | Before CPC | After CPC | Delta% |",
        "|------|-----------:|----------:|-------:|",
    ]

    flags: list[tuple[str, float]] = []

    for task in all_tasks:
        before_task = before_by_task.get(task, [])
        after_task = after_by_task.get(task, [])

        cpc_before: float | None = cost_per_correct(before_task) if before_task else None
        cpc_after: float | None = cost_per_correct(after_task) if after_task else None

        # Every task comes from the union, so at least one side is non-empty.
        if not after_task:
            lines.append(f"| {task} | {_fmt_cpc(cpc_before)} | *missing* | N/A |")
            continue
        if not before_task:
            lines.append(f"| {task} | *missing* | {_fmt_cpc(cpc_after)} | N/A |")
            continue

        if cpc_before is None or cpc_after is None:
            # At least one side has 0 correct — delta is undefined
            lines.append(f"| {task} | {_fmt_cpc(cpc_before)} | {_fmt_cpc(cpc_after)} | N/A |")
            continue

        if cpc_before > 0:
            delta_pct = ((cpc_after - cpc_before) / cpc_before) * 100
        elif cpc_after > 0:
            # Free before, costly after: no finite ratio exists.
            delta_pct = float("inf")
        else:
            delta_pct = 0.0

        flag = ""
        if abs(delta_pct) > 10:
            flags.append((task, delta_pct))
            flag = " **>10%**"

        lines.append(
            f"| {task} | {_fmt_cpc(cpc_before)} | {_fmt_cpc(cpc_after)} "
            f"| {delta_pct:+.1f}%{flag} |"
        )

    lines.append("")

    # ── Overall stats ──────────────────────────────────────────────────────
    cpc_all_before: float | None = cost_per_correct(before) if before else None
    cpc_all_after: float | None = cost_per_correct(after) if after else None
    lines.append("## Overall")
    lines.append("")
    lines.append(f"- **Before:** {len(before)} records across {len(before_tasks)} tasks")
    lines.append(f"- **After:** {len(after)} records across {len(after_tasks)} tasks")
    lines.append(f"- **Overall CPC before:** {_fmt_cpc(cpc_all_before)}")
    lines.append(f"- **Overall CPC after:** {_fmt_cpc(cpc_all_after)}")

    # The overall delta line is only emitted when a finite ratio exists.
    if cpc_all_before is not None and cpc_all_after is not None and cpc_all_before > 0:
        overall_delta = ((cpc_all_after - cpc_all_before) / cpc_all_before) * 100
        lines.append(f"- **Overall delta:** {overall_delta:+.1f}%")
    lines.append("")

    # ── Flagged large changes ──────────────────────────────────────────────
    if flags:
        lines.append("## Flagged Tasks (>10% Change)")
        lines.append("")
        for task, delta in flags:
            direction = "decrease" if delta < 0 else "increase"
            lines.append(f"- **{task}:** {delta:+.1f}% {direction}")

        lines.append("")

    # ── Missing tasks ──────────────────────────────────────────────────────
    if only_before or only_after:
        lines.append("## Task Coverage Changes")
        lines.append("")
        if only_before:
            tasks_str = ", ".join(sorted(only_before))
            lines.append(f"- **Removed:** {tasks_str}")
        if only_after:
            tasks_str = ", ".join(sorted(only_after))
            lines.append(f"- **Added:** {tasks_str}")
        lines.append("")

    return "\n".join(lines)
@@ -0,0 +1,457 @@
1
+ """Markdown report generation from JSONL records.
2
+
3
+ Architecture: domain. Pure text generation — no I/O, no imports from
4
+ runners/repos/results/orchestration.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ from copeca.analysis.stats import (
10
+ ascii_sparkline,
11
+ bootstrap_ci,
12
+ cost_per_correct,
13
+ group_by,
14
+ )
15
+
16
# Flag keys shown in the report's "Adversarial Flags" table, in display order.
# Each name is looked up in a record's ``adversarial_flags`` dict and counted
# only when its value is exactly ``True``.
_ADVERSARIAL_FLAG_NAMES = [
    "token_snowball",
    "talkative_failure",
    "error",
    "timeout",
    "budget_exhausted",
]
23
+
24
+
25
def _compute_per_task_deltas(records: list[dict[str, Any]], modes: list[str]) -> list[float]:
    """Collect per-task percentage changes in cost-per-correct between two modes.

    Only ``modes[0]`` and ``modes[1]`` are compared. For every task that has
    records in both modes, the result holds
    ``(cpc(modes[1]) - cpc(modes[0])) / cpc(modes[0]) * 100``.

    A task is left out when either mode has no records for it, when either
    cost-per-correct is ``None``, or when the ``modes[0]`` cost is not positive
    (no ratio can be formed).

    Returns:
        The percentage deltas, one per qualifying task; an empty list when
        fewer than two modes are given or no task qualifies.
    """
    if len(modes) < 2:
        return []

    base_mode, other_mode = modes[:2]
    pct_changes: list[float] = []

    for recs_for_task in group_by(records, key="task").values():
        per_mode = group_by(recs_for_task, key="mode")
        base_recs = per_mode.get(base_mode, [])
        other_recs = per_mode.get(other_mode, [])
        if base_recs and other_recs:
            base_cpc = cost_per_correct(base_recs)
            other_cpc = cost_per_correct(other_recs)
            # Both costs must be defined, and the base must be a usable denominator.
            if base_cpc is not None and other_cpc is not None and base_cpc > 0:
                pct_changes.append(((other_cpc - base_cpc) / base_cpc) * 100)

    return pct_changes
56
+
57
+
58
def _has_flags(records: list[dict[str, Any]]) -> bool:
    """Tell whether at least one record has a non-``None`` ``adversarial_flags`` value."""
    for rec in records:
        if rec.get("adversarial_flags") is not None:
            return True
    return False
61
+
62
+
63
def _has_turn_data(records: list[dict[str, Any]]) -> bool:
    """Tell whether at least one record has non-empty per-turn token data.

    Both ``per_turn_output_tokens`` and ``per_turn_context_tokens`` count; a
    missing, ``None`` or empty value does not.
    """
    turn_fields = ("per_turn_output_tokens", "per_turn_context_tokens")
    for rec in records:
        if any(rec.get(field) for field in turn_fields):
            return True
    return False
66
+
67
+
68
def _has_tools(records: list[dict[str, Any]]) -> bool:
    """Tell whether at least one record has a ``tool_sequence`` with entries in it."""
    for rec in records:
        sequence = rec.get("tool_sequence")
        if sequence and len(sequence) > 0:
            return True
    return False
71
+
72
+
73
def _has_language(records: list[dict[str, Any]]) -> bool:
    """Tell whether at least one record has a non-``None`` ``language`` value."""
    return not all(rec.get("language") is None for rec in records)
76
+
77
+
78
def _has_difficulty(records: list[dict[str, Any]]) -> bool:
    """Tell whether at least one record has a non-``None`` ``difficulty`` value."""
    for rec in records:
        if rec.get("difficulty") is not None:
            return True
    return False
81
+
82
+
83
def _has_category(records: list[dict[str, Any]]) -> bool:
    """Tell whether at least one record has a non-``None`` ``category`` (capability) value."""
    for rec in records:
        if rec.get("category") is None:
            continue
        return True
    return False
86
+
87
+
88
def _tool_adoption_section(
    records: list[dict[str, Any]],
    by_mode: dict[Any, list[dict[str, Any]]],
    modes: list[str],
) -> list[str]:
    """Render the "Tool Adoption" markdown section.

    For each mode, reports how many runs have a non-empty ``tool_sequence``,
    the total number of runs, and the share as a percentage.

    Args:
        records: All records; used only to decide whether the section applies.
        by_mode: Records grouped by mode; must contain every entry of ``modes``.
        modes: Mode keys, in the order the rows should appear.

    Returns:
        The section's lines (ending with a blank line), or an empty list when
        no record has a non-empty ``tool_sequence``.
    """
    if not _has_tools(records):
        return []

    section: list[str] = [
        "### Tool Adoption",
        "",
        "| Mode | Runs With Tools | Total Runs | Adoption % |",
        "|------|----------------:|-----------:|----------:|",
    ]

    for mode in modes:
        runs = by_mode[mode]
        n_runs = len(runs)
        n_with_tools = len(
            [r for r in runs if r.get("tool_sequence") and len(r["tool_sequence"]) > 0]
        )
        # Guard the division: a mode with no runs reports 0.0%.
        share = (n_with_tools / n_runs * 100) if n_runs else 0.0
        section.append(f"| {mode} | {n_with_tools} | {n_runs} | {share:.1f}% |")

    section.append("")
    return section
118
+
119
+
120
def _per_category_section(
    records: list[dict[str, Any]],
    by_mode: dict[Any, list[dict[str, Any]]],
    modes: list[str],
    category: str,
    title: str,
) -> list[str]:
    """Render a cost-per-correct breakdown table sliced by one record field.

    One row per distinct non-``None`` value of ``category`` (ordered by its
    string form), one cost-per-correct column per mode, and — when exactly two
    modes exist — a Delta% column giving the change from ``modes[0]`` to
    ``modes[1]``.

    Args:
        records: All records.
        by_mode: Records grouped by mode (precomputed). Accepted for signature
            parity with the other section builders; not read here.
        modes: Sorted list of mode keys.
        category: Field to slice by; only ``'language'``, ``'difficulty'`` and
            ``'category'`` are supported.
        title: Section heading (e.g., 'Per-Language Breakdown').

    Returns:
        The section's lines (ending with a blank line), or an empty list when
        ``category`` is unsupported or no record carries that field.
    """
    if category == "language":
        field_present = _has_language
    elif category == "difficulty":
        field_present = _has_difficulty
    elif category == "category":
        field_present = _has_category
    else:
        return []
    if not field_present(records):
        return []

    two_modes = len(modes) == 2

    header = f"| {category.capitalize()} |" + "".join(f" {mode} CPC |" for mode in modes)
    sep = "|------|" + "------:|" * len(modes)
    if two_modes:
        header += " Delta% |"
        sep += "-------:|"

    section: list[str] = [f"### {title}", "", header, sep]

    distinct_values = sorted(
        {r.get(category) for r in records if r.get(category) is not None},
        key=str,
    )

    for value in distinct_values:
        grouped = group_by([r for r in records if r.get(category) == value], key="mode")
        cpcs: list[float | None] = [cost_per_correct(grouped.get(mode, [])) for mode in modes]

        row = f"| {value} |"
        for cpc in cpcs:
            row += f" ${cpc:.4f} |" if cpc is not None else " n/a (0 correct) |"

        # Delta% (2 modes): the per-capability payoff — where the tool helps.
        if two_modes:
            first, second = cpcs
            if first is None or second is None or not first > 0:
                row += " N/A |"
            else:
                row += f" {((second - first) / first) * 100:+.1f}% |"

        section.append(row)

    section.append("")
    return section
189
+
190
+
191
def generate_report(records: list[dict[str, Any]]) -> str:
    """Generate a markdown report from JSONL records.

    Records with a truthy ``error`` field are listed in a "Failed Runs" section
    and excluded from every metric. If the input is empty, or nothing is left
    after that filter, a short placeholder report is returned instead.

    Report structure:
    0. Failed Runs (only if any record has an error)
    1. Delta-headline: cost-per-correct for each mode + delta + CI
       (delta/CI only when there are exactly two modes)
    2. Per-task table: task name | one cost column per mode | delta%
    3. Cost breakdown: input/output/cache tokens per mode
    4. Corrections summary: correct/total per mode
    5. Adversarial Flags summary (if records carry flags)
    6. Token Usage Sparklines (if records carry per-turn data)
    7. Tool Adoption (if records carry tool_sequence)
    8. Per-Language Breakdown (if records carry language)
    9. Per-Difficulty Breakdown (if records carry difficulty)
    10. Per-Capability Breakdown (if records carry category)

    Args:
        records: List of dicts (JSONL records) with fields:
            task, mode, total_cost_usd, correct, input_tokens,
            output_tokens, cache_creation_tokens, cache_read_tokens.
            Optional: error, adversarial_flags, per_turn_output_tokens,
            per_turn_context_tokens, tool_sequence, language, difficulty,
            category.

    Returns:
        Markdown string.
    """
    if not records:
        return "## Copeca Report\n\n*No results.*\n"

    # Separate crashed/failed runs (error set) from valid measurements: they are
    # surfaced in a Failed Runs section below but excluded from every metric so a
    # crash can't deflate accuracy or skew cost (shakedown SD-B).
    failed_records = [r for r in records if r.get("error")]
    records = [r for r in records if not r.get("error")]
    if not records:
        return "## Copeca Report\n\n*No valid results — all runs failed.*\n"

    lines: list[str] = []
    lines.append("## Copeca Report")
    lines.append("")
    if failed_records:
        lines.append("### Failed Runs")
        lines.append("")
        lines.append(
            f"{len(failed_records)} run(s) failed and are excluded from the metrics below."
        )
        for r in failed_records:
            raw = r.get("error") or "unknown error"
            # Keep the bullet compact: first line of the error only, capped at 120 chars.
            err = str(raw).splitlines()[0][:120]
            lines.append(f"- **{r.get('mode', '?')}** / {r.get('task', '?')}: {err}")
        lines.append("")

    # Discover modes. They are ordered by their string form (None sorts as "").
    # NOTE(review): the delta logic below labels modes[0] as the baseline, so the
    # baseline is whichever mode name sorts first — confirm that is intended.
    by_mode = group_by(records, key="mode")
    modes = sorted(by_mode.keys(), key=lambda m: str(m) if m is not None else "")

    # ── 1. Delta-headline: cost-per-correct for each mode + delta + CI ─────
    lines.append("### Cost Per Correct Answer")
    lines.append("")
    lines.append("| Mode | Cost per Correct | Accuracy |")
    lines.append("|------|----------------:|---------:|")

    cpc_by_mode: dict[str, float | None] = {}
    correct_by_mode: dict[str, tuple[int, int]] = {}
    for mode in modes:
        mode_records = by_mode[mode]
        cpc = cost_per_correct(mode_records)
        correct_count = sum(1 for r in mode_records if r.get("correct"))
        total_count = len(mode_records)
        cpc_by_mode[mode] = cpc
        correct_by_mode[mode] = (correct_count, total_count)
        accuracy = f"{correct_count}/{total_count}"
        cpc_cell = f"${cpc:.4f}" if cpc is not None else "n/a (0 correct)"
        lines.append(f"| {mode} | {cpc_cell} | {accuracy} |")

    lines.append("")

    # Delta only when exactly two modes exist
    per_task_deltas: list[float] = []
    if len(modes) == 2:
        m0, m1 = modes[0], modes[1]
        cpc0 = cpc_by_mode[m0]
        cpc1 = cpc_by_mode[m1]

        # Compute bootstrap CI on per-task deltas (excludes tasks where either CPC is None)
        per_task_deltas = _compute_per_task_deltas(records, modes)

        if cpc1 is None:
            # Experimental got 0 correct — delta is undefined, not a bargain
            n_correct, n_total = correct_by_mode[m1]
            lines.append(f"**Delta:** n/a — {m1} got {n_correct}/{n_total} correct")
        elif cpc0 is None:
            # Baseline got 0 correct — experimental is strictly better, but no ratio
            n_correct, n_total = correct_by_mode[m0]
            lines.append(f"**Delta:** n/a — {m0} (baseline) got {n_correct}/{n_total} correct")
        else:
            if cpc0 > 0:
                delta_pct = ((cpc1 - cpc0) / cpc0) * 100
            elif cpc1 > 0:
                # Zero-cost baseline vs. a positive cost: no finite ratio.
                delta_pct = float("inf")
            else:
                delta_pct = 0.0
            # A delta of exactly 0 is worded as "higher".
            direction = "lower" if delta_pct < 0 else "higher"

            if per_task_deltas:
                # Only the first two values returned by bootstrap_ci are used here.
                ci_lo, ci_hi, _, _ = bootstrap_ci(per_task_deltas)
                lines.append(
                    f"**Delta:** {m1} is {delta_pct:+.1f}% {direction} than {m0} "
                    f"(${cpc1:.4f} vs ${cpc0:.4f}) "
                    f"[95% CI: {ci_lo:+.1f}%, {ci_hi:+.1f}%]"
                )
            else:
                lines.append(
                    f"**Delta:** {m1} is {delta_pct:+.1f}% {direction} than {m0} "
                    f"(${cpc1:.4f} vs ${cpc0:.4f})"
                )
        lines.append("")

    # ── 2. Per-task table ──────────────────────────────────────────────────
    by_task = group_by(records, key="task")
    tasks = sorted(by_task.keys(), key=lambda t: str(t) if t is not None else "")

    lines.append("### Per-Task Cost")
    lines.append("")

    header = "| Task |"
    sep = "|------|"
    for mode in modes:
        header += f" {mode} |"
        sep += "------:|"
    if len(modes) == 2:
        # NOTE(review): the header advertises "[95% CI]" whenever per-task deltas
        # exist, but the rows below only ever render the delta percentage — no
        # per-row interval is written. Confirm whether the header or the rows
        # should change.
        if per_task_deltas:
            header += " Delta% [95% CI] |"
            sep += "------------------:|"
        else:
            header += " Delta% |"
            sep += "-------:|"
    lines.append(header)
    lines.append(sep)

    for task in tasks:
        task_records = by_task[task]
        by_mode_in_task = group_by(task_records, key="mode")

        row = f"| {task} |"
        costs: list[float | None] = []
        for mode in modes:
            mode_recs = by_mode_in_task.get(mode, [])
            cpc = cost_per_correct(mode_recs)
            costs.append(cpc)
            cpc_cell = f"${cpc:.4f}" if cpc is not None else "n/a (0 correct)"
            row += f" {cpc_cell} |"

        if len(modes) == 2:
            c0, c1 = costs
            # N/A when either cost is undefined or the first cost is not positive.
            if c0 is None or c1 is None:
                row += " N/A |"
            elif c0 > 0:
                delta = ((c1 - c0) / c0) * 100
                row += f" {delta:+.1f}% |"
            else:
                row += " N/A |"

        lines.append(row)

    lines.append("")

    # ── 3. Cost breakdown: tokens per mode ─────────────────────────────────
    lines.append("### Token Breakdown")
    lines.append("")
    lines.append("| Mode | Total Input | Total Output | Total Cache Create | Total Cache Read |")
    lines.append("|------|------------:|-------------:|-------------------:|-----------------:|")

    for mode in modes:
        mode_records = by_mode[mode]
        # `or 0` treats an explicit None the same as a missing field.
        total_input = sum(r.get("input_tokens", 0) or 0 for r in mode_records)
        total_output = sum(r.get("output_tokens", 0) or 0 for r in mode_records)
        total_cache_create = sum(r.get("cache_creation_tokens", 0) or 0 for r in mode_records)
        total_cache_read = sum(r.get("cache_read_tokens", 0) or 0 for r in mode_records)

        lines.append(
            f"| {mode} | {total_input:,} | {total_output:,} | "
            f"{total_cache_create:,} | {total_cache_read:,} |"
        )

    lines.append("")

    # ── 4. Corrections summary: correct/total per mode ─────────────────────
    lines.append("### Corrections Summary")
    lines.append("")
    lines.append("| Mode | Correct | Total | Rate |")
    lines.append("|------|--------:|------:|-----:|")

    for mode in modes:
        mode_records = by_mode[mode]
        correct_count = sum(1 for r in mode_records if r.get("correct"))
        total_count = len(mode_records)
        rate = (correct_count / total_count * 100) if total_count > 0 else 0.0
        lines.append(f"| {mode} | {correct_count} | {total_count} | {rate:.1f}% |")

    lines.append("")

    # ── 5. Adversarial Flags (if records carry flags) ──────────────────────
    if _has_flags(records):
        lines.append("### Adversarial Flags")
        lines.append("")
        lines.append("| Flag | Rate |")
        lines.append("|------|-----:|")

        # Only dict-shaped flag payloads are counted; the rate's denominator is
        # the number of records that carry such a dict, not all records.
        all_flags: list[dict[str, Any]] = []
        for r in records:
            flags = r.get("adversarial_flags")
            if flags is not None and isinstance(flags, dict):
                all_flags.append(flags)

        total_with_flags = len(all_flags)
        for flag_name in _ADVERSARIAL_FLAG_NAMES:
            true_count = sum(1 for f in all_flags if f.get(flag_name) is True)
            rate_pct = (true_count / total_with_flags * 100) if total_with_flags > 0 else 0.0
            lines.append(f"| {flag_name} | {rate_pct:.1f}% |")

        lines.append("")

    # ── 6. Token Usage Sparklines (if records carry per-turn data) ─────────
    if _has_turn_data(records):
        lines.append("### Token Usage Sparklines")
        lines.append("")
        lines.append("*Per-turn output token sequences, sampled at 20 points.*")
        lines.append("")

        for task in tasks:
            task_records = by_task[task]
            by_m = group_by(task_records, key="mode")
            for mode in modes:
                for rec in by_m.get(mode, []):
                    spark_values = rec.get("per_turn_output_tokens")
                    # Runs with fewer than two turns are skipped.
                    if not spark_values or len(spark_values) < 2:
                        continue
                    spark = ascii_sparkline(spark_values, width=20)
                    mn = min(spark_values)
                    mx = max(spark_values)
                    lines.append(
                        f"- **{task}** ({mode}): `{spark}` "
                        f"min={mn} max={mx} n={len(spark_values)} turns"
                    )

        lines.append("")

    # ── 7. Tool Adoption (if records carry tool_sequence) ──────────────────
    lines.extend(_tool_adoption_section(records, by_mode, modes))

    # ── 8. Per-Language Breakdown (if records carry language) ───────────────
    lines.extend(
        _per_category_section(records, by_mode, modes, "language", "Per-Language Breakdown")
    )

    # ── 9. Per-Difficulty Breakdown (if records carry difficulty) ───────────
    lines.extend(
        _per_category_section(records, by_mode, modes, "difficulty", "Per-Difficulty Breakdown")
    )

    # ── 10. Per-Capability Breakdown (if records carry category) ────────────
    # The payoff: cost-per-correct sliced by what the task demands (locate/trace/
    # fix/debug), so the delta reveals WHERE a tool helps, not just how much overall.
    lines.extend(
        _per_category_section(records, by_mode, modes, "category", "Per-Capability Breakdown")
    )

    return "\n".join(lines)