codeprobe 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. codeprobe-0.2.1/PKG-INFO +224 -0
  2. codeprobe-0.2.1/README.md +189 -0
  3. {codeprobe-0.2.0 → codeprobe-0.2.1}/pyproject.toml +1 -1
  4. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/executor.py +19 -18
  5. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/isolation.py +33 -20
  6. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/preamble.py +8 -0
  7. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale.py +53 -33
  8. codeprobe-0.2.1/src/codeprobe.egg-info/PKG-INFO +224 -0
  9. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_executor.py +3 -3
  10. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_org_scale.py +70 -0
  11. codeprobe-0.2.0/PKG-INFO +0 -131
  12. codeprobe-0.2.0/README.md +0 -96
  13. codeprobe-0.2.0/src/codeprobe.egg-info/PKG-INFO +0 -131
  14. {codeprobe-0.2.0 → codeprobe-0.2.1}/LICENSE +0 -0
  15. {codeprobe-0.2.0 → codeprobe-0.2.1}/setup.cfg +0 -0
  16. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/__init__.py +0 -0
  17. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/__main__.py +0 -0
  18. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/__init__.py +0 -0
  19. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/_base.py +0 -0
  20. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/aider.py +0 -0
  21. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/claude.py +0 -0
  22. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/codex.py +0 -0
  23. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/copilot.py +0 -0
  24. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/openai_compat.py +0 -0
  25. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/protocol.py +0 -0
  26. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/session.py +0 -0
  27. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/adapters/telemetry.py +0 -0
  28. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/analysis/__init__.py +0 -0
  29. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/analysis/ranking.py +0 -0
  30. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/analysis/report.py +0 -0
  31. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/analysis/stats.py +0 -0
  32. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/api.py +0 -0
  33. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/assess/__init__.py +0 -0
  34. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/assess/heuristics.py +0 -0
  35. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/__init__.py +0 -0
  36. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/assess_cmd.py +0 -0
  37. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/experiment_cmd.py +0 -0
  38. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/init_cmd.py +0 -0
  39. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/interpret_cmd.py +0 -0
  40. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/mine_cmd.py +0 -0
  41. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/probe_cmd.py +0 -0
  42. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/ratings_cmd.py +0 -0
  43. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/run_cmd.py +0 -0
  44. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  45. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/wizard.py +0 -0
  46. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/cli/yaml_writer.py +0 -0
  47. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/config/__init__.py +0 -0
  48. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/config/loader.py +0 -0
  49. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/__init__.py +0 -0
  50. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/_shared.py +0 -0
  51. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/adaptive.py +0 -0
  52. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/counterfactual.py +0 -0
  53. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/debate.py +0 -0
  54. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/decision_tree.py +0 -0
  55. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/elo.py +0 -0
  56. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/fingerprint.py +0 -0
  57. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/mutation.py +0 -0
  58. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/pareto.py +0 -0
  59. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/sprt.py +0 -0
  60. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/contrib/tournament.py +0 -0
  61. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/__init__.py +0 -0
  62. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/checkpoint.py +0 -0
  63. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/experiment.py +0 -0
  64. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/llm.py +0 -0
  65. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/registry.py +0 -0
  66. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/sandbox.py +0 -0
  67. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/core/scoring.py +0 -0
  68. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/loaders/__init__.py +0 -0
  69. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/__init__.py +0 -0
  70. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/_lang.py +0 -0
  71. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/curator.py +0 -0
  72. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/curator_backends.py +0 -0
  73. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/curator_tiers.py +0 -0
  74. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/extractor.py +0 -0
  75. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_families.py +0 -0
  76. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  77. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_scanner.py +0 -0
  78. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_validate.py +0 -0
  79. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/sg_ground_truth.py +0 -0
  80. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/sources.py +0 -0
  81. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/mining/writer.py +0 -0
  82. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/models/__init__.py +0 -0
  83. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/models/evalrc.py +0 -0
  84. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/models/experiment.py +0 -0
  85. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/models/preamble.py +0 -0
  86. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/models/task.py +0 -0
  87. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/preambles/__init__.py +0 -0
  88. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/preambles/github.md +0 -0
  89. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/preambles/sourcegraph.md +0 -0
  90. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/probe/__init__.py +0 -0
  91. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/probe/generator.py +0 -0
  92. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/probe/writer.py +0 -0
  93. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/ratings/__init__.py +0 -0
  94. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/ratings/collector.py +0 -0
  95. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/scaffold/__init__.py +0 -0
  96. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/scaffold/writer.py +0 -0
  97. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/templates/__init__.py +0 -0
  98. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
  99. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  100. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  101. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe.egg-info/SOURCES.txt +0 -0
  102. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  103. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe.egg-info/entry_points.txt +0 -0
  104. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe.egg-info/requires.txt +0 -0
  105. {codeprobe-0.2.0 → codeprobe-0.2.1}/src/codeprobe.egg-info/top_level.txt +0 -0
  106. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_adapters.py +0 -0
  107. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_analysis.py +0 -0
  108. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_api.py +0 -0
  109. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_assess.py +0 -0
  110. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_changed_symbols.py +0 -0
  111. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_checkpoint.py +0 -0
  112. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_cli.py +0 -0
  113. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_config_loader.py +0 -0
  114. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_contrib.py +0 -0
  115. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_curator_backends.py +0 -0
  116. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_curator_core.py +0 -0
  117. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_curator_integration.py +0 -0
  118. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_curator_tiers.py +0 -0
  119. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_experiment_cmd.py +0 -0
  120. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_experiment_core.py +0 -0
  121. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_init_wizard.py +0 -0
  122. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_llm.py +0 -0
  123. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_loaders.py +0 -0
  124. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_mcp_families_mining.py +0 -0
  125. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_mcp_validate.py +0 -0
  126. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_mining.py +0 -0
  127. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_models.py +0 -0
  128. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_new_families.py +0 -0
  129. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_openai_compat.py +0 -0
  130. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_oracle_types.py +0 -0
  131. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_pipeline_integration.py +0 -0
  132. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_preamble.py +0 -0
  133. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_probe.py +0 -0
  134. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_ratings.py +0 -0
  135. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_ratings_cmd.py +0 -0
  136. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_registry.py +0 -0
  137. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_scaffold.py +0 -0
  138. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_scanner_refactor.py +0 -0
  139. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_scoring.py +0 -0
  140. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_session.py +0 -0
  141. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_sg_ground_truth.py +0 -0
  142. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_telemetry.py +0 -0
  143. {codeprobe-0.2.0 → codeprobe-0.2.1}/tests/test_weighted_f1.py +0 -0
@@ -0,0 +1,224 @@
1
+ Metadata-Version: 2.4
2
+ Name: codeprobe
3
+ Version: 0.2.1
4
+ Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
+ Author: codeprobe contributors
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/sjarmak/codeprobe
8
+ Project-URL: Repository, https://github.com/sjarmak/codeprobe
9
+ Project-URL: Issues, https://github.com/sjarmak/codeprobe/issues
10
+ Keywords: ai,benchmark,eval,coding-agent,mcp
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Testing
18
+ Requires-Python: >=3.11
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: click<9,>=8.0
22
+ Requires-Dist: pyyaml<7,>=6.0
23
+ Requires-Dist: anthropic>=0.39
24
+ Requires-Dist: openai>=1.66
25
+ Requires-Dist: tiktoken<1,>=0.7
26
+ Requires-Dist: scipy<2,>=1.11
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest<9,>=8.0; extra == "dev"
29
+ Requires-Dist: pytest-cov<6,>=5.0; extra == "dev"
30
+ Requires-Dist: ruff<1,>=0.4; extra == "dev"
31
+ Requires-Dist: mypy<2,>=1.10; extra == "dev"
32
+ Requires-Dist: types-PyYAML<7,>=6.0; extra == "dev"
33
+ Requires-Dist: scipy<2,>=1.11; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # codeprobe
37
+
38
+ Benchmark AI coding agents against **your own codebase**.
39
+
40
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code — not someone else's benchmark suite.
41
+
42
+ ## Why codeprobe?
43
+
44
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
45
+
46
+ ## Prerequisites
47
+
48
+ codeprobe orchestrates external AI coding agents — you need at least one installed:
49
+
50
+ | Agent | Install | Required env var |
51
+ | ------------------ | ------------------------------------------------ | ------------------------------- |
52
+ | **Claude Code** | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
53
+ | **GitHub Copilot** | `npm install -g @github/copilot-cli` (>= 1.0.4) | GitHub auth via `gh auth login` |
54
+ | **Codex** | Included with `pip install codeprobe` | `OPENAI_API_KEY` |
55
+
56
+ You also need:
57
+
58
+ - **Python 3.11+**
59
+ - **Git** (for task mining and worktree isolation)
60
+ - **GitHub CLI** (`gh`) — optional, for mining tasks from GitHub PRs with linked issues
61
+
62
+ The `assess` and `mine --enrich` commands need an LLM for scoring/enrichment. codeprobe auto-detects the best available backend:
63
+
64
+ | Priority | Backend | Install | Env var |
65
+ | -------- | ------------- | ------------------------------------------------ | ------------------- |
66
+ | 1 | Anthropic SDK | Included with `pip install codeprobe` | `ANTHROPIC_API_KEY` |
67
+ | 2 | OpenAI SDK | Included with `pip install codeprobe` | `OPENAI_API_KEY` |
68
+ | 3 | Claude CLI | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
69
+
70
+ Override with `CODEPROBE_LLM_BACKEND=anthropic|openai|claude-cli`. Without any backend, `assess` falls back to heuristic scoring.
71
+
72
+ ## Quick Start
73
+
74
+ ```bash
75
+ pip install codeprobe
76
+
77
+ cd /path/to/your/repo
78
+
79
+ codeprobe assess . # Score benchmarking potential (optional)
80
+ codeprobe mine . # Extract tasks from repo history
81
+ codeprobe run . # Run agents against tasks
82
+ codeprobe interpret . # Get recommendations
83
+ ```
84
+
85
+ ## Commands
86
+
87
+ | Command | Purpose |
88
+ | ------------------------ | ------------------------------------------------ |
89
+ | `codeprobe assess` | Score a codebase's benchmarking potential |
90
+ | `codeprobe init` | Interactive wizard — choose what to compare |
91
+ | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
92
+ | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
93
+ | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
94
+ | `codeprobe run` | Execute tasks against AI agents |
95
+ | `codeprobe interpret` | Analyze results, rank configurations |
96
+ | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
97
+ | `codeprobe scaffold` | Create/validate eval task directories |
98
+ | `codeprobe ratings` | Record and analyze agent session quality ratings |
99
+
100
+ ## Two Ways to Generate Tasks
101
+
102
+ ### 1. SDLC Tasks (from merged PRs)
103
+
104
+ Mine real code-change tasks from your git history. Agents must reproduce known fixes and features.
105
+
106
+ ```bash
107
+ codeprobe mine . --count 10 --source github
108
+ codeprobe mine . --count 5 --min-files 4 # Harder tasks (more files changed)
109
+ codeprobe mine . --enrich # LLM-enriched instructions
110
+ ```
111
+
112
+ ### 2. Micro-Benchmark Probes
113
+
114
+ Fast exact-match tasks (30s each) that test code navigation and comprehension — no agent sandbox needed.
115
+
116
+ ```bash
117
+ codeprobe probe . -n 10 -l python -s 42 -o ./probes
118
+ ```
119
+
120
+ Generates four probe types: find-function, count-callers, return-type, module-dependency.
121
+
122
+ ## MCP Comparison Experiments
123
+
124
+ Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
125
+
126
+ ### Mine org-scale comprehension tasks
127
+
128
+ ```bash
129
+ # Set up Sourcegraph credentials
130
+ export SOURCEGRAPH_TOKEN="your-token"
131
+
132
+ # Mine MCP-optimized tasks with Sourcegraph ground truth enrichment
133
+ codeprobe mine /path/to/repo \
134
+ --org-scale --mcp-families --count 5 \
135
+ --no-interactive --no-llm \
136
+ --sg-repo github.com/sg-evals/your-repo
137
+ ```
138
+
139
+ MCP task families: `symbol-reference-trace`, `type-hierarchy-consumers`, `change-scope-audit`.
140
+
141
+ ### Set up the experiment
142
+
143
+ ```bash
144
+ # Create experiment
145
+ codeprobe experiment init /path/to/repo --name mcp-comparison
146
+
147
+ # Copy mined tasks into the experiment
148
+ cp -r /path/to/repo/.codeprobe/tasks/* /path/to/repo/mcp-comparison/tasks/
149
+
150
+ # Baseline config (no MCP, no preamble)
151
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
152
+ --label baseline --agent claude --model claude-haiku-4-5-20251001
153
+
154
+ # Sourcegraph MCP config (preamble + MCP server)
155
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
156
+ --label with-sourcegraph --agent claude --model claude-haiku-4-5-20251001 \
157
+ --preamble sourcegraph \
158
+ --mcp-config '{"mcpServers":{"sourcegraph":{"type":"http","url":"https://sourcegraph.com/.api/mcp/v1","headers":{"Authorization":"token $SOURCEGRAPH_TOKEN"}}}}'
159
+
160
+ # Run and interpret
161
+ codeprobe run /path/to/repo/mcp-comparison --agent claude --max-cost-usd 5.00
162
+ codeprobe interpret /path/to/repo/mcp-comparison
163
+ ```
164
+
165
+ ### Preambles
166
+
167
+ Preambles are composable instruction templates prepended to the agent's prompt for MCP-enabled configs. Built-in preambles: `sourcegraph`, `github`.
168
+
169
+ Override built-ins by placing a `.md` file in:
170
+
171
+ - `<task_dir>/preambles/` (per-task)
172
+ - `.codeprobe/preambles/` (project-level)
173
+ - `~/.codeprobe/preambles/` (user-level)
174
+
175
+ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}}`
176
+
177
+ ## Key Flags
178
+
179
+ ```bash
180
+ # Running
181
+ codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
182
+ codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
183
+ codeprobe run . --dry-run # Estimate resource usage without running
184
+
185
+ # Mining
186
+ codeprobe mine . --enrich # Use LLM to improve weak task instructions
187
+ codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
188
+ codeprobe mine . --mcp-families # Include MCP-optimized task families
189
+ codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
190
+
191
+ # Experiment configs
192
+ codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
193
+ codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
194
+
195
+ # Output
196
+ codeprobe interpret . --format csv # Export for pivot tables
197
+ codeprobe interpret . --format html # Self-contained HTML report
198
+ ```
199
+
200
+ ## Supported Agents
201
+
202
+ - **Claude Code** (`--agent claude`) — headless via `claude -p`
203
+ - **GitHub Copilot** (`--agent copilot`) — via Copilot CLI
204
+ - **Codex** (`--agent codex`) — via OpenAI API
205
+ - Custom agents via the `AgentAdapter` protocol
206
+
207
+ ## Supported Git Hosts
208
+
209
+ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
210
+
211
+ ## Configuration
212
+
213
+ Create a `.evalrc.yaml` in your repo root:
214
+
215
+ ```yaml
216
+ name: my-experiment
217
+ agents: [claude, copilot]
218
+ models: [claude-sonnet-4-6, claude-opus-4-6]
219
+ tasks_dir: .codeprobe/tasks
220
+ ```
221
+
222
+ ## License
223
+
224
+ Apache-2.0
@@ -0,0 +1,189 @@
1
+ # codeprobe
2
+
3
+ Benchmark AI coding agents against **your own codebase**.
4
+
5
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code — not someone else's benchmark suite.
6
+
7
+ ## Why codeprobe?
8
+
9
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
10
+
11
+ ## Prerequisites
12
+
13
+ codeprobe orchestrates external AI coding agents — you need at least one installed:
14
+
15
+ | Agent | Install | Required env var |
16
+ | ------------------ | ------------------------------------------------ | ------------------------------- |
17
+ | **Claude Code** | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
18
+ | **GitHub Copilot** | `npm install -g @github/copilot-cli` (>= 1.0.4) | GitHub auth via `gh auth login` |
19
+ | **Codex** | Included with `pip install codeprobe` | `OPENAI_API_KEY` |
20
+
21
+ You also need:
22
+
23
+ - **Python 3.11+**
24
+ - **Git** (for task mining and worktree isolation)
25
+ - **GitHub CLI** (`gh`) — optional, for mining tasks from GitHub PRs with linked issues
26
+
27
+ The `assess` and `mine --enrich` commands need an LLM for scoring/enrichment. codeprobe auto-detects the best available backend:
28
+
29
+ | Priority | Backend | Install | Env var |
30
+ | -------- | ------------- | ------------------------------------------------ | ------------------- |
31
+ | 1 | Anthropic SDK | Included with `pip install codeprobe` | `ANTHROPIC_API_KEY` |
32
+ | 2 | OpenAI SDK | Included with `pip install codeprobe` | `OPENAI_API_KEY` |
33
+ | 3 | Claude CLI | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
34
+
35
+ Override with `CODEPROBE_LLM_BACKEND=anthropic|openai|claude-cli`. Without any backend, `assess` falls back to heuristic scoring.
36
+
37
+ ## Quick Start
38
+
39
+ ```bash
40
+ pip install codeprobe
41
+
42
+ cd /path/to/your/repo
43
+
44
+ codeprobe assess . # Score benchmarking potential (optional)
45
+ codeprobe mine . # Extract tasks from repo history
46
+ codeprobe run . # Run agents against tasks
47
+ codeprobe interpret . # Get recommendations
48
+ ```
49
+
50
+ ## Commands
51
+
52
+ | Command | Purpose |
53
+ | ------------------------ | ------------------------------------------------ |
54
+ | `codeprobe assess` | Score a codebase's benchmarking potential |
55
+ | `codeprobe init` | Interactive wizard — choose what to compare |
56
+ | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
57
+ | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
58
+ | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
59
+ | `codeprobe run` | Execute tasks against AI agents |
60
+ | `codeprobe interpret` | Analyze results, rank configurations |
61
+ | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
62
+ | `codeprobe scaffold` | Create/validate eval task directories |
63
+ | `codeprobe ratings` | Record and analyze agent session quality ratings |
64
+
65
+ ## Two Ways to Generate Tasks
66
+
67
+ ### 1. SDLC Tasks (from merged PRs)
68
+
69
+ Mine real code-change tasks from your git history. Agents must reproduce known fixes and features.
70
+
71
+ ```bash
72
+ codeprobe mine . --count 10 --source github
73
+ codeprobe mine . --count 5 --min-files 4 # Harder tasks (more files changed)
74
+ codeprobe mine . --enrich # LLM-enriched instructions
75
+ ```
76
+
77
+ ### 2. Micro-Benchmark Probes
78
+
79
+ Fast exact-match tasks (30s each) that test code navigation and comprehension — no agent sandbox needed.
80
+
81
+ ```bash
82
+ codeprobe probe . -n 10 -l python -s 42 -o ./probes
83
+ ```
84
+
85
+ Generates four probe types: find-function, count-callers, return-type, module-dependency.
86
+
87
+ ## MCP Comparison Experiments
88
+
89
+ Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
90
+
91
+ ### Mine org-scale comprehension tasks
92
+
93
+ ```bash
94
+ # Set up Sourcegraph credentials
95
+ export SOURCEGRAPH_TOKEN="your-token"
96
+
97
+ # Mine MCP-optimized tasks with Sourcegraph ground truth enrichment
98
+ codeprobe mine /path/to/repo \
99
+ --org-scale --mcp-families --count 5 \
100
+ --no-interactive --no-llm \
101
+ --sg-repo github.com/sg-evals/your-repo
102
+ ```
103
+
104
+ MCP task families: `symbol-reference-trace`, `type-hierarchy-consumers`, `change-scope-audit`.
105
+
106
+ ### Set up the experiment
107
+
108
+ ```bash
109
+ # Create experiment
110
+ codeprobe experiment init /path/to/repo --name mcp-comparison
111
+
112
+ # Copy mined tasks into the experiment
113
+ cp -r /path/to/repo/.codeprobe/tasks/* /path/to/repo/mcp-comparison/tasks/
114
+
115
+ # Baseline config (no MCP, no preamble)
116
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
117
+ --label baseline --agent claude --model claude-haiku-4-5-20251001
118
+
119
+ # Sourcegraph MCP config (preamble + MCP server)
120
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
121
+ --label with-sourcegraph --agent claude --model claude-haiku-4-5-20251001 \
122
+ --preamble sourcegraph \
123
+ --mcp-config '{"mcpServers":{"sourcegraph":{"type":"http","url":"https://sourcegraph.com/.api/mcp/v1","headers":{"Authorization":"token $SOURCEGRAPH_TOKEN"}}}}'
124
+
125
+ # Run and interpret
126
+ codeprobe run /path/to/repo/mcp-comparison --agent claude --max-cost-usd 5.00
127
+ codeprobe interpret /path/to/repo/mcp-comparison
128
+ ```
129
+
130
+ ### Preambles
131
+
132
+ Preambles are composable instruction templates prepended to the agent's prompt for MCP-enabled configs. Built-in preambles: `sourcegraph`, `github`.
133
+
134
+ Override built-ins by placing a `.md` file in:
135
+
136
+ - `<task_dir>/preambles/` (per-task)
137
+ - `.codeprobe/preambles/` (project-level)
138
+ - `~/.codeprobe/preambles/` (user-level)
139
+
140
+ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}}`
141
+
142
+ ## Key Flags
143
+
144
+ ```bash
145
+ # Running
146
+ codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
147
+ codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
148
+ codeprobe run . --dry-run # Estimate resource usage without running
149
+
150
+ # Mining
151
+ codeprobe mine . --enrich # Use LLM to improve weak task instructions
152
+ codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
153
+ codeprobe mine . --mcp-families # Include MCP-optimized task families
154
+ codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
155
+
156
+ # Experiment configs
157
+ codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
158
+ codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
159
+
160
+ # Output
161
+ codeprobe interpret . --format csv # Export for pivot tables
162
+ codeprobe interpret . --format html # Self-contained HTML report
163
+ ```
164
+
165
+ ## Supported Agents
166
+
167
+ - **Claude Code** (`--agent claude`) — headless via `claude -p`
168
+ - **GitHub Copilot** (`--agent copilot`) — via Copilot CLI
169
+ - **Codex** (`--agent codex`) — via OpenAI API
170
+ - Custom agents via the `AgentAdapter` protocol
171
+
172
+ ## Supported Git Hosts
173
+
174
+ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
175
+
176
+ ## Configuration
177
+
178
+ Create a `.evalrc.yaml` in your repo root:
179
+
180
+ ```yaml
181
+ name: my-experiment
182
+ agents: [claude, copilot]
183
+ models: [claude-sonnet-4-6, claude-opus-4-6]
184
+ tasks_dir: .codeprobe/tasks
185
+ ```
186
+
187
+ ## License
188
+
189
+ Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "codeprobe"
3
- version = "0.2.0"
3
+ version = "0.2.1"
4
4
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import json as _json
6
6
  import logging
7
+ import shutil
7
8
  import subprocess
8
9
  import threading
9
10
  from collections.abc import Callable
@@ -13,7 +14,11 @@ from pathlib import Path
13
14
  from typing import TYPE_CHECKING
14
15
 
15
16
  from codeprobe.core.checkpoint import CheckpointStore
16
- from codeprobe.core.isolation import IsolationStrategy, WorktreeIsolation
17
+ from codeprobe.core.isolation import (
18
+ IsolationStrategy,
19
+ WorktreeIsolation,
20
+ git_restore_clean,
21
+ )
17
22
  from codeprobe.core.preamble import PreambleResolver, _base_prompt, compose_instruction
18
23
  from codeprobe.core.scoring import get_scorer, sanitize_secrets
19
24
  from codeprobe.models.experiment import CompletedTask, ExperimentConfig
@@ -255,18 +260,25 @@ def execute_task(
255
260
  # Done before error checks so partial results from timeouts are scored.
256
261
  effective_repo = worktree_path or repo_path
257
262
  answer_src = effective_repo / "answer.txt"
263
+ # Also check the original repo root — the agent may have followed
264
+ # TASK_REPO_ROOT from an older instruction that still points at the
265
+ # real repo because it was never rewritten to the worktree path.
266
+ answer_fallback = repo_path / "answer.txt" if worktree_path else None
267
+ found_answer = None
258
268
  if answer_src.is_file():
269
+ found_answer = answer_src
270
+ elif answer_fallback is not None and answer_fallback.is_file():
271
+ found_answer = answer_fallback
272
+ if found_answer is not None:
259
273
  try:
260
- import shutil
261
-
262
- shutil.copy2(answer_src, task_dir / "answer.txt")
274
+ shutil.copy2(found_answer, task_dir / "answer.txt")
263
275
  except OSError:
264
276
  pass # Non-fatal; scorer will report missing answer
265
277
 
266
278
  # If the agent failed with no output AND no answer.txt was produced,
267
279
  # return an error. But if answer.txt exists (e.g. agent timed out
268
280
  # after writing it), fall through to scoring.
269
- has_answer = (task_dir / "answer.txt").is_file()
281
+ has_answer = found_answer is not None
270
282
  if output.exit_code != 0 and not output.stdout.strip() and not has_answer:
271
283
  error_msg = output.stderr or f"Agent exited with code {output.exit_code}"
272
284
  return TaskResult(
@@ -325,22 +337,11 @@ _BILLABLE_COST_MODELS = frozenset({"per_token"})
325
337
  def _git_reset_workdir(repo_path: Path) -> None:
326
338
  """Reset the working directory to a clean state between sequential tasks.
327
339
 
328
- Runs ``git checkout -- .`` and ``git clean -fd`` to discard modifications
340
+ Runs ``git restore .`` and ``git clean -fd`` to discard modifications
329
341
  and remove untracked files so task N's leftovers don't corrupt task N+1.
330
342
  """
331
343
  try:
332
- subprocess.run(
333
- ["git", "checkout", "--", "."],
334
- cwd=repo_path,
335
- check=True,
336
- capture_output=True,
337
- )
338
- subprocess.run(
339
- ["git", "clean", "-fd", "-e", ".codeprobe", "-e", ".codeprobe-worktrees"],
340
- cwd=repo_path,
341
- check=True,
342
- capture_output=True,
343
- )
344
+ git_restore_clean(repo_path)
344
345
  except subprocess.CalledProcessError as exc:
345
346
  logger.warning(
346
347
  "Git reset failed (exit %d): %s",
@@ -12,6 +12,38 @@ from typing import Protocol, runtime_checkable
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
14
 
15
+ def git_restore_clean(workdir: Path, *, extra_excludes: tuple[str, ...] = ()) -> None:
16
+ """Restore tracked files and remove untracked files in *workdir*.
17
+
18
+ Uses ``git restore .`` (tolerant of empty diffs; failures are not raised)
19
+ followed by ``git clean -fd`` (raises ``CalledProcessError`` on failure).
20
+ Always excludes ``.codeprobe`` and ``.codeprobe-worktrees``; pass *extra_excludes* for more.
21
+ """
22
+ result = subprocess.run(
23
+ ["git", "restore", "."],
24
+ cwd=workdir,
25
+ capture_output=True,
26
+ )
27
+ if result.returncode != 0:
28
+ stderr = result.stderr.decode(errors="replace")
29
+ # "could not resolve HEAD" is expected in a truly empty/detached
30
+ # worktree — not worth warning about.
31
+ if "could not resolve" not in stderr:
32
+ logger.debug("git restore in %s: %s", workdir, stderr)
33
+ clean_cmd = [
34
+ "git",
35
+ "clean",
36
+ "-fd",
37
+ "-e",
38
+ ".codeprobe",
39
+ "-e",
40
+ ".codeprobe-worktrees",
41
+ ]
42
+ for exc in extra_excludes:
43
+ clean_cmd += ["-e", exc]
44
+ subprocess.run(clean_cmd, cwd=workdir, check=True, capture_output=True)
45
+
46
+
15
47
  @runtime_checkable
16
48
  class IsolationStrategy(Protocol):
17
49
  """Protocol for workspace isolation strategies."""
@@ -79,26 +111,7 @@ class WorktreeIsolation:
79
111
  def reset(self, workspace: Path) -> None:
80
112
  """Reset a worktree to clean state."""
81
113
  try:
82
- subprocess.run(
83
- ["git", "checkout", "--", "."],
84
- cwd=workspace,
85
- check=True,
86
- capture_output=True,
87
- )
88
- subprocess.run(
89
- [
90
- "git",
91
- "clean",
92
- "-fd",
93
- "-e",
94
- ".codeprobe",
95
- "-e",
96
- ".codeprobe-worktrees",
97
- ],
98
- cwd=workspace,
99
- check=True,
100
- capture_output=True,
101
- )
114
+ git_restore_clean(workspace)
102
115
  except subprocess.CalledProcessError as exc:
103
116
  logger.warning(
104
117
  "Worktree reset failed for %s (exit %d): %s",
@@ -88,6 +88,14 @@ def _base_prompt(
88
88
  references the worktree instead of the original repo path.
89
89
  """
90
90
  effective_path = worktree_path if worktree_path is not None else repo_path
91
+ # Rewrite TASK_REPO_ROOT in the instruction so agents write to the
92
+ # worktree, not the original repo (avoids cross-task collisions and
93
+ # ensures answer.txt lands where the executor expects it).
94
+ if worktree_path is not None:
95
+ instruction = instruction.replace(
96
+ f"TASK_REPO_ROOT={repo_path}",
97
+ f"TASK_REPO_ROOT={worktree_path}",
98
+ )
91
99
  return (
92
100
  f"You are working on the repository at {effective_path}. "
93
101
  "Follow the instruction below.\n\n"