codeprobe 0.1.7__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. codeprobe-0.2.1/PKG-INFO +224 -0
  2. codeprobe-0.2.1/README.md +189 -0
  3. {codeprobe-0.1.7 → codeprobe-0.2.1}/pyproject.toml +1 -1
  4. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/__init__.py +1 -1
  5. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/claude.py +18 -2
  6. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/__init__.py +108 -1
  7. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/experiment_cmd.py +4 -0
  8. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/init_cmd.py +72 -1
  9. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/mine_cmd.py +13 -0
  10. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/probe_cmd.py +11 -9
  11. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/wizard.py +53 -4
  12. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/checkpoint.py +40 -23
  13. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/executor.py +50 -29
  14. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/isolation.py +33 -12
  15. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/preamble.py +15 -0
  16. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale.py +372 -21
  17. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_families.py +106 -1
  18. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_scanner.py +464 -0
  19. codeprobe-0.2.1/src/codeprobe/mining/sg_ground_truth.py +163 -0
  20. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/writer.py +81 -32
  21. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/models/task.py +1 -0
  22. codeprobe-0.2.1/src/codeprobe/preambles/github.md +21 -0
  23. codeprobe-0.2.1/src/codeprobe/preambles/sourcegraph.md +44 -0
  24. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/probe/generator.py +60 -5
  25. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/probe/writer.py +8 -2
  26. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +6 -6
  27. codeprobe-0.2.1/src/codeprobe.egg-info/PKG-INFO +224 -0
  28. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe.egg-info/SOURCES.txt +5 -0
  29. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_adapters.py +31 -0
  30. codeprobe-0.2.1/tests/test_changed_symbols.py +241 -0
  31. codeprobe-0.2.1/tests/test_cli.py +201 -0
  32. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_executor.py +12 -4
  33. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_init_wizard.py +173 -2
  34. codeprobe-0.2.1/tests/test_mcp_families_mining.py +278 -0
  35. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_new_families.py +29 -3
  36. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_org_scale.py +76 -7
  37. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_preamble.py +36 -13
  38. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_probe.py +351 -0
  39. codeprobe-0.2.1/tests/test_sg_ground_truth.py +318 -0
  40. codeprobe-0.1.7/PKG-INFO +0 -131
  41. codeprobe-0.1.7/README.md +0 -96
  42. codeprobe-0.1.7/src/codeprobe/preambles/sourcegraph.md +0 -32
  43. codeprobe-0.1.7/src/codeprobe.egg-info/PKG-INFO +0 -131
  44. codeprobe-0.1.7/tests/test_cli.py +0 -51
  45. {codeprobe-0.1.7 → codeprobe-0.2.1}/LICENSE +0 -0
  46. {codeprobe-0.1.7 → codeprobe-0.2.1}/setup.cfg +0 -0
  47. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/__main__.py +0 -0
  48. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/__init__.py +0 -0
  49. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/_base.py +0 -0
  50. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/aider.py +0 -0
  51. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/codex.py +0 -0
  52. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/copilot.py +0 -0
  53. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/openai_compat.py +0 -0
  54. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/protocol.py +0 -0
  55. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/session.py +0 -0
  56. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/adapters/telemetry.py +0 -0
  57. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/analysis/__init__.py +0 -0
  58. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/analysis/ranking.py +0 -0
  59. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/analysis/report.py +0 -0
  60. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/analysis/stats.py +0 -0
  61. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/api.py +0 -0
  62. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/assess/__init__.py +0 -0
  63. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/assess/heuristics.py +0 -0
  64. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/assess_cmd.py +0 -0
  65. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/interpret_cmd.py +0 -0
  66. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/ratings_cmd.py +0 -0
  67. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/run_cmd.py +0 -0
  68. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  69. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/cli/yaml_writer.py +0 -0
  70. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/config/__init__.py +0 -0
  71. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/config/loader.py +0 -0
  72. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/__init__.py +0 -0
  73. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/_shared.py +0 -0
  74. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/adaptive.py +0 -0
  75. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/counterfactual.py +0 -0
  76. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/debate.py +0 -0
  77. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/decision_tree.py +0 -0
  78. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/elo.py +0 -0
  79. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/fingerprint.py +0 -0
  80. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/mutation.py +0 -0
  81. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/pareto.py +0 -0
  82. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/sprt.py +0 -0
  83. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/contrib/tournament.py +0 -0
  84. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/__init__.py +0 -0
  85. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/experiment.py +0 -0
  86. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/llm.py +0 -0
  87. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/registry.py +0 -0
  88. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/sandbox.py +0 -0
  89. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/core/scoring.py +0 -0
  90. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/loaders/__init__.py +0 -0
  91. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/__init__.py +0 -0
  92. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/_lang.py +0 -0
  93. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/curator.py +0 -0
  94. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/curator_backends.py +0 -0
  95. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/curator_tiers.py +0 -0
  96. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/extractor.py +0 -0
  97. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  98. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/org_scale_validate.py +0 -0
  99. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/mining/sources.py +0 -0
  100. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/models/__init__.py +0 -0
  101. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/models/evalrc.py +0 -0
  102. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/models/experiment.py +0 -0
  103. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/models/preamble.py +0 -0
  104. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/preambles/__init__.py +0 -0
  105. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/probe/__init__.py +0 -0
  106. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/ratings/__init__.py +0 -0
  107. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/ratings/collector.py +0 -0
  108. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/scaffold/__init__.py +0 -0
  109. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/scaffold/writer.py +0 -0
  110. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/templates/__init__.py +0 -0
  111. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  112. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  113. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  114. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe.egg-info/entry_points.txt +0 -0
  115. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe.egg-info/requires.txt +0 -0
  116. {codeprobe-0.1.7 → codeprobe-0.2.1}/src/codeprobe.egg-info/top_level.txt +0 -0
  117. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_analysis.py +0 -0
  118. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_api.py +0 -0
  119. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_assess.py +0 -0
  120. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_checkpoint.py +0 -0
  121. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_config_loader.py +0 -0
  122. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_contrib.py +0 -0
  123. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_curator_backends.py +0 -0
  124. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_curator_core.py +0 -0
  125. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_curator_integration.py +0 -0
  126. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_curator_tiers.py +0 -0
  127. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_experiment_cmd.py +0 -0
  128. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_experiment_core.py +0 -0
  129. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_llm.py +0 -0
  130. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_loaders.py +0 -0
  131. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_mcp_validate.py +0 -0
  132. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_mining.py +0 -0
  133. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_models.py +0 -0
  134. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_openai_compat.py +0 -0
  135. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_oracle_types.py +0 -0
  136. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_pipeline_integration.py +0 -0
  137. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_ratings.py +0 -0
  138. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_ratings_cmd.py +0 -0
  139. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_registry.py +0 -0
  140. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_scaffold.py +0 -0
  141. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_scanner_refactor.py +0 -0
  142. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_scoring.py +0 -0
  143. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_session.py +0 -0
  144. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_telemetry.py +0 -0
  145. {codeprobe-0.1.7 → codeprobe-0.2.1}/tests/test_weighted_f1.py +0 -0
@@ -0,0 +1,224 @@
1
+ Metadata-Version: 2.4
2
+ Name: codeprobe
3
+ Version: 0.2.1
4
+ Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
+ Author: codeprobe contributors
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/sjarmak/codeprobe
8
+ Project-URL: Repository, https://github.com/sjarmak/codeprobe
9
+ Project-URL: Issues, https://github.com/sjarmak/codeprobe/issues
10
+ Keywords: ai,benchmark,eval,coding-agent,mcp
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Testing
18
+ Requires-Python: >=3.11
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: click<9,>=8.0
22
+ Requires-Dist: pyyaml<7,>=6.0
23
+ Requires-Dist: anthropic>=0.39
24
+ Requires-Dist: openai>=1.66
25
+ Requires-Dist: tiktoken<1,>=0.7
26
+ Requires-Dist: scipy<2,>=1.11
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest<9,>=8.0; extra == "dev"
29
+ Requires-Dist: pytest-cov<6,>=5.0; extra == "dev"
30
+ Requires-Dist: ruff<1,>=0.4; extra == "dev"
31
+ Requires-Dist: mypy<2,>=1.10; extra == "dev"
32
+ Requires-Dist: types-PyYAML<7,>=6.0; extra == "dev"
33
+ Requires-Dist: scipy<2,>=1.11; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # codeprobe
37
+
38
+ Benchmark AI coding agents against **your own codebase**.
39
+
40
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code — not someone else's benchmark suite.
41
+
42
+ ## Why codeprobe?
43
+
44
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
45
+
46
+ ## Prerequisites
47
+
48
+ codeprobe orchestrates external AI coding agents — you need at least one installed:
49
+
50
+ | Agent | Install | Required env var |
51
+ | ------------------ | ------------------------------------------------ | ------------------------------- |
52
+ | **Claude Code** | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
53
+ | **GitHub Copilot** | `npm install -g @github/copilot-cli` (>= 1.0.4) | GitHub auth via `gh auth login` |
54
+ | **Codex** | Included via `pip install codeprobe[codex]` | `OPENAI_API_KEY` |
55
+
56
+ You also need:
57
+
58
+ - **Python 3.11+**
59
+ - **Git** (for task mining and worktree isolation)
60
+ - **GitHub CLI** (`gh`) — optional, for mining tasks from GitHub PRs with linked issues
61
+
62
+ The `assess` and `mine --enrich` commands need an LLM for scoring/enrichment. codeprobe auto-detects the best available backend:
63
+
64
+ | Priority | Backend | Install | Env var |
65
+ | -------- | ------------- | ------------------------------------------------ | ------------------- |
66
+ | 1 | Anthropic SDK | `pip install codeprobe[anthropic]` | `ANTHROPIC_API_KEY` |
67
+ | 2 | OpenAI SDK | `pip install codeprobe[codex]` | `OPENAI_API_KEY` |
68
+ | 3 | Claude CLI | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
69
+
70
+ Override with `CODEPROBE_LLM_BACKEND=anthropic|openai|claude-cli`. Without any backend, `assess` falls back to heuristic scoring.
71
+
72
+ ## Quick Start
73
+
74
+ ```bash
75
+ pip install codeprobe
76
+
77
+ cd /path/to/your/repo
78
+
79
+ codeprobe assess . # Score benchmarking potential (optional)
80
+ codeprobe mine . # Extract tasks from repo history
81
+ codeprobe run . # Run agents against tasks
82
+ codeprobe interpret . # Get recommendations
83
+ ```
84
+
85
+ ## Commands
86
+
87
+ | Command | Purpose |
88
+ | ------------------------ | ------------------------------------------------ |
89
+ | `codeprobe assess` | Score a codebase's benchmarking potential |
90
+ | `codeprobe init` | Interactive wizard — choose what to compare |
91
+ | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
92
+ | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
93
+ | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
94
+ | `codeprobe run` | Execute tasks against AI agents |
95
+ | `codeprobe interpret` | Analyze results, rank configurations |
96
+ | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
97
+ | `codeprobe scaffold` | Create/validate eval task directories |
98
+ | `codeprobe ratings` | Record and analyze agent session quality ratings |
99
+
100
+ ## Two Ways to Generate Tasks
101
+
102
+ ### 1. SDLC Tasks (from merged PRs)
103
+
104
+ Mine real code-change tasks from your git history. Agents must reproduce known fixes and features.
105
+
106
+ ```bash
107
+ codeprobe mine . --count 10 --source github
108
+ codeprobe mine . --count 5 --min-files 4 # Harder tasks (more files changed)
109
+ codeprobe mine . --enrich # LLM-enriched instructions
110
+ ```
111
+
112
+ ### 2. Micro-Benchmark Probes
113
+
114
+ Fast exact-match tasks (30s each) that test code navigation and comprehension — no agent sandbox needed.
115
+
116
+ ```bash
117
+ codeprobe probe . -n 10 -l python -s 42 -o ./probes
118
+ ```
119
+
120
+ Generates four probe types: find-function, count-callers, return-type, module-dependency.
121
+
122
+ ## MCP Comparison Experiments
123
+
124
+ Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
125
+
126
+ ### Mine org-scale comprehension tasks
127
+
128
+ ```bash
129
+ # Set up Sourcegraph credentials
130
+ export SOURCEGRAPH_TOKEN="your-token"
131
+
132
+ # Mine MCP-optimized tasks with Sourcegraph ground truth enrichment
133
+ codeprobe mine /path/to/repo \
134
+ --org-scale --mcp-families --count 5 \
135
+ --no-interactive --no-llm \
136
+ --sg-repo github.com/sg-evals/your-repo
137
+ ```
138
+
139
+ MCP task families: `symbol-reference-trace`, `type-hierarchy-consumers`, `change-scope-audit`.
140
+
141
+ ### Set up the experiment
142
+
143
+ ```bash
144
+ # Create experiment
145
+ codeprobe experiment init /path/to/repo --name mcp-comparison
146
+
147
+ # Copy mined tasks into the experiment
148
+ cp -r /path/to/repo/.codeprobe/tasks/* /path/to/repo/mcp-comparison/tasks/
149
+
150
+ # Baseline config (no MCP, no preamble)
151
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
152
+ --label baseline --agent claude --model claude-haiku-4-5-20251001
153
+
154
+ # Sourcegraph MCP config (preamble + MCP server)
155
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
156
+ --label with-sourcegraph --agent claude --model claude-haiku-4-5-20251001 \
157
+ --preamble sourcegraph \
158
+ --mcp-config '{"mcpServers":{"sourcegraph":{"type":"http","url":"https://sourcegraph.com/.api/mcp/v1","headers":{"Authorization":"token $SOURCEGRAPH_TOKEN"}}}}'
159
+
160
+ # Run and interpret
161
+ codeprobe run /path/to/repo/mcp-comparison --agent claude --max-cost-usd 5.00
162
+ codeprobe interpret /path/to/repo/mcp-comparison
163
+ ```
164
+
165
+ ### Preambles
166
+
167
+ Preambles are composable instruction templates prepended to the agent's prompt for MCP-enabled configs. Built-in preambles: `sourcegraph`, `github`.
168
+
169
+ Override built-ins by placing a `.md` file in:
170
+
171
+ - `<task_dir>/preambles/` (per-task)
172
+ - `.codeprobe/preambles/` (project-level)
173
+ - `~/.codeprobe/preambles/` (user-level)
174
+
175
+ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}}`
176
+
177
+ ## Key Flags
178
+
179
+ ```bash
180
+ # Running
181
+ codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
182
+ codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
183
+ codeprobe run . --dry-run # Estimate resource usage without running
184
+
185
+ # Mining
186
+ codeprobe mine . --enrich # Use LLM to improve weak task instructions
187
+ codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
188
+ codeprobe mine . --mcp-families # Include MCP-optimized task families
189
+ codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
190
+
191
+ # Experiment configs
192
+ codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
193
+ codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
194
+
195
+ # Output
196
+ codeprobe interpret . --format csv # Export for pivot tables
197
+ codeprobe interpret . --format html # Self-contained HTML report
198
+ ```
199
+
200
+ ## Supported Agents
201
+
202
+ - **Claude Code** (`--agent claude`) — headless via `claude -p`
203
+ - **GitHub Copilot** (`--agent copilot`) — via Copilot CLI
204
+ - **Codex** (`--agent codex`) — via OpenAI API
205
+ - Custom agents via the `AgentAdapter` protocol
206
+
207
+ ## Supported Git Hosts
208
+
209
+ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
210
+
211
+ ## Configuration
212
+
213
+ Create a `.evalrc.yaml` in your repo root:
214
+
215
+ ```yaml
216
+ name: my-experiment
217
+ agents: [claude, copilot]
218
+ models: [claude-sonnet-4-6, claude-opus-4-6]
219
+ tasks_dir: .codeprobe/tasks
220
+ ```
221
+
222
+ ## License
223
+
224
+ Apache-2.0
@@ -0,0 +1,189 @@
1
+ # codeprobe
2
+
3
+ Benchmark AI coding agents against **your own codebase**.
4
+
5
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code — not someone else's benchmark suite.
6
+
7
+ ## Why codeprobe?
8
+
9
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
10
+
11
+ ## Prerequisites
12
+
13
+ codeprobe orchestrates external AI coding agents — you need at least one installed:
14
+
15
+ | Agent | Install | Required env var |
16
+ | ------------------ | ------------------------------------------------ | ------------------------------- |
17
+ | **Claude Code** | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
18
+ | **GitHub Copilot** | `npm install -g @github/copilot-cli` (>= 1.0.4) | GitHub auth via `gh auth login` |
19
+ | **Codex** | Included via `pip install codeprobe[codex]` | `OPENAI_API_KEY` |
20
+
21
+ You also need:
22
+
23
+ - **Python 3.11+**
24
+ - **Git** (for task mining and worktree isolation)
25
+ - **GitHub CLI** (`gh`) — optional, for mining tasks from GitHub PRs with linked issues
26
+
27
+ The `assess` and `mine --enrich` commands need an LLM for scoring/enrichment. codeprobe auto-detects the best available backend:
28
+
29
+ | Priority | Backend | Install | Env var |
30
+ | -------- | ------------- | ------------------------------------------------ | ------------------- |
31
+ | 1 | Anthropic SDK | `pip install codeprobe[anthropic]` | `ANTHROPIC_API_KEY` |
32
+ | 2 | OpenAI SDK | `pip install codeprobe[codex]` | `OPENAI_API_KEY` |
33
+ | 3 | Claude CLI | [claude.ai/download](https://claude.ai/download) | `ANTHROPIC_API_KEY` |
34
+
35
+ Override with `CODEPROBE_LLM_BACKEND=anthropic|openai|claude-cli`. Without any backend, `assess` falls back to heuristic scoring.
36
+
37
+ ## Quick Start
38
+
39
+ ```bash
40
+ pip install codeprobe
41
+
42
+ cd /path/to/your/repo
43
+
44
+ codeprobe assess . # Score benchmarking potential (optional)
45
+ codeprobe mine . # Extract tasks from repo history
46
+ codeprobe run . # Run agents against tasks
47
+ codeprobe interpret . # Get recommendations
48
+ ```
49
+
50
+ ## Commands
51
+
52
+ | Command | Purpose |
53
+ | ------------------------ | ------------------------------------------------ |
54
+ | `codeprobe assess` | Score a codebase's benchmarking potential |
55
+ | `codeprobe init` | Interactive wizard — choose what to compare |
56
+ | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
57
+ | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
58
+ | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
59
+ | `codeprobe run` | Execute tasks against AI agents |
60
+ | `codeprobe interpret` | Analyze results, rank configurations |
61
+ | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
62
+ | `codeprobe scaffold` | Create/validate eval task directories |
63
+ | `codeprobe ratings` | Record and analyze agent session quality ratings |
64
+
65
+ ## Two Ways to Generate Tasks
66
+
67
+ ### 1. SDLC Tasks (from merged PRs)
68
+
69
+ Mine real code-change tasks from your git history. Agents must reproduce known fixes and features.
70
+
71
+ ```bash
72
+ codeprobe mine . --count 10 --source github
73
+ codeprobe mine . --count 5 --min-files 4 # Harder tasks (more files changed)
74
+ codeprobe mine . --enrich # LLM-enriched instructions
75
+ ```
76
+
77
+ ### 2. Micro-Benchmark Probes
78
+
79
+ Fast exact-match tasks (30s each) that test code navigation and comprehension — no agent sandbox needed.
80
+
81
+ ```bash
82
+ codeprobe probe . -n 10 -l python -s 42 -o ./probes
83
+ ```
84
+
85
+ Generates four probe types: find-function, count-callers, return-type, module-dependency.
86
+
87
+ ## MCP Comparison Experiments
88
+
89
+ Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
90
+
91
+ ### Mine org-scale comprehension tasks
92
+
93
+ ```bash
94
+ # Set up Sourcegraph credentials
95
+ export SOURCEGRAPH_TOKEN="your-token"
96
+
97
+ # Mine MCP-optimized tasks with Sourcegraph ground truth enrichment
98
+ codeprobe mine /path/to/repo \
99
+ --org-scale --mcp-families --count 5 \
100
+ --no-interactive --no-llm \
101
+ --sg-repo github.com/sg-evals/your-repo
102
+ ```
103
+
104
+ MCP task families: `symbol-reference-trace`, `type-hierarchy-consumers`, `change-scope-audit`.
105
+
106
+ ### Set up the experiment
107
+
108
+ ```bash
109
+ # Create experiment
110
+ codeprobe experiment init /path/to/repo --name mcp-comparison
111
+
112
+ # Copy mined tasks into the experiment
113
+ cp -r /path/to/repo/.codeprobe/tasks/* /path/to/repo/mcp-comparison/tasks/
114
+
115
+ # Baseline config (no MCP, no preamble)
116
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
117
+ --label baseline --agent claude --model claude-haiku-4-5-20251001
118
+
119
+ # Sourcegraph MCP config (preamble + MCP server)
120
+ codeprobe experiment add-config /path/to/repo/mcp-comparison \
121
+ --label with-sourcegraph --agent claude --model claude-haiku-4-5-20251001 \
122
+ --preamble sourcegraph \
123
+ --mcp-config '{"mcpServers":{"sourcegraph":{"type":"http","url":"https://sourcegraph.com/.api/mcp/v1","headers":{"Authorization":"token $SOURCEGRAPH_TOKEN"}}}}'
124
+
125
+ # Run and interpret
126
+ codeprobe run /path/to/repo/mcp-comparison --agent claude --max-cost-usd 5.00
127
+ codeprobe interpret /path/to/repo/mcp-comparison
128
+ ```
129
+
130
+ ### Preambles
131
+
132
+ Preambles are composable instruction templates prepended to the agent's prompt for MCP-enabled configs. Built-in preambles: `sourcegraph`, `github`.
133
+
134
+ Override built-ins by placing a `.md` file in:
135
+
136
+ - `<task_dir>/preambles/` (per-task)
137
+ - `.codeprobe/preambles/` (project-level)
138
+ - `~/.codeprobe/preambles/` (user-level)
139
+
140
+ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}}`
141
+
142
+ ## Key Flags
143
+
144
+ ```bash
145
+ # Running
146
+ codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
147
+ codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
148
+ codeprobe run . --dry-run # Estimate resource usage without running
149
+
150
+ # Mining
151
+ codeprobe mine . --enrich # Use LLM to improve weak task instructions
152
+ codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
153
+ codeprobe mine . --mcp-families # Include MCP-optimized task families
154
+ codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
155
+
156
+ # Experiment configs
157
+ codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
158
+ codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
159
+
160
+ # Output
161
+ codeprobe interpret . --format csv # Export for pivot tables
162
+ codeprobe interpret . --format html # Self-contained HTML report
163
+ ```
164
+
165
+ ## Supported Agents
166
+
167
+ - **Claude Code** (`--agent claude`) — headless via `claude -p`
168
+ - **GitHub Copilot** (`--agent copilot`) — via Copilot CLI
169
+ - **Codex** (`--agent codex`) — via OpenAI API
170
+ - Custom agents via the `AgentAdapter` protocol
171
+
172
+ ## Supported Git Hosts
173
+
174
+ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
175
+
176
+ ## Configuration
177
+
178
+ Create a `.evalrc.yaml` in your repo root:
179
+
180
+ ```yaml
181
+ name: my-experiment
182
+ agents: [claude, copilot]
183
+ models: [claude-sonnet-4-6, claude-opus-4-6]
184
+ tasks_dir: .codeprobe/tasks
185
+ ```
186
+
187
+ ## License
188
+
189
+ Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "codeprobe"
3
- version = "0.1.7"
3
+ version = "0.2.1"
4
4
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -1,3 +1,3 @@
1
1
  """codeprobe — Benchmark AI coding agents against your own codebase."""
2
2
 
3
- __version__ = "0.1.7"
3
+ __version__ = "0.2.0"
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import json
6
6
  import re
7
+ import shutil
7
8
  import subprocess
8
9
  import tempfile
9
10
  from pathlib import Path
@@ -69,16 +70,31 @@ class ClaudeAdapter(BaseAdapter):
69
70
 
70
71
  mcp_path = self._write_mcp_config(config)
71
72
  if mcp_path:
72
- cmd.extend(["--mcp-config", mcp_path])
73
+ cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
73
74
 
74
75
  return cmd
75
76
 
76
77
  def isolate_session(self, slot_id: int) -> dict[str, str]:
77
- """Return a per-slot CLAUDE_CONFIG_DIR for session isolation."""
78
+ """Return a per-slot CLAUDE_CONFIG_DIR for session isolation.
79
+
80
+ Copies authentication credentials from the real ``~/.claude/``
81
+ directory so the agent subprocess can authenticate.
82
+ """
78
83
  config_dir = (
79
84
  Path(tempfile.gettempdir()) / "codeprobe-claude" / f"slot-{slot_id}"
80
85
  )
81
86
  config_dir.mkdir(parents=True, exist_ok=True)
87
+
88
+ # Copy auth credentials from the user's real config dir.
89
+ # Without these the subprocess gets "Not logged in".
90
+ real_config = Path.home() / ".claude"
91
+ if real_config.is_dir():
92
+ for name in ("credentials.json", ".credentials.json"):
93
+ src = real_config / name
94
+ dst = config_dir / name
95
+ if src.is_file():
96
+ shutil.copy2(src, dst)
97
+
82
98
  return {"CLAUDE_CONFIG_DIR": str(config_dir)}
83
99
 
84
100
  def parse_output(
@@ -1,18 +1,89 @@
1
1
  """CLI entry point for codeprobe."""
2
2
 
3
+ import json as _json
4
+ import logging
5
+ import sys
6
+
3
7
  import click
4
8
 
5
9
  from codeprobe import __version__
6
10
 
7
11
 
12
+ class _JsonFormatter(logging.Formatter):
13
+ """Emit one JSON object per log line."""
14
+
15
+ def format(self, record: logging.LogRecord) -> str:
16
+ payload = {
17
+ "level": record.levelname,
18
+ "logger": record.name,
19
+ "message": record.getMessage(),
20
+ "timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
21
+ }
22
+ return _json.dumps(payload)
23
+
24
+
25
+ def _configure_logging(verbose: int, quiet: bool, log_format: str = "text") -> None:
26
+ """Configure namespace-scoped logging for codeprobe.* modules.
27
+
28
+ Attaches a StreamHandler to `logging.getLogger("codeprobe")` so that
29
+ all 26+ codeprobe.* modules emit through hierarchy without touching
30
+ third-party loggers (httpx, urllib3, etc.).
31
+ """
32
+ if quiet:
33
+ level = logging.WARNING
34
+ elif verbose >= 1:
35
+ level = logging.DEBUG
36
+ else:
37
+ level = logging.INFO
38
+
39
+ logger = logging.getLogger("codeprobe")
40
+ logger.setLevel(level)
41
+ logger.propagate = False # don't bubble to root
42
+
43
+ # Idempotent: tests / repeat invocations must not duplicate handlers.
44
+ for h in list(logger.handlers):
45
+ logger.removeHandler(h)
46
+
47
+ handler = logging.StreamHandler(sys.stderr)
48
+ if log_format == "json":
49
+ handler.setFormatter(_JsonFormatter())
50
+ elif verbose >= 1:
51
+ fmt = "%(levelname)s %(name)s: %(message)s"
52
+ handler.setFormatter(logging.Formatter(fmt))
53
+ else:
54
+ fmt = "%(levelname)s: %(message)s"
55
+ handler.setFormatter(logging.Formatter(fmt))
56
+ logger.addHandler(handler)
57
+
58
+
8
59
  @click.group()
60
+ @click.option(
61
+ "-v",
62
+ "--verbose",
63
+ count=True,
64
+ help="Increase log verbosity (-v sets DEBUG).",
65
+ )
66
+ @click.option(
67
+ "-q",
68
+ "--quiet",
69
+ is_flag=True,
70
+ default=False,
71
+ help="Suppress INFO logs (WARNING and above only).",
72
+ )
73
+ @click.option(
74
+ "--log-format",
75
+ type=click.Choice(["text", "json"]),
76
+ default="text",
77
+ help="Log output format (default: text). 'json' emits one JSON object per line.",
78
+ )
9
79
  @click.version_option(version=__version__, prog_name="codeprobe")
10
- def main() -> None:
80
+ def main(verbose: int, quiet: bool, log_format: str) -> None:
11
81
  """Benchmark AI coding agents against your own codebase.
12
82
 
13
83
  Mine real tasks from your repo history, run agents against them,
14
84
  and interpret the results to find which setup works best for YOUR code.
15
85
  """
86
+ _configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
16
87
 
17
88
 
18
89
  @main.command()
@@ -121,6 +192,20 @@ def init(path: str) -> None:
121
192
  default=False,
122
193
  help="Run LLM verification on curated ground truth.",
123
194
  )
195
+ @click.option(
196
+ "--mcp-families",
197
+ is_flag=True,
198
+ default=False,
199
+ help="Include MCP-advantaged task families (symbol-reference-trace, "
200
+ "type-hierarchy-consumers, change-scope-audit). Only with --org-scale.",
201
+ )
202
+ @click.option(
203
+ "--sg-repo",
204
+ default="",
205
+ help="Sourcegraph repo identifier for ground truth enrichment "
206
+ "(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
207
+ "when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
208
+ )
124
209
  def mine(
125
210
  path: str,
126
211
  count: int,
@@ -139,6 +224,8 @@ def mine(
139
224
  curate: bool,
140
225
  backends: tuple[str, ...],
141
226
  verify_curation_flag: bool,
227
+ mcp_families: bool,
228
+ sg_repo: str,
142
229
  ) -> None:
143
230
  """Mine eval tasks from a repository's history.
144
231
 
@@ -175,6 +262,8 @@ def mine(
175
262
  curate=curate,
176
263
  backends=backends,
177
264
  verify_curation_flag=verify_curation_flag,
265
+ mcp_families=mcp_families,
266
+ sg_repo=sg_repo,
178
267
  )
179
268
 
180
269
 
@@ -272,6 +361,20 @@ def init_experiment(path: str, name: str, description: str) -> None:
272
361
  @click.option(
273
362
  "--mcp-config", default=None, help="MCP config as JSON string or file path."
274
363
  )
364
+ @click.option(
365
+ "--instruction-variant",
366
+ default=None,
367
+ help="Instruction file variant (e.g., instruction_mcp.md). Default: instruction.md.",
368
+ )
369
+ @click.option(
370
+ "--preamble",
371
+ "preambles",
372
+ multiple=True,
373
+ help=(
374
+ "Preamble to prepend to the instruction. Repeatable. "
375
+ "Built-ins: sourcegraph, github. Or path to a custom .md file."
376
+ ),
377
+ )
275
378
  def add_config(
276
379
  path: str,
277
380
  label: str,
@@ -279,6 +382,8 @@ def add_config(
279
382
  model: str | None,
280
383
  permission_mode: str,
281
384
  mcp_config: str | None,
385
+ instruction_variant: str | None,
386
+ preambles: tuple[str, ...],
282
387
  ) -> None:
283
388
  """Add a configuration to an existing experiment."""
284
389
  from codeprobe.cli.experiment_cmd import experiment_add_config
@@ -290,6 +395,8 @@ def add_config(
290
395
  model=model,
291
396
  permission_mode=permission_mode,
292
397
  mcp_config_str=mcp_config,
398
+ instruction_variant=instruction_variant,
399
+ preambles=preambles,
293
400
  )
294
401
 
295
402
 
@@ -63,6 +63,8 @@ def experiment_add_config(
63
63
  model: str | None,
64
64
  permission_mode: str,
65
65
  mcp_config_str: str | None,
66
+ instruction_variant: str | None = None,
67
+ preambles: tuple[str, ...] = (),
66
68
  ) -> None:
67
69
  """Add a configuration to an existing experiment."""
68
70
  exp_dir = Path(path)
@@ -104,6 +106,8 @@ def experiment_add_config(
104
106
  model=model,
105
107
  permission_mode=permission_mode,
106
108
  mcp_config=mcp_config,
109
+ instruction_variant=instruction_variant,
110
+ preambles=preambles,
107
111
  )
108
112
 
109
113
  # Validate the label is a safe path component