codeprobe 0.3.5__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. {codeprobe-0.3.5 → codeprobe-0.3.7}/PKG-INFO +24 -2
  2. {codeprobe-0.3.5 → codeprobe-0.3.7}/README.md +23 -1
  3. {codeprobe-0.3.5 → codeprobe-0.3.7}/pyproject.toml +4 -1
  4. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/__init__.py +1 -1
  5. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/_base.py +50 -3
  6. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/claude.py +1 -0
  7. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/protocol.py +8 -0
  8. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/telemetry.py +26 -0
  9. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/api.py +11 -2
  10. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/assess/heuristics.py +21 -7
  11. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/__init__.py +148 -40
  12. codeprobe-0.3.7/src/codeprobe/cli/auth_cmd.py +81 -0
  13. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/init_cmd.py +27 -9
  14. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/mine_cmd.py +389 -66
  15. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/rich_display.py +10 -2
  16. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/run_cmd.py +24 -4
  17. codeprobe-0.3.7/src/codeprobe/config/redact.py +45 -0
  18. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/executor.py +115 -3
  19. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/experiment.py +8 -1
  20. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/isolation.py +138 -0
  21. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/extractor.py +76 -5
  22. codeprobe-0.3.7/src/codeprobe/mining/multi_repo.py +499 -0
  23. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/org_scale.py +19 -16
  24. codeprobe-0.3.7/src/codeprobe/mining/sg_auth.py +318 -0
  25. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/sg_ground_truth.py +103 -28
  26. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/writer.py +130 -6
  27. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/models/experiment.py +15 -0
  28. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/models/task.py +37 -0
  29. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe.egg-info/PKG-INFO +24 -2
  30. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe.egg-info/SOURCES.txt +12 -0
  31. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_adapters.py +149 -1
  32. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_assess.py +90 -11
  33. codeprobe-0.3.7/tests/test_auth_cmd.py +149 -0
  34. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_cli.py +101 -0
  35. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_executor.py +303 -7
  36. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_init_wizard.py +2 -2
  37. codeprobe-0.3.7/tests/test_isolation.py +179 -0
  38. codeprobe-0.3.7/tests/test_mine_cli.py +138 -0
  39. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_mine_goals.py +383 -49
  40. codeprobe-0.3.7/tests/test_mine_presets.py +309 -0
  41. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_mining.py +251 -0
  42. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_models.py +10 -0
  43. codeprobe-0.3.7/tests/test_multi_repo_e2e.py +465 -0
  44. codeprobe-0.3.7/tests/test_multi_repo_mining.py +225 -0
  45. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_org_scale.py +134 -0
  46. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_pipeline_integration.py +6 -6
  47. codeprobe-0.3.7/tests/test_secret_redaction.py +234 -0
  48. codeprobe-0.3.7/tests/test_sg_auth.py +307 -0
  49. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_sg_ground_truth.py +141 -29
  50. codeprobe-0.3.7/tests/test_suite_manifest.py +142 -0
  51. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_telemetry.py +36 -0
  52. codeprobe-0.3.5/tests/test_mine_presets.py +0 -163
  53. {codeprobe-0.3.5 → codeprobe-0.3.7}/LICENSE +0 -0
  54. {codeprobe-0.3.5 → codeprobe-0.3.7}/setup.cfg +0 -0
  55. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/__main__.py +0 -0
  56. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/__init__.py +0 -0
  57. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/codex.py +0 -0
  58. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/copilot.py +0 -0
  59. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/openai_compat.py +0 -0
  60. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/adapters/session.py +0 -0
  61. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/analysis/__init__.py +0 -0
  62. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/analysis/ranking.py +0 -0
  63. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/analysis/report.py +0 -0
  64. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/analysis/stats.py +0 -0
  65. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/assess/__init__.py +0 -0
  66. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/assess_cmd.py +0 -0
  67. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/doctor_cmd.py +0 -0
  68. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/experiment_cmd.py +0 -0
  69. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/interpret_cmd.py +0 -0
  70. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/json_display.py +0 -0
  71. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/preamble_cmd.py +0 -0
  72. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/probe_cmd.py +0 -0
  73. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/ratings_cmd.py +0 -0
  74. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  75. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/validate_cmd.py +0 -0
  76. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/wizard.py +0 -0
  77. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/cli/yaml_writer.py +0 -0
  78. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/config/__init__.py +0 -0
  79. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/config/loader.py +0 -0
  80. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/__init__.py +0 -0
  81. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/_shared.py +0 -0
  82. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/adaptive.py +0 -0
  83. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/counterfactual.py +0 -0
  84. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/debate.py +0 -0
  85. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/decision_tree.py +0 -0
  86. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/elo.py +0 -0
  87. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/fingerprint.py +0 -0
  88. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/mutation.py +0 -0
  89. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/pareto.py +0 -0
  90. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/sprt.py +0 -0
  91. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/contrib/tournament.py +0 -0
  92. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/__init__.py +0 -0
  93. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/__main__.py +0 -0
  94. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/checkpoint.py +0 -0
  95. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/events.py +0 -0
  96. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/llm.py +0 -0
  97. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/mcp_discovery.py +0 -0
  98. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/preamble.py +0 -0
  99. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/registry.py +0 -0
  100. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/sandbox.py +0 -0
  101. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/core/scoring.py +0 -0
  102. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/loaders/__init__.py +0 -0
  103. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/loaders/suite.py +0 -0
  104. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/__init__.py +0 -0
  105. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/_graph.py +0 -0
  106. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/_lang.py +0 -0
  107. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/comprehension.py +0 -0
  108. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/comprehension_writer.py +0 -0
  109. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/curator.py +0 -0
  110. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/curator_backends.py +0 -0
  111. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/curator_tiers.py +0 -0
  112. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/org_scale_families.py +0 -0
  113. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  114. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/org_scale_scanner.py +0 -0
  115. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/org_scale_validate.py +0 -0
  116. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/mining/sources.py +0 -0
  117. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/models/__init__.py +0 -0
  118. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/models/evalrc.py +0 -0
  119. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/models/preamble.py +0 -0
  120. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/models/suite.py +0 -0
  121. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/preambles/__init__.py +0 -0
  122. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/preambles/github.md +0 -0
  123. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/preambles/sourcegraph.md +0 -0
  124. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/probe/__init__.py +0 -0
  125. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/probe/adapter.py +0 -0
  126. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/probe/generator.py +0 -0
  127. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/probe/writer.py +0 -0
  128. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/ratings/__init__.py +0 -0
  129. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/ratings/collector.py +0 -0
  130. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/scaffold/__init__.py +0 -0
  131. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/scaffold/writer.py +0 -0
  132. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/templates/__init__.py +0 -0
  133. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
  134. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  135. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  136. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  137. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe.egg-info/entry_points.txt +0 -0
  138. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe.egg-info/requires.txt +0 -0
  139. {codeprobe-0.3.5 → codeprobe-0.3.7}/src/codeprobe.egg-info/top_level.txt +0 -0
  140. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_adapter_contracts.py +0 -0
  141. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_analysis.py +0 -0
  142. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_api.py +0 -0
  143. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_artifact_scorer.py +0 -0
  144. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_changed_symbols.py +0 -0
  145. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_checkpoint.py +0 -0
  146. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_checkpoint_scoring.py +0 -0
  147. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_comprehension.py +0 -0
  148. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_config_loader.py +0 -0
  149. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_contrib.py +0 -0
  150. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_ctrlc_integration.py +0 -0
  151. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_curator_backends.py +0 -0
  152. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_curator_core.py +0 -0
  153. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_curator_integration.py +0 -0
  154. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_curator_tiers.py +0 -0
  155. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_doctor_cmd.py +0 -0
  156. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_events.py +0 -0
  157. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_executor_events.py +0 -0
  158. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_experiment_cmd.py +0 -0
  159. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_experiment_core.py +0 -0
  160. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_json_display.py +0 -0
  161. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_llm.py +0 -0
  162. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_loaders.py +0 -0
  163. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_mcp_families_mining.py +0 -0
  164. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_mcp_validate.py +0 -0
  165. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_mine_profiles.py +0 -0
  166. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_new_families.py +0 -0
  167. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_openai_compat.py +0 -0
  168. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_oracle_types.py +0 -0
  169. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_preamble.py +0 -0
  170. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_preamble_cmd.py +0 -0
  171. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_probe.py +0 -0
  172. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_probe_adapter.py +0 -0
  173. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_ratings.py +0 -0
  174. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_ratings_cmd.py +0 -0
  175. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_registry.py +0 -0
  176. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_run_config_resolution.py +0 -0
  177. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_scaffold.py +0 -0
  178. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_scanner_refactor.py +0 -0
  179. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_scoring.py +0 -0
  180. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_session.py +0 -0
  181. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_shell_shim.py +0 -0
  182. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_show_prompt.py +0 -0
  183. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_suite.py +0 -0
  184. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_validate_cmd.py +0 -0
  185. {codeprobe-0.3.5 → codeprobe-0.3.7}/tests/test_weighted_f1.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeprobe
3
- Version: 0.3.5
3
+ Version: 0.3.7
4
4
  Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
5
  Author: codeprobe contributors
6
6
  License-Expression: Apache-2.0
@@ -42,7 +42,7 @@ Mine real tasks from your repo history, run agents against them, and find out wh
42
42
 
43
43
  ## Why codeprobe?
44
44
 
45
- Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
45
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
46
46
 
47
47
  ## Prerequisites
48
48
 
@@ -122,6 +122,28 @@ codeprobe probe . -n 10 -l python -s 42 -o ./probes
122
122
 
123
123
  Generates four probe types: find-function, count-callers, return-type, module-dependency.
124
124
 
125
+ ## Curation Workflows
126
+
127
+ End-to-end flows from a raw repo to ranked agent results. Each workflow covers the full `assess → mine → validate → run → interpret` pipeline.
128
+
129
+ | Workflow | When to use | Guide |
130
+ | -------------- | ----------------------------------------- | ------------------------------------------------------------ |
131
+ | **Standard** | Repo has merged PRs/MRs | [docs/workflows/standard.md](docs/workflows/standard.md) |
132
+ | **Cold-start** | New repo, squashed history, vendored code | [docs/workflows/cold-start.md](docs/workflows/cold-start.md) |
133
+ | **Cross-repo** | Tasks spanning multiple repositories | [docs/workflows/cross-repo.md](docs/workflows/cross-repo.md) |
134
+
135
+ **Quick start (standard path):**
136
+
137
+ ```bash
138
+ codeprobe assess /path/to/repo
139
+ codeprobe mine /path/to/repo --goal quality --count 10 --no-interactive
140
+ codeprobe validate /path/to/repo/.codeprobe/tasks/<task-id>
141
+ codeprobe run /path/to/repo --agent claude --max-cost-usd 5.00
142
+ codeprobe interpret /path/to/repo
143
+ ```
144
+
145
+ For the full MCP comparison setup (preambles, baseline vs with-MCP configs), see the next section.
146
+
125
147
  ## MCP Comparison Experiments
126
148
 
127
149
  Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
@@ -6,7 +6,7 @@ Mine real tasks from your repo history, run agents against them, and find out wh
6
6
 
7
7
  ## Why codeprobe?
8
8
 
9
- Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
9
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
10
10
 
11
11
  ## Prerequisites
12
12
 
@@ -86,6 +86,28 @@ codeprobe probe . -n 10 -l python -s 42 -o ./probes
86
86
 
87
87
  Generates four probe types: find-function, count-callers, return-type, module-dependency.
88
88
 
89
+ ## Curation Workflows
90
+
91
+ End-to-end flows from a raw repo to ranked agent results. Each workflow covers the full `assess → mine → validate → run → interpret` pipeline.
92
+
93
+ | Workflow | When to use | Guide |
94
+ | -------------- | ----------------------------------------- | ------------------------------------------------------------ |
95
+ | **Standard** | Repo has merged PRs/MRs | [docs/workflows/standard.md](docs/workflows/standard.md) |
96
+ | **Cold-start** | New repo, squashed history, vendored code | [docs/workflows/cold-start.md](docs/workflows/cold-start.md) |
97
+ | **Cross-repo** | Tasks spanning multiple repositories | [docs/workflows/cross-repo.md](docs/workflows/cross-repo.md) |
98
+
99
+ **Quick start (standard path):**
100
+
101
+ ```bash
102
+ codeprobe assess /path/to/repo
103
+ codeprobe mine /path/to/repo --goal quality --count 10 --no-interactive
104
+ codeprobe validate /path/to/repo/.codeprobe/tasks/<task-id>
105
+ codeprobe run /path/to/repo --agent claude --max-cost-usd 5.00
106
+ codeprobe interpret /path/to/repo
107
+ ```
108
+
109
+ For the full MCP comparison setup (preambles, baseline vs with-MCP configs), see the next section.
110
+
89
111
  ## MCP Comparison Experiments
90
112
 
91
113
  Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "codeprobe"
3
- version = "0.3.5"
3
+ version = "0.3.7"
4
4
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -75,6 +75,9 @@ where = ["src"]
75
75
 
76
76
  [tool.pytest.ini_options]
77
77
  testpaths = ["tests"]
78
+ markers = [
79
+ "integration: requires external services (skipped by default in CI)",
80
+ ]
78
81
 
79
82
  [tool.mypy]
80
83
  python_version = "3.11"
@@ -1,3 +1,3 @@
1
1
  """codeprobe — Benchmark AI coding agents against your own codebase."""
2
2
 
3
- __version__ = "0.3.1"
3
+ __version__ = "0.3.7"
@@ -63,6 +63,19 @@ def _adapter_safe_env(extra: dict[str, str] | None = None) -> dict[str, str]:
63
63
  return env
64
64
 
65
65
 
66
+ def _decode_timeout_output(raw: str | bytes | None) -> str:
67
+ """Decode stdout/stderr from a TimeoutExpired exception.
68
+
69
+ The exception may carry ``str``, ``bytes``, or ``None`` depending on
70
+ how ``subprocess.run`` was called and how the process was killed.
71
+ """
72
+ if raw is None:
73
+ return ""
74
+ if isinstance(raw, bytes):
75
+ return raw.decode("utf-8", errors="replace")
76
+ return raw
77
+
78
+
66
79
  class BaseAdapter:
67
80
  """Base class for CLI-based agent adapters.
68
81
 
@@ -162,12 +175,46 @@ class BaseAdapter:
162
175
  )
163
176
  except subprocess.TimeoutExpired as exc:
164
177
  duration = time.monotonic() - start
178
+ timeout_error = f"Agent timed out after {config.timeout_seconds}s"
179
+
180
+ raw_stdout = _decode_timeout_output(exc.stdout)
181
+ raw_stderr = _decode_timeout_output(exc.stderr) or None
182
+
183
+ if raw_stdout:
184
+ try:
185
+ partial_result = subprocess.CompletedProcess(
186
+ args=cmd,
187
+ returncode=-1,
188
+ stdout=raw_stdout,
189
+ stderr=raw_stderr or "",
190
+ )
191
+ parsed = self.parse_output(partial_result, duration)
192
+ merged_error = timeout_error
193
+ if parsed.error:
194
+ merged_error = f"{timeout_error}; {parsed.error}"
195
+ return AgentOutput(
196
+ stdout=parsed.stdout,
197
+ stderr=parsed.stderr,
198
+ exit_code=-1,
199
+ duration_seconds=duration,
200
+ input_tokens=parsed.input_tokens,
201
+ output_tokens=parsed.output_tokens,
202
+ cache_read_tokens=parsed.cache_read_tokens,
203
+ cost_usd=parsed.cost_usd,
204
+ cost_model=parsed.cost_model,
205
+ cost_source=parsed.cost_source,
206
+ error=merged_error,
207
+ tool_call_count=parsed.tool_call_count,
208
+ )
209
+ except Exception as parse_exc:
210
+ timeout_error = f"{timeout_error}; parse_output failed: {parse_exc}"
211
+
165
212
  return AgentOutput(
166
- stdout=exc.stdout if isinstance(exc.stdout, str) else "",
167
- stderr=exc.stderr if isinstance(exc.stderr, str) else None,
213
+ stdout=raw_stdout,
214
+ stderr=raw_stderr,
168
215
  exit_code=-1,
169
216
  duration_seconds=duration,
170
- error=f"Agent timed out after {config.timeout_seconds}s",
217
+ error=timeout_error,
171
218
  )
172
219
  except FileNotFoundError as exc:
173
220
  raise AdapterSetupError(f"Binary not found at runtime: {exc}") from exc
@@ -132,4 +132,5 @@ class ClaudeAdapter(BaseAdapter):
132
132
  cost_model=usage.cost_model,
133
133
  cost_source=usage.cost_source,
134
134
  error=usage.error,
135
+ tool_call_count=usage.tool_call_count,
135
136
  )
@@ -44,6 +44,7 @@ class AgentOutput:
44
44
  cost_model: str = "unknown"
45
45
  error: str | None = None
46
46
  cost_source: str = "unavailable"
47
+ tool_call_count: int | None = None
47
48
 
48
49
  def __post_init__(self) -> None:
49
50
  if self.cost_model not in ALLOWED_COST_MODELS:
@@ -81,6 +82,13 @@ class AgentAdapter(Protocol):
81
82
 
82
83
  [project.entry-points."codeprobe.agents"]
83
84
  myagent = "my_package:MyAgentAdapter"
85
+
86
+ For cross-repo tasks, the executor may lay out additional
87
+ repositories under ``<workspace>/repos/<name>``, each pinned to its
88
+ own pre-merge commit. Adapters don't need special handling — the
89
+ paths are available for the model to navigate, and the primary
90
+ workspace remains at its existing location for backwards
91
+ compatibility with single-repo tasks.
84
92
  """
85
93
 
86
94
  @property
@@ -65,6 +65,7 @@ class UsageData:
65
65
  cost_model: str = "unknown"
66
66
  cost_source: str = "unavailable"
67
67
  error: str | None = None
68
+ tool_call_count: int | None = None
68
69
 
69
70
  def __post_init__(self) -> None:
70
71
  if self.cost_model not in ALLOWED_COST_MODELS:
@@ -86,6 +87,28 @@ class TelemetryCollector(Protocol):
86
87
  def collect(self, raw_output: str, **context: Any) -> UsageData: ...
87
88
 
88
89
 
90
+ def _count_tool_use_blocks(envelope: dict[str, Any]) -> int | None:
91
+ """Count ``tool_use`` content blocks in a Claude CLI JSON envelope.
92
+
93
+ Iterates the ``messages`` array (when present) and counts content
94
+ blocks with ``type == "tool_use"`` in assistant messages.
95
+ Returns ``None`` when the envelope has no ``messages`` key.
96
+ """
97
+ messages = envelope.get("messages")
98
+ if messages is None:
99
+ return None
100
+
101
+ count = 0
102
+ for msg in messages:
103
+ content = msg.get("content")
104
+ if not isinstance(content, list):
105
+ continue
106
+ for block in content:
107
+ if isinstance(block, dict) and block.get("type") == "tool_use":
108
+ count += 1
109
+ return count
110
+
111
+
89
112
  class JsonStdoutCollector:
90
113
  """Extract telemetry from Claude CLI JSON envelope on stdout.
91
114
 
@@ -125,6 +148,8 @@ class JsonStdoutCollector:
125
148
  cost_model = "unknown"
126
149
  cost_source = "unavailable"
127
150
 
151
+ tool_call_count = _count_tool_use_blocks(envelope)
152
+
128
153
  return UsageData(
129
154
  input_tokens=input_tokens,
130
155
  output_tokens=output_tokens,
@@ -132,6 +157,7 @@ class JsonStdoutCollector:
132
157
  cost_usd=cost_usd_raw,
133
158
  cost_model=cost_model,
134
159
  cost_source=cost_source,
160
+ tool_call_count=tool_call_count,
135
161
  )
136
162
 
137
163
 
@@ -185,8 +185,17 @@ def run_experiment(
185
185
 
186
186
  save_config_results(experiment_dir, exp_config.label, results)
187
187
 
188
- passed = sum(1 for r in results if r.automated_score >= 1.0)
189
- logger.info("[%s] %d/%d passed", exp_config.label, passed, len(results))
188
+ scoring = sum(1 for r in results if r.automated_score > 0.0)
189
+ mean = (
190
+ sum(r.automated_score for r in results) / len(results) if results else 0.0
191
+ )
192
+ logger.info(
193
+ "[%s] %d/%d scored (mean=%.2f)",
194
+ exp_config.label,
195
+ scoring,
196
+ len(results),
197
+ mean,
198
+ )
190
199
 
191
200
  all_config_results.append(
192
201
  ConfigResults(config=exp_config.label, completed=results)
@@ -69,6 +69,16 @@ _TEST_GLOBS: list[str] = [
69
69
  "*.spec.js",
70
70
  ]
71
71
 
72
+ # Recursive variants for repos with nested test layouts (e.g. numpy/_core/tests/).
73
+ _RECURSIVE_TEST_DIR_GLOBS: list[str] = [
74
+ "**/tests/**",
75
+ "**/test/**",
76
+ "**/spec/**",
77
+ "**/__tests__/**",
78
+ ]
79
+
80
+ _RECURSIVE_TEST_FILE_GLOBS: list[str] = [f"**/{p}" for p in _TEST_GLOBS]
81
+
72
82
  # ---------------------------------------------------------------------------
73
83
  # Fixed rubric — model scores against these, doesn't invent them
74
84
  # ---------------------------------------------------------------------------
@@ -217,16 +227,20 @@ def _detect_primary_languages(file_list: str) -> list[str]:
217
227
 
218
228
 
219
229
  def _has_tests(repo_path: Path) -> bool:
220
- """Check whether the repo appears to contain tests."""
230
+ """Check whether the repo appears to contain tests.
231
+
232
+ Checks top-level test directories first, then falls back to recursive
233
+ git ls-files glob patterns to catch repos with nested test layouts
234
+ (e.g. numpy/_core/tests/, numpy/tests/).
235
+ """
236
+ # Fast path: top-level test directories
221
237
  for d in _TEST_DIRS:
222
238
  if (repo_path / d).is_dir():
223
239
  return True
224
- # Check for test files via git ls-files
225
- for pattern in _TEST_GLOBS:
226
- out = _run_git(["ls-files", "--", pattern], cwd=repo_path)
227
- if out:
228
- return True
229
- return False
240
+ # Single git ls-files call with all patterns (top-level + recursive)
241
+ all_patterns = _TEST_GLOBS + _RECURSIVE_TEST_DIR_GLOBS + _RECURSIVE_TEST_FILE_GLOBS
242
+ out = _run_git(["ls-files", "--", *all_patterns], cwd=repo_path)
243
+ return bool(out)
230
244
 
231
245
 
232
246
  def _has_ci(repo_path: Path) -> bool: