codeprobe 0.5.2__tar.gz → 0.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226)
  1. {codeprobe-0.5.2 → codeprobe-0.5.4}/PKG-INFO +1 -1
  2. {codeprobe-0.5.2 → codeprobe-0.5.4}/pyproject.toml +1 -1
  3. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/claude.py +52 -4
  4. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/protocol.py +14 -1
  5. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/telemetry.py +86 -5
  6. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/stats.py +25 -1
  7. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/api.py +2 -0
  8. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/__init__.py +28 -0
  9. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/experiment_cmd.py +4 -0
  10. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/run_cmd.py +2 -0
  11. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/executor.py +1 -0
  12. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/experiment.py +4 -0
  13. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/experiment.py +16 -1
  14. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/PKG-INFO +1 -1
  15. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_adapters.py +162 -0
  16. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_stats.py +57 -0
  17. {codeprobe-0.5.2 → codeprobe-0.5.4}/LICENSE +0 -0
  18. {codeprobe-0.5.2 → codeprobe-0.5.4}/README.md +0 -0
  19. {codeprobe-0.5.2 → codeprobe-0.5.4}/setup.cfg +0 -0
  20. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/__init__.py +0 -0
  21. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/__main__.py +0 -0
  22. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/acceptance_compiler.py +0 -0
  23. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/__init__.py +0 -0
  24. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/_base.py +0 -0
  25. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/codex.py +0 -0
  26. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/copilot.py +0 -0
  27. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/openai_compat.py +0 -0
  28. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/session.py +0 -0
  29. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/__init__.py +0 -0
  30. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/dual.py +0 -0
  31. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/ranking.py +0 -0
  32. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/report.py +0 -0
  33. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/assess/__init__.py +0 -0
  34. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/assess/heuristics.py +0 -0
  35. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/assess/oracle_diff.py +0 -0
  36. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/assess_cmd.py +0 -0
  37. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/auth_cmd.py +0 -0
  38. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/doctor_cmd.py +0 -0
  39. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/init_cmd.py +0 -0
  40. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/interpret_cmd.py +0 -0
  41. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/json_display.py +0 -0
  42. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/mine_cmd.py +0 -0
  43. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/preamble_cmd.py +0 -0
  44. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/probe_cmd.py +0 -0
  45. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/ratings_cmd.py +0 -0
  46. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/rich_display.py +0 -0
  47. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  48. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/validate_cmd.py +0 -0
  49. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/wizard.py +0 -0
  50. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/yaml_writer.py +0 -0
  51. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/config/__init__.py +0 -0
  52. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/config/loader.py +0 -0
  53. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/config/redact.py +0 -0
  54. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/__init__.py +0 -0
  55. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/_shared.py +0 -0
  56. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/adaptive.py +0 -0
  57. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/counterfactual.py +0 -0
  58. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/debate.py +0 -0
  59. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/decision_tree.py +0 -0
  60. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/elo.py +0 -0
  61. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/fingerprint.py +0 -0
  62. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/mutation.py +0 -0
  63. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/pareto.py +0 -0
  64. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/sprt.py +0 -0
  65. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/tournament.py +0 -0
  66. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/__init__.py +0 -0
  67. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/__main__.py +0 -0
  68. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/checkpoint.py +0 -0
  69. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/events.py +0 -0
  70. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/isolation.py +0 -0
  71. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/llm.py +0 -0
  72. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/mcp_discovery.py +0 -0
  73. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/preamble.py +0 -0
  74. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/registry.py +0 -0
  75. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/repo_hygiene.py +0 -0
  76. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/sandbox.py +0 -0
  77. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/scoring.py +0 -0
  78. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/loaders/__init__.py +0 -0
  79. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/loaders/suite.py +0 -0
  80. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/__init__.py +0 -0
  81. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/_graph.py +0 -0
  82. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/_lang.py +0 -0
  83. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/comprehension.py +0 -0
  84. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/comprehension_writer.py +0 -0
  85. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/curator.py +0 -0
  86. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/curator_backends.py +0 -0
  87. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/curator_tiers.py +0 -0
  88. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/extractor.py +0 -0
  89. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/multi_repo.py +0 -0
  90. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale.py +0 -0
  91. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_families.py +0 -0
  92. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  93. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_scanner.py +0 -0
  94. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_validate.py +0 -0
  95. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/sg_auth.py +0 -0
  96. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/sg_ground_truth.py +0 -0
  97. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/sources.py +0 -0
  98. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/task_types.py +0 -0
  99. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/writer.py +0 -0
  100. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/__init__.py +0 -0
  101. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/evalrc.py +0 -0
  102. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/preamble.py +0 -0
  103. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/suite.py +0 -0
  104. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/task.py +0 -0
  105. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/preambles/__init__.py +0 -0
  106. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/preambles/github.md +0 -0
  107. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/preambles/sourcegraph.md +0 -0
  108. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/__init__.py +0 -0
  109. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/adapter.py +0 -0
  110. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/generator.py +0 -0
  111. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/writer.py +0 -0
  112. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/ratings/__init__.py +0 -0
  113. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/ratings/collector.py +0 -0
  114. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/scaffold/__init__.py +0 -0
  115. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/scaffold/writer.py +0 -0
  116. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/__init__.py +0 -0
  117. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
  118. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  119. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  120. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/SOURCES.txt +0 -0
  121. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  122. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/entry_points.txt +0 -0
  123. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/requires.txt +0 -0
  124. {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/top_level.txt +0 -0
  125. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_acceptance_compiler.py +0 -0
  126. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_acceptance_compiler_integration.py +0 -0
  127. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_adapter_contracts.py +0 -0
  128. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_analysis.py +0 -0
  129. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_api.py +0 -0
  130. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_artifact_scorer.py +0 -0
  131. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_assess.py +0 -0
  132. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_auth_cmd.py +0 -0
  133. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_changed_symbols.py +0 -0
  134. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_checkpoint.py +0 -0
  135. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_checkpoint_scoring.py +0 -0
  136. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_cli.py +0 -0
  137. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_comprehension.py +0 -0
  138. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_config_loader.py +0 -0
  139. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_contrib.py +0 -0
  140. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_convergence.py +0 -0
  141. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_criteria_loader.py +0 -0
  142. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ctrlc_integration.py +0 -0
  143. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_backends.py +0 -0
  144. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_core.py +0 -0
  145. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_integration.py +0 -0
  146. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_tiers.py +0 -0
  147. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_doctor_cmd.py +0 -0
  148. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_adversarial_fixes.py +0 -0
  149. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_composite.py +0 -0
  150. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_e2e.py +0 -0
  151. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_matrix.py +0 -0
  152. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_scorer.py +0 -0
  153. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_scoring_details.py +0 -0
  154. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_events.py +0 -0
  155. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_examples_dual.py +0 -0
  156. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_executor.py +0 -0
  157. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_executor_dual_isolation.py +0 -0
  158. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_executor_events.py +0 -0
  159. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_experiment_cmd.py +0 -0
  160. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_experiment_core.py +0 -0
  161. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ground_truth_schema.py +0 -0
  162. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_init_wizard.py +0 -0
  163. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_isolation.py +0 -0
  164. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_json_display.py +0 -0
  165. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_listeners_dual.py +0 -0
  166. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_llm.py +0 -0
  167. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_loader.py +0 -0
  168. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_loaders.py +0 -0
  169. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_loaders_dual.py +0 -0
  170. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mcp_families_mining.py +0 -0
  171. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mcp_validate.py +0 -0
  172. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_cli.py +0 -0
  173. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_goals.py +0 -0
  174. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_presets.py +0 -0
  175. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_profiles.py +0 -0
  176. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mining.py +0 -0
  177. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mining_dual.py +0 -0
  178. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_models.py +0 -0
  179. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_multi_repo_e2e.py +0 -0
  180. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_multi_repo_mining.py +0 -0
  181. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_new_families.py +0 -0
  182. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_openai_compat.py +0 -0
  183. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_oracle_diff.py +0 -0
  184. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_oracle_registry.py +0 -0
  185. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_oracle_types.py +0 -0
  186. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_org_scale.py +0 -0
  187. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_pipeline_integration.py +0 -0
  188. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_preamble.py +0 -0
  189. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_preamble_cmd.py +0 -0
  190. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_probe.py +0 -0
  191. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_probe_adapter.py +0 -0
  192. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ratings.py +0 -0
  193. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ratings_cmd.py +0 -0
  194. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_registry.py +0 -0
  195. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_regression_gate.py +0 -0
  196. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_release_gate.py +0 -0
  197. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_repo_hygiene.py +0 -0
  198. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_report_dual.py +0 -0
  199. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_run_config_resolution.py +0 -0
  200. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_safe_leg_score.py +0 -0
  201. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scaffold.py +0 -0
  202. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scaffold_upgrade.py +0 -0
  203. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scanner_refactor.py +0 -0
  204. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_score_result.py +0 -0
  205. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scoring.py +0 -0
  206. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scoring_extended.py +0 -0
  207. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scoring_v2.py +0 -0
  208. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_sdlc_ground_truth.py +0 -0
  209. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_secret_redaction.py +0 -0
  210. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_session.py +0 -0
  211. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_sg_auth.py +0 -0
  212. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_sg_ground_truth.py +0 -0
  213. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_shell_shim.py +0 -0
  214. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_show_prompt.py +0 -0
  215. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_suite.py +0 -0
  216. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_suite_manifest.py +0 -0
  217. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_task_model.py +0 -0
  218. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_task_types.py +0 -0
  219. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_telemetry.py +0 -0
  220. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_validate_cmd.py +0 -0
  221. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_validate_dual.py +0 -0
  222. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_verifier.py +0 -0
  223. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_verify.py +0 -0
  224. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_weighted_checklist.py +0 -0
  225. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_weighted_f1.py +0 -0
  226. {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_writer_dual.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeprobe
3
- Version: 0.5.2
3
+ Version: 0.5.4
4
4
  Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
5
  Author: codeprobe contributors
6
6
  License-Expression: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "codeprobe"
3
- version = "0.5.2"
3
+ version = "0.5.4"
4
4
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -243,7 +243,13 @@ class ClaudeAdapter(BaseAdapter):
243
243
 
244
244
  def build_command(self, prompt: str, config: AgentConfig) -> list[str]:
245
245
  binary = self._require_binary()
246
- cmd = [binary, "-p", prompt, "--output-format", "json"]
246
+ # stream-json + --verbose emits newline-delimited events including
247
+ # every assistant message (with tool_use content blocks) and ends
248
+ # with a ``type: "result"`` event mirroring the ``json`` envelope.
249
+ # This is what gives us accurate per-run tool_call_count and
250
+ # per-tool observability; the collector reconstructs the envelope
251
+ # from the terminal event.
252
+ cmd = [binary, "-p", prompt, "--output-format", "stream-json", "--verbose"]
247
253
 
248
254
  if config.model:
249
255
  cmd.extend(["--model", _normalize_model_for_cli(config.model)])
@@ -262,6 +268,27 @@ class ClaudeAdapter(BaseAdapter):
262
268
  if mcp_path:
263
269
  cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
264
270
 
271
+ # Tool restrictions. Claude CLI has three related flags:
272
+ # --tools "" disables all built-in tools
273
+ # --allowedTools X,Y auto-approves these tools (no permission
274
+ # prompt); names may include MCP tools as
275
+ # ``mcp__<server>__<tool>``
276
+ # --disallowedTools X,Y blocks these tools outright
277
+ # We treat ``allowed_tools`` as a whitelist: when set, built-ins
278
+ # are disabled (``--tools ""``) and listed names are auto-approved
279
+ # (``--allowedTools``). This yields true MCP-only runs when the
280
+ # whitelist contains only ``mcp__*`` names — verified against
281
+ # claude 2.1.x: without auto-approval the agent hits permission
282
+ # prompts and ends the turn early.
283
+ if config.allowed_tools is not None:
284
+ cmd.extend(["--tools", ""])
285
+ if config.allowed_tools:
286
+ cmd.extend(["--allowedTools", ",".join(config.allowed_tools)])
287
+ if config.disallowed_tools:
288
+ cmd.extend(
289
+ ["--disallowedTools", ",".join(config.disallowed_tools)]
290
+ )
291
+
265
292
  return cmd
266
293
 
267
294
  def isolate_session(self, slot_id: int) -> dict[str, str]:
@@ -290,15 +317,35 @@ class ClaudeAdapter(BaseAdapter):
290
317
  return {}
291
318
 
292
319
  def parse_output(self, result: subprocess.CompletedProcess[str], duration: float) -> AgentOutput:
293
- """Parse Claude CLI JSON envelope into AgentOutput."""
320
+ """Parse Claude CLI JSON envelope into AgentOutput.
321
+
322
+ Handles both ``--output-format json`` (single envelope) and
323
+ ``--output-format stream-json --verbose`` (newline-delimited
324
+ events) — the collector auto-detects. When parsing a stream, the
325
+ final ``type: "result"`` event carries the same fields as the
326
+ single-envelope shape, so we reconstruct ``result`` text from it.
327
+ """
294
328
  usage = self._collector.collect(result.stdout)
295
329
 
296
- # Extract content text from the JSON envelope
330
+ # Extract content text. For stream-json, the terminal result event
331
+ # has a ``result`` field; iterate events to find it. For single
332
+ # envelope, json.loads works directly.
333
+ stdout_text = result.stdout
297
334
  try:
298
335
  envelope = json.loads(result.stdout)
299
336
  stdout_text = envelope.get("result", result.stdout)
300
337
  except (json.JSONDecodeError, ValueError):
301
- stdout_text = result.stdout
338
+ for line in reversed(result.stdout.splitlines()):
339
+ line = line.strip()
340
+ if not line:
341
+ continue
342
+ try:
343
+ ev = json.loads(line)
344
+ except (json.JSONDecodeError, ValueError):
345
+ continue
346
+ if isinstance(ev, dict) and ev.get("type") == "result":
347
+ stdout_text = ev.get("result", result.stdout)
348
+ break
302
349
 
303
350
  return AgentOutput(
304
351
  stdout=stdout_text,
@@ -313,4 +360,5 @@ class ClaudeAdapter(BaseAdapter):
313
360
  cost_source=usage.cost_source,
314
361
  error=usage.error,
315
362
  tool_call_count=usage.tool_call_count,
363
+ tool_use_by_name=usage.tool_use_by_name,
316
364
  )
@@ -45,6 +45,9 @@ class AgentOutput:
45
45
  error: str | None = None
46
46
  cost_source: str = "unavailable"
47
47
  tool_call_count: int | None = None
48
+ # Per-tool usage counts (e.g. {"Read": 5, "mcp__sourcegraph__...": 2}).
49
+ # None when the adapter couldn't capture a streaming transcript.
50
+ tool_use_by_name: dict[str, int] | None = None
48
51
 
49
52
  def __post_init__(self) -> None:
50
53
  if self.cost_model not in ALLOWED_COST_MODELS:
@@ -63,12 +66,22 @@ class AgentOutput:
63
66
 
64
67
  @dataclass(frozen=True)
65
68
  class AgentConfig:
66
- """Configuration passed to an agent adapter."""
69
+ """Configuration passed to an agent adapter.
70
+
71
+ ``allowed_tools`` / ``disallowed_tools`` restrict which tools the agent
72
+ may call. When both are ``None`` the adapter uses its default tool set.
73
+ When ``allowed_tools`` is an empty list, the adapter disables all
74
+ built-in tools (useful for MCP-only experiments: MCP tools are still
75
+ available because they come from ``mcp_config``, but no built-in
76
+ ``Read``/``Grep``/``Bash``/etc. are).
77
+ """
67
78
 
68
79
  model: str | None = None
69
80
  permission_mode: str = "default"
70
81
  timeout_seconds: int = 3600
71
82
  mcp_config: dict | None = None
83
+ allowed_tools: list[str] | None = None
84
+ disallowed_tools: list[str] | None = None
72
85
  extra: dict | None = None
73
86
  cwd: str | None = None
74
87
 
@@ -66,6 +66,11 @@ class UsageData:
66
66
  cost_source: str = "unavailable"
67
67
  error: str | None = None
68
68
  tool_call_count: int | None = None
69
+ # Tool-use counts broken down by tool name (e.g. ``{"Read": 5,
70
+ # "mcp__sourcegraph__keyword_search": 2}``). Populated only when the
71
+ # adapter captured a streaming transcript. None means "not captured",
72
+ # not "no tool calls".
73
+ tool_use_by_name: dict[str, int] | None = None
69
74
 
70
75
  def __post_init__(self) -> None:
71
76
  if self.cost_model not in ALLOWED_COST_MODELS:
@@ -145,6 +150,45 @@ def _count_tool_use_blocks(envelope: dict[str, Any]) -> int | None:
145
150
  return count
146
151
 
147
152
 
153
+ def _parse_stream_json(raw_output: str) -> tuple[dict[str, Any] | None, int, dict[str, int]]:
154
+ """Parse a ``--output-format stream-json --verbose`` transcript.
155
+
156
+ Returns ``(result_event, tool_use_count, tool_use_by_name)``.
157
+ ``result_event`` is the final ``type: "result"`` event (same shape as
158
+ ``--output-format json`` envelope), or None when the stream is
159
+ malformed or has no terminal event. ``tool_use_by_name`` aggregates
160
+ tool-use block counts by tool name (including MCP tools, which appear
161
+ as ``mcp__<server>__<tool>``), useful for observability.
162
+ """
163
+ result_event: dict[str, Any] | None = None
164
+ tool_use_count = 0
165
+ by_name: dict[str, int] = {}
166
+ for line in raw_output.splitlines():
167
+ line = line.strip()
168
+ if not line:
169
+ continue
170
+ try:
171
+ ev = json.loads(line)
172
+ except (json.JSONDecodeError, ValueError):
173
+ continue
174
+ if not isinstance(ev, dict):
175
+ continue
176
+ if ev.get("type") == "assistant":
177
+ msg = ev.get("message")
178
+ if isinstance(msg, dict):
179
+ for block in msg.get("content", []) or []:
180
+ if not isinstance(block, dict):
181
+ continue
182
+ if block.get("type") == "tool_use":
183
+ tool_use_count += 1
184
+ name = block.get("name", "")
185
+ if isinstance(name, str) and name:
186
+ by_name[name] = by_name.get(name, 0) + 1
187
+ if ev.get("type") == "result":
188
+ result_event = ev
189
+ return result_event, tool_use_count, by_name
190
+
191
+
148
192
  class JsonStdoutCollector:
149
193
  """Extract telemetry from Claude CLI JSON envelope on stdout.
150
194
 
@@ -162,10 +206,40 @@ class JsonStdoutCollector:
162
206
  """
163
207
 
164
208
  def collect(self, raw_output: str, **context: Any) -> UsageData:
165
- try:
166
- envelope = json.loads(raw_output)
167
- except (json.JSONDecodeError, ValueError) as exc:
168
- return UsageData(error=f"JSON parse failed: {exc}")
209
+ # Two accepted shapes:
210
+ # 1. ``--output-format json`` — a single JSON envelope; no
211
+ # per-tool-use trace, so tool_call_count stays None.
212
+ # 2. ``--output-format stream-json --verbose`` — newline-delimited
213
+ # events ending in a ``type: "result"`` event that mirrors
214
+ # shape (1). We also count ``tool_use`` blocks across all
215
+ # ``assistant`` events for accurate tool_call_count.
216
+ stream_tool_count: int | None = None
217
+ stream_tool_by_name: dict[str, int] = {}
218
+ trimmed = raw_output.lstrip()
219
+ if trimmed.startswith("{\n") or trimmed.startswith("{"):
220
+ # Try single-envelope path first — most adapters still use
221
+ # ``--output-format json``.
222
+ try:
223
+ envelope = json.loads(raw_output)
224
+ if envelope.get("type") == "result" and "\n" in raw_output.rstrip():
225
+ # Ambiguous: looks like a single-line event from the
226
+ # stream. Fall through to stream parsing below.
227
+ raise ValueError("ambiguous envelope — retry as stream")
228
+ except (json.JSONDecodeError, ValueError):
229
+ envelope = None
230
+ else:
231
+ envelope = None
232
+ if envelope is None:
233
+ result_ev, stream_tool_count, stream_tool_by_name = _parse_stream_json(
234
+ raw_output
235
+ )
236
+ if result_ev is None:
237
+ return UsageData(
238
+ error="JSON parse failed: output is neither a valid "
239
+ "envelope nor a stream-json transcript ending in a "
240
+ "'result' event"
241
+ )
242
+ envelope = result_ev
169
243
 
170
244
  usage = envelope.get("usage")
171
245
  if usage is None:
@@ -197,7 +271,13 @@ class JsonStdoutCollector:
197
271
  cost_model = "unknown"
198
272
  cost_source = "unavailable"
199
273
 
200
- tool_call_count = _count_tool_use_blocks(envelope)
274
+ # Prefer stream-json count when the transcript was streamed — it's
275
+ # always present and accurate. Fall back to the envelope's
276
+ # ``messages`` array (when some future CLI flag surfaces it), else
277
+ # stays None.
278
+ tool_call_count = stream_tool_count
279
+ if tool_call_count is None:
280
+ tool_call_count = _count_tool_use_blocks(envelope)
201
281
 
202
282
  return UsageData(
203
283
  input_tokens=input_tokens,
@@ -207,6 +287,7 @@ class JsonStdoutCollector:
207
287
  cost_model=cost_model,
208
288
  cost_source=cost_source,
209
289
  tool_call_count=tool_call_count,
290
+ tool_use_by_name=stream_tool_by_name or None,
210
291
  error=envelope_error,
211
292
  )
212
293
 
@@ -583,7 +583,31 @@ def compare_configs(
583
583
  elif speed_diff > 0:
584
584
  parts.append(f"{speed_diff:.1f}s slower")
585
585
 
586
- summary = f"{a.label} vs {b.label}: {', '.join(parts)} " f"\u2192 {winner} wins"
586
+ # Soften the verdict when the effect is negligible or the test is
587
+ # underpowered, so we don't confidently declare a "winner" on what may
588
+ # be noise. Thresholds:
589
+ # Cohen's d: |d| < 0.2 is "negligible" (Cohen 1988).
590
+ # Cliff's delta: |delta| < 0.147 is "negligible" (Romano et al. 2006).
591
+ # p-value > 0.05: not significant at the conventional threshold.
592
+ scores_tied = abs(score_diff) < 0.01
593
+ negligible_threshold = 0.2 if eff_method == "cohens_d" else 0.147
594
+ small_effect = (
595
+ eff_size is not None and abs(eff_size) < negligible_threshold
596
+ )
597
+ not_significant = p_val is not None and p_val > 0.05
598
+
599
+ if scores_tied:
600
+ verdict = "effectively tied"
601
+ elif small_effect and not_significant:
602
+ verdict = f"{winner} nominally ahead (not significant; small effect)"
603
+ elif small_effect:
604
+ verdict = f"{winner} nominally ahead (small effect size)"
605
+ elif not_significant:
606
+ verdict = f"{winner} nominally ahead (not significant at p=0.05)"
607
+ else:
608
+ verdict = f"{winner} wins"
609
+
610
+ summary = f"{a.label} vs {b.label}: {', '.join(parts)} \u2192 {verdict}"
587
611
 
588
612
  return PairwiseComparison(
589
613
  config_a=a.label,
@@ -151,6 +151,8 @@ def run_experiment(
151
151
  permission_mode=perm,
152
152
  timeout_seconds=timeout,
153
153
  mcp_config=exp_config.mcp_config,
154
+ allowed_tools=exp_config.allowed_tools,
155
+ disallowed_tools=exp_config.disallowed_tools,
154
156
  cwd=str(experiment_dir.resolve()),
155
157
  )
156
158
 
@@ -794,6 +794,23 @@ def init_experiment(
794
794
  "Built-ins: sourcegraph, github. Or path to a custom .md file."
795
795
  ),
796
796
  )
797
+ @click.option(
798
+ "--allowed-tools",
799
+ default=None,
800
+ help=(
801
+ "Restrict the agent to this comma-separated list of built-in "
802
+ "tool names (e.g. 'Read,Grep'). Pass an empty string ('') to "
803
+ "disable all built-in tools for an MCP-only comparison."
804
+ ),
805
+ )
806
+ @click.option(
807
+ "--disallowed-tools",
808
+ default=None,
809
+ help=(
810
+ "Block the agent from these comma-separated built-in tool names "
811
+ "(e.g. 'Bash,Write'). Applies on top of --allowed-tools."
812
+ ),
813
+ )
797
814
  def add_config(
798
815
  path: str,
799
816
  label: str,
@@ -803,10 +820,19 @@ def add_config(
803
820
  mcp_config: str | None,
804
821
  instruction_variant: str | None,
805
822
  preambles: tuple[str, ...],
823
+ allowed_tools: str | None,
824
+ disallowed_tools: str | None,
806
825
  ) -> None:
807
826
  """Add a configuration to an existing experiment."""
808
827
  from codeprobe.cli.experiment_cmd import experiment_add_config
809
828
 
829
+ # Parse comma-separated tool lists. An empty string means "MCP-only":
830
+ # disable all built-in tools. None means "adapter default".
831
+ def _parse_tools(raw: str | None) -> list[str] | None:
832
+ if raw is None:
833
+ return None
834
+ return [t.strip() for t in raw.split(",") if t.strip()]
835
+
810
836
  experiment_add_config(
811
837
  path,
812
838
  label=label,
@@ -816,6 +842,8 @@ def add_config(
816
842
  mcp_config_str=mcp_config,
817
843
  instruction_variant=instruction_variant,
818
844
  preambles=preambles,
845
+ allowed_tools=_parse_tools(allowed_tools),
846
+ disallowed_tools=_parse_tools(disallowed_tools),
819
847
  )
820
848
 
821
849
 
@@ -142,6 +142,8 @@ def experiment_add_config(
142
142
  mcp_config_str: str | None,
143
143
  instruction_variant: str | None = None,
144
144
  preambles: tuple[str, ...] = (),
145
+ allowed_tools: list[str] | None = None,
146
+ disallowed_tools: list[str] | None = None,
145
147
  ) -> None:
146
148
  """Add a configuration to an existing experiment."""
147
149
  exp_dir = Path(path)
@@ -191,6 +193,8 @@ def experiment_add_config(
191
193
  mcp_config=mcp_config,
192
194
  instruction_variant=instruction_variant,
193
195
  preambles=preambles,
196
+ allowed_tools=allowed_tools,
197
+ disallowed_tools=disallowed_tools,
194
198
  )
195
199
 
196
200
  # Validate the label is a safe path component
@@ -482,6 +482,8 @@ def run_eval(
482
482
  permission_mode=perm,
483
483
  timeout_seconds=resolved_timeout,
484
484
  mcp_config=exp_config.mcp_config,
485
+ allowed_tools=exp_config.allowed_tools,
486
+ disallowed_tools=exp_config.disallowed_tools,
485
487
  cwd=str(repo_root),
486
488
  )
487
489
 
@@ -387,6 +387,7 @@ def execute_task(
387
387
  cost_model=output.cost_model,
388
388
  cost_source=output.cost_source,
389
389
  tool_call_count=output.tool_call_count,
390
+ tool_use_by_name=output.tool_use_by_name,
390
391
  )
391
392
 
392
393
  # For oracle tasks, the agent writes answer.txt / answer.json to the
@@ -98,7 +98,11 @@ def load_experiment(exp_dir: Path) -> Experiment:
98
98
  model=c.get("model"),
99
99
  permission_mode=c.get("permission_mode", "default"),
100
100
  mcp_config=c.get("mcp_config"),
101
+ allowed_tools=c.get("allowed_tools"),
102
+ disallowed_tools=c.get("disallowed_tools"),
101
103
  instruction_variant=c.get("instruction_variant"),
104
+ preambles=tuple(c.get("preambles", ())),
105
+ reward_type=c.get("reward_type", "binary"),
102
106
  extra=c.get("extra", {}),
103
107
  )
104
108
  for c in data.get("configs", [])
@@ -8,13 +8,23 @@ from typing import Any
8
8
 
9
9
  @dataclass(frozen=True)
10
10
  class ExperimentConfig:
11
- """A single configuration to evaluate (e.g., 'baseline' or 'with-mcp')."""
11
+ """A single configuration to evaluate (e.g., 'baseline' or 'with-mcp').
12
+
13
+ ``allowed_tools`` / ``disallowed_tools`` restrict which tools the
14
+ agent is allowed to call during this config's runs. Semantics mirror
15
+ the underlying CLI (Claude's ``--allowedTools`` / ``--disallowedTools``
16
+ / ``--tools``). Set ``allowed_tools=[]`` to disable all built-in tools
17
+ for an MCP-only comparison — MCP tools are still reachable because
18
+ they come from ``mcp_config``.
19
+ """
12
20
 
13
21
  label: str
14
22
  agent: str = "claude"
15
23
  model: str | None = None
16
24
  permission_mode: str = "default"
17
25
  mcp_config: dict | None = None
26
+ allowed_tools: list[str] | None = None
27
+ disallowed_tools: list[str] | None = None
18
28
  instruction_variant: str | None = None
19
29
  preambles: tuple[str, ...] = ()
20
30
  reward_type: str = "binary"
@@ -29,6 +39,8 @@ class ExperimentConfig:
29
39
  f"ExperimentConfig(label={self.label!r}, agent={self.agent!r}, "
30
40
  f"model={self.model!r}, permission_mode={self.permission_mode!r}, "
31
41
  f"mcp_config={redacted_mcp!r}, "
42
+ f"allowed_tools={self.allowed_tools!r}, "
43
+ f"disallowed_tools={self.disallowed_tools!r}, "
32
44
  f"instruction_variant={self.instruction_variant!r}, "
33
45
  f"preambles={self.preambles!r}, reward_type={self.reward_type!r}, "
34
46
  f"extra={self.extra!r})"
@@ -113,6 +125,9 @@ class CompletedTask:
113
125
  cost_model: str = "unknown"
114
126
  cost_source: str = "unavailable"
115
127
  tool_call_count: int | None = None
128
+ # Per-tool usage breakdown (e.g. {"Read": 5,
129
+ # "mcp__sourcegraph__keyword_search": 2}). None when not captured.
130
+ tool_use_by_name: dict[str, int] | None = None
116
131
  error_category: str | None = None
117
132
  scoring_details: dict = field(default_factory=dict)
118
133
  metadata: dict = field(default_factory=dict)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeprobe
3
- Version: 0.5.2
3
+ Version: 0.5.4
4
4
  Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
5
  Author: codeprobe contributors
6
6
  License-Expression: Apache-2.0
@@ -1824,3 +1824,165 @@ class TestClaudeModelNormalization:
1824
1824
  cmd = adapter.build_command("test", config)
1825
1825
  idx = cmd.index("--model")
1826
1826
  assert cmd[idx + 1] == "claude-sonnet-4-6"
1827
+
1828
+
1829
+ # ---------------------------------------------------------------------------
1830
+ # 0.5.4: tool-restriction flags + stream-json tool_use capture
1831
+ # ---------------------------------------------------------------------------
1832
+
1833
+
1834
+ class TestClaudeToolRestrictions:
1835
+ """Claude adapter wires AgentConfig.{allowed,disallowed}_tools to CLI."""
1836
+
1837
+ def test_allowed_tools_empty_list_maps_to_tools_empty(self) -> None:
1838
+ """``allowed_tools=[]`` means MCP-only → ``--tools ""``."""
1839
+ adapter = ClaudeAdapter()
1840
+ if not adapter.find_binary():
1841
+ pytest.skip("claude binary not available")
1842
+ config = AgentConfig(allowed_tools=[])
1843
+ cmd = adapter.build_command("test", config)
1844
+ # Should contain --tools followed immediately by empty string.
1845
+ assert "--tools" in cmd
1846
+ idx = cmd.index("--tools")
1847
+ assert cmd[idx + 1] == ""
1848
+
1849
+ def test_allowed_tools_nonempty_emits_both_flags(self) -> None:
1850
+ """Non-empty allowed_tools = whitelist. Adapter disables built-ins
1851
+ via --tools "" AND auto-approves listed names via --allowedTools,
1852
+ because in claude 2.1.x, --allowedTools alone doesn't restrict
1853
+ the available tool set (it just auto-approves) and without both
1854
+ flags the agent either burns turns on permission prompts or calls
1855
+ unlisted tools."""
1856
+ adapter = ClaudeAdapter()
1857
+ if not adapter.find_binary():
1858
+ pytest.skip("claude binary not available")
1859
+ config = AgentConfig(allowed_tools=["Read", "Grep"])
1860
+ cmd = adapter.build_command("test", config)
1861
+ assert "--tools" in cmd
1862
+ assert cmd[cmd.index("--tools") + 1] == ""
1863
+ assert "--allowedTools" in cmd
1864
+ assert cmd[cmd.index("--allowedTools") + 1] == "Read,Grep"
1865
+
1866
+ def test_disallowed_tools_maps_to_disallowedTools(self) -> None:
1867
+ adapter = ClaudeAdapter()
1868
+ if not adapter.find_binary():
1869
+ pytest.skip("claude binary not available")
1870
+ config = AgentConfig(disallowed_tools=["Bash", "Write"])
1871
+ cmd = adapter.build_command("test", config)
1872
+ assert "--disallowedTools" in cmd
1873
+ idx = cmd.index("--disallowedTools")
1874
+ assert cmd[idx + 1] == "Bash,Write"
1875
+
1876
+ def test_both_tool_restrictions_coexist(self) -> None:
1877
+ adapter = ClaudeAdapter()
1878
+ if not adapter.find_binary():
1879
+ pytest.skip("claude binary not available")
1880
+ config = AgentConfig(
1881
+ allowed_tools=["Read"], disallowed_tools=["Bash"]
1882
+ )
1883
+ cmd = adapter.build_command("test", config)
1884
+ assert "--allowedTools" in cmd
1885
+ assert "--disallowedTools" in cmd
1886
+
1887
+ def test_none_tool_restrictions_omit_flags(self) -> None:
1888
+ """Default behavior: no --tools / --allowedTools / --disallowedTools."""
1889
+ adapter = ClaudeAdapter()
1890
+ if not adapter.find_binary():
1891
+ pytest.skip("claude binary not available")
1892
+ config = AgentConfig()
1893
+ cmd = adapter.build_command("test", config)
1894
+ assert "--tools" not in cmd
1895
+ assert "--allowedTools" not in cmd
1896
+ assert "--disallowedTools" not in cmd
1897
+
1898
+ def test_stream_json_is_default_output_format(self) -> None:
1899
+ """Claude adapter switched to stream-json for tool_use capture."""
1900
+ adapter = ClaudeAdapter()
1901
+ if not adapter.find_binary():
1902
+ pytest.skip("claude binary not available")
1903
+ cmd = adapter.build_command("test", AgentConfig())
1904
+ assert "--output-format" in cmd
1905
+ idx = cmd.index("--output-format")
1906
+ assert cmd[idx + 1] == "stream-json"
1907
+ assert "--verbose" in cmd
1908
+
1909
+
1910
+ class TestStreamJsonToolUseCapture:
1911
+ """JsonStdoutCollector parses stream-json and counts tool_use blocks."""
1912
+
1913
+ def _make_stream(self, tool_names: list[str]) -> str:
1914
+ """Build a minimal stream-json transcript with given tool_use blocks."""
1915
+ import json as _json
1916
+
1917
+ lines = [
1918
+ _json.dumps({
1919
+ "type": "system", "subtype": "init",
1920
+ "mcp_servers": [{"name": "sourcegraph", "status": "connected"}],
1921
+ })
1922
+ ]
1923
+ for name in tool_names:
1924
+ lines.append(_json.dumps({
1925
+ "type": "assistant",
1926
+ "message": {"content": [{"type": "tool_use", "name": name}]},
1927
+ }))
1928
+ # Terminal result event carries the envelope-shape fields.
1929
+ lines.append(_json.dumps({
1930
+ "type": "result",
1931
+ "subtype": "success",
1932
+ "result": "Done.",
1933
+ "is_error": False,
1934
+ "usage": {
1935
+ "input_tokens": 10,
1936
+ "output_tokens": 20,
1937
+ "cache_read_input_tokens": 100,
1938
+ },
1939
+ "total_cost_usd": 0.05,
1940
+ }))
1941
+ return "\n".join(lines) + "\n"
1942
+
1943
+ def test_counts_all_tool_use_blocks(self) -> None:
1944
+ from codeprobe.adapters.telemetry import JsonStdoutCollector
1945
+
1946
+ stream = self._make_stream(["Read", "Grep", "Read", "Bash"])
1947
+ u = JsonStdoutCollector().collect(stream)
1948
+ assert u.tool_call_count == 4
1949
+ assert u.tool_use_by_name == {"Read": 2, "Grep": 1, "Bash": 1}
1950
+
1951
+ def test_counts_mcp_tool_names(self) -> None:
1952
+ """MCP tools show up as ``mcp__<server>__<tool>``; counted correctly."""
1953
+ from codeprobe.adapters.telemetry import JsonStdoutCollector
1954
+
1955
+ stream = self._make_stream([
1956
+ "Read", "mcp__sourcegraph__keyword_search",
1957
+ "mcp__sourcegraph__find_references",
1958
+ ])
1959
+ u = JsonStdoutCollector().collect(stream)
1960
+ assert u.tool_call_count == 3
1961
+ assert u.tool_use_by_name["mcp__sourcegraph__keyword_search"] == 1
1962
+ assert u.tool_use_by_name["mcp__sourcegraph__find_references"] == 1
1963
+
1964
+ def test_empty_stream_returns_no_tool_calls(self) -> None:
1965
+ from codeprobe.adapters.telemetry import JsonStdoutCollector
1966
+
1967
+ stream = self._make_stream([])
1968
+ u = JsonStdoutCollector().collect(stream)
1969
+ assert u.tool_call_count == 0
1970
+ assert u.tool_use_by_name is None # sentinel: nothing captured
1971
+
1972
+ def test_single_envelope_still_works(self) -> None:
1973
+ """Back-compat: legacy --output-format json single envelope parses."""
1974
+ from codeprobe.adapters.telemetry import JsonStdoutCollector
1975
+
1976
+ envelope = {
1977
+ "result": "ok",
1978
+ "usage": {
1979
+ "input_tokens": 5, "output_tokens": 10,
1980
+ "cache_read_input_tokens": 0,
1981
+ },
1982
+ "total_cost_usd": 0.01,
1983
+ }
1984
+ import json as _json
1985
+
1986
+ u = JsonStdoutCollector().collect(_json.dumps(envelope))
1987
+ assert u.input_tokens == 5
1988
+ assert u.tool_call_count is None # envelope has no messages