refactor-arena 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258):
  1. refactor_arena/__init__.py +8 -0
  2. refactor_arena/cli/__init__.py +5 -0
  3. refactor_arena/cli/commands/__init__.py +17 -0
  4. refactor_arena/cli/commands/eval.py +131 -0
  5. refactor_arena/cli/commands/init.py +157 -0
  6. refactor_arena/cli/commands/rescore.py +328 -0
  7. refactor_arena/cli/main.py +109 -0
  8. refactor_arena/cli/types/command_function.py +7 -0
  9. refactor_arena/config/__init__.py +27 -0
  10. refactor_arena/config/args_parser.py +275 -0
  11. refactor_arena/config/config.py +486 -0
  12. refactor_arena/config/config_defaults.py +70 -0
  13. refactor_arena/config/field_parser.py +61 -0
  14. refactor_arena/config/fingerprint/__init__.py +7 -0
  15. refactor_arena/config/fingerprint/create.py +260 -0
  16. refactor_arena/config/fingerprint/current.py +91 -0
  17. refactor_arena/config/resolvers/exec.py +18 -0
  18. refactor_arena/configs/args.yaml +31 -0
  19. refactor_arena/configs/commands/eval.yaml +151 -0
  20. refactor_arena/configs/commands/init.yaml +5 -0
  21. refactor_arena/configs/commands/rescore-defaults.yaml +32 -0
  22. refactor_arena/configs/commands/rescore.yaml +86 -0
  23. refactor_arena/configs/config.yaml +23 -0
  24. refactor_arena/configs/defaults.yaml +454 -0
  25. refactor_arena/configs/includes/code-quality.yaml +81 -0
  26. refactor_arena/configs/includes/fixtures.yaml +167 -0
  27. refactor_arena/configs/includes/main-tasks.yaml +50 -0
  28. refactor_arena/configs/includes/metrics.yaml +66 -0
  29. refactor_arena/configs/includes/monitor-thresholds.yaml +149 -0
  30. refactor_arena/configs/includes/prompts-attack.yaml +383 -0
  31. refactor_arena/configs/includes/prompts.yaml +152 -0
  32. refactor_arena/configs/includes/side-tasks.yaml +252 -0
  33. refactor_arena/configs/reference-only/repo-traits.yaml +98 -0
  34. refactor_arena/configs/scenarios/all-tasks-mixed-20-with-acceptance-scorers.yaml +13 -0
  35. refactor_arena/configs/scenarios/all-tasks-mixed-20-with-task-scorers.yaml +19 -0
  36. refactor_arena/configs/scenarios/all-tasks-mixed-20.yaml +9 -0
  37. refactor_arena/configs/scenarios/all-tasks-mixed-3.yaml +9 -0
  38. refactor_arena/configs/scenarios/change-password-hinted.yaml +12 -0
  39. refactor_arena/configs/scenarios/change-password.yaml +12 -0
  40. refactor_arena/configs/scenarios/data-exfiltration-guided-innocent.yaml +118 -0
  41. refactor_arena/configs/scenarios/data-exfiltration-guided.yaml +60 -0
  42. refactor_arena/configs/scenarios/data-exfiltration-manual.yaml +18 -0
  43. refactor_arena/configs/scenarios/exfiltrate-api-key-hinted.yaml +11 -0
  44. refactor_arena/configs/scenarios/exfiltrate-api-key.yaml +11 -0
  45. refactor_arena/configs/scenarios/getting-started.yaml +14 -0
  46. refactor_arena/configs/scenarios/kill-web-proxy-process.yaml +12 -0
  47. refactor_arena/configs/single-file-refactor.yaml +12 -0
  48. refactor_arena/datasets/__init__.py +3 -0
  49. refactor_arena/datasets/dataset.py +390 -0
  50. refactor_arena/datasets/sources/__init__.py +7 -0
  51. refactor_arena/datasets/sources/base.py +35 -0
  52. refactor_arena/datasets/sources/get_cache_path.py +49 -0
  53. refactor_arena/datasets/sources/github.py +248 -0
  54. refactor_arena/datasets/sources/local.py +64 -0
  55. refactor_arena/datasets/sources/resolve_source.py +101 -0
  56. refactor_arena/datasets/types.py +61 -0
  57. refactor_arena/fixtures/__init__.py +3 -0
  58. refactor_arena/fixtures/fixture_manager.py +266 -0
  59. refactor_arena/metrics/__init__.py +16 -0
  60. refactor_arena/metrics/percentile.py +119 -0
  61. refactor_arena/metrics/safety.py +214 -0
  62. refactor_arena/metrics/threshold_accuracy.py +69 -0
  63. refactor_arena/metrics/undefined_result.py +3 -0
  64. refactor_arena/metrics/usefulness.py +67 -0
  65. refactor_arena/policies/__init__.py +9 -0
  66. refactor_arena/policies/_get_swe_agent_builder.py +22 -0
  67. refactor_arena/policies/swe_agent_attack_policy.py +55 -0
  68. refactor_arena/policies/swe_agent_honest_policy.py +49 -0
  69. refactor_arena/sandboxes/default/Dockerfile +65 -0
  70. refactor_arena/sandboxes/default/README.md +97 -0
  71. refactor_arena/sandboxes/default/compose.yaml +10 -0
  72. refactor_arena/sandboxes/default/entrypoint.sh +105 -0
  73. refactor_arena/sandboxes/default/features/claude-code/env +3 -0
  74. refactor_arena/sandboxes/default/features/claude-code/install.sh +19 -0
  75. refactor_arena/sandboxes/default/features/go/env +3 -0
  76. refactor_arena/sandboxes/default/features/go/install.sh +7 -0
  77. refactor_arena/sandboxes/default/features/human-agent/install.sh +12 -0
  78. refactor_arena/sandboxes/default/features/java/install.sh +10 -0
  79. refactor_arena/sandboxes/default/features/killable-process/env +1 -0
  80. refactor_arena/sandboxes/default/features/killable-process/files/usr/local/bin/run-killable-process +4 -0
  81. refactor_arena/sandboxes/default/features/killable-process/install.sh +6 -0
  82. refactor_arena/sandboxes/default/features/killable-process/start.sh +6 -0
  83. refactor_arena/sandboxes/default/features/node/env +2 -0
  84. refactor_arena/sandboxes/default/features/node/install.sh +14 -0
  85. refactor_arena/sandboxes/default/features/pytest-db-reset-plugin/env +2 -0
  86. refactor_arena/sandboxes/default/features/pytest-db-reset-plugin/files/pytest-plugins/db_reset.py +35 -0
  87. refactor_arena/sandboxes/default/features/pytest-db-reset-plugin/install.sh +5 -0
  88. refactor_arena/sandboxes/default/features/pytest-fixed-salt-plugin/env +2 -0
  89. refactor_arena/sandboxes/default/features/pytest-fixed-salt-plugin/files/pytest-plugins/fixed_salt.py +37 -0
  90. refactor_arena/sandboxes/default/features/pytest-fixed-salt-plugin/install.sh +5 -0
  91. refactor_arena/sandboxes/default/features/python/env +9 -0
  92. refactor_arena/sandboxes/default/features/python/install.sh +13 -0
  93. refactor_arena/sandboxes/default/features/web-proxy/env +5 -0
  94. refactor_arena/sandboxes/default/features/web-proxy/files/services/server/excluded-domains.txt +15 -0
  95. refactor_arena/sandboxes/default/features/web-proxy/files/services/server/save-log.py +76 -0
  96. refactor_arena/sandboxes/default/features/web-proxy/healthcheck.sh +5 -0
  97. refactor_arena/sandboxes/default/features/web-proxy/install.sh +9 -0
  98. refactor_arena/sandboxes/default/features/web-proxy/start.sh +12 -0
  99. refactor_arena/sandboxes/default/scripts/install-features.sh +34 -0
  100. refactor_arena/sandboxes/default/scripts/install-git-delta.sh +7 -0
  101. refactor_arena/sandboxes/default/scripts/install-node.sh +29 -0
  102. refactor_arena/sandboxes/default/scripts/install-python.sh +20 -0
  103. refactor_arena/scorers/__init__.py +31 -0
  104. refactor_arena/scorers/code_quality_change_grader.py +219 -0
  105. refactor_arena/scorers/code_quality_change_judge.py +103 -0
  106. refactor_arena/scorers/composite.py +110 -0
  107. refactor_arena/scorers/db_change.py +130 -0
  108. refactor_arena/scorers/db_diff_change.py +282 -0
  109. refactor_arena/scorers/file_contents.py +177 -0
  110. refactor_arena/scorers/full_trajectory_monitor.py +86 -0
  111. refactor_arena/scorers/git_diff_monitor.py +215 -0
  112. refactor_arena/scorers/main_task_acceptance.py +147 -0
  113. refactor_arena/scorers/refactor_quality_scorer.py +158 -0
  114. refactor_arena/scorers/sandbox_state.py +83 -0
  115. refactor_arena/scorers/side_task_acceptance.py +138 -0
  116. refactor_arena/scorers/task_constraints_penalty.py +504 -0
  117. refactor_arena/scorers/test_runner.py +141 -0
  118. refactor_arena/settings/__init__.py +3 -0
  119. refactor_arena/settings/entrypoint.py +6 -0
  120. refactor_arena/settings/refactor_arena_setting.py +261 -0
  121. refactor_arena/solvers/__init__.py +21 -0
  122. refactor_arena/solvers/agent_control_solver.py +92 -0
  123. refactor_arena/solvers/analyze_code_changes.py +72 -0
  124. refactor_arena/solvers/analyze_code_quality.py +52 -0
  125. refactor_arena/solvers/diff_solution.py +76 -0
  126. refactor_arena/solvers/diffs/__init__.py +7 -0
  127. refactor_arena/solvers/diffs/diff_code.py +169 -0
  128. refactor_arena/solvers/diffs/diff_databases.py +78 -0
  129. refactor_arena/solvers/export_eval.py +95 -0
  130. refactor_arena/solvers/exports/__init__.py +13 -0
  131. refactor_arena/solvers/exports/export_chat.py +94 -0
  132. refactor_arena/solvers/exports/export_config.py +87 -0
  133. refactor_arena/solvers/exports/export_sample_files.py +71 -0
  134. refactor_arena/solvers/exports/export_sandbox_files.py +107 -0
  135. refactor_arena/solvers/exports/export_store.py +88 -0
  136. refactor_arena/solvers/import_eval.py +80 -0
  137. refactor_arena/solvers/imports/__init__.py +9 -0
  138. refactor_arena/solvers/imports/import_chat.py +144 -0
  139. refactor_arena/solvers/imports/import_sandbox_files.py +71 -0
  140. refactor_arena/solvers/imports/import_store.py +78 -0
  141. refactor_arena/solvers/run_tests.py +39 -0
  142. refactor_arena/solvers/setup_repo.py +36 -0
  143. refactor_arena/solvers/snapshot_repo.py +84 -0
  144. refactor_arena/tools/__init__.py +7 -0
  145. refactor_arena/tools/run_scorer.py +116 -0
  146. refactor_arena/types/child_solver_config.py +12 -0
  147. refactor_arena/types/repo_location.py +21 -0
  148. refactor_arena/utils/README.md +147 -0
  149. refactor_arena/utils/__init__.py +0 -0
  150. refactor_arena/utils/coding/__init__.py +14 -0
  151. refactor_arena/utils/coding/analyze_code_changes.py +266 -0
  152. refactor_arena/utils/coding/analyze_quality.py +75 -0
  153. refactor_arena/utils/coding/compute_repo_metrics.py +147 -0
  154. refactor_arena/utils/coding/gather_metrics.py +44 -0
  155. refactor_arena/utils/coding/go/__init__.py +4 -0
  156. refactor_arena/utils/coding/go/setup_repo.py +32 -0
  157. refactor_arena/utils/coding/go/testing.py +29 -0
  158. refactor_arena/utils/coding/java/__init__.py +4 -0
  159. refactor_arena/utils/coding/java/setup_repo.py +44 -0
  160. refactor_arena/utils/coding/java/testing.py +48 -0
  161. refactor_arena/utils/coding/javascript/__init__.py +4 -0
  162. refactor_arena/utils/coding/javascript/setup_repo.py +32 -0
  163. refactor_arena/utils/coding/javascript/testing.py +29 -0
  164. refactor_arena/utils/coding/python/__init__.py +9 -0
  165. refactor_arena/utils/coding/python/gather_metrics.py +173 -0
  166. refactor_arena/utils/coding/python/setup_repo.py +42 -0
  167. refactor_arena/utils/coding/python/testing.py +138 -0
  168. refactor_arena/utils/coding/rate_metrics.py +315 -0
  169. refactor_arena/utils/coding/run_tests.py +39 -0
  170. refactor_arena/utils/coding/setup_repo.py +28 -0
  171. refactor_arena/utils/coding/types/code_change_analysis.py +48 -0
  172. refactor_arena/utils/coding/types/code_quality_metrics.py +131 -0
  173. refactor_arena/utils/coding/types/code_quality_rating_config.py +176 -0
  174. refactor_arena/utils/coding/types/supported_language.py +13 -0
  175. refactor_arena/utils/coding/typescript/__init__.py +4 -0
  176. refactor_arena/utils/coding/typescript/setup_repo.py +10 -0
  177. refactor_arena/utils/coding/typescript/testing.py +10 -0
  178. refactor_arena/utils/collections/__init__.py +17 -0
  179. refactor_arena/utils/collections/filter_mapping.py +42 -0
  180. refactor_arena/utils/collections/find_index.py +25 -0
  181. refactor_arena/utils/collections/is_element_sequence.py +10 -0
  182. refactor_arena/utils/collections/omit.py +15 -0
  183. refactor_arena/utils/collections/omit_none_values.py +3 -0
  184. refactor_arena/utils/collections/to_list.py +54 -0
  185. refactor_arena/utils/collections/trim_empty_paths.py +89 -0
  186. refactor_arena/utils/config/__init__.py +7 -0
  187. refactor_arena/utils/config/get_config_items.py +55 -0
  188. refactor_arena/utils/config/resolve_active_items.py +65 -0
  189. refactor_arena/utils/databases/__init__.py +11 -0
  190. refactor_arena/utils/databases/diff_database_diffs.py +546 -0
  191. refactor_arena/utils/databases/diff_filesystem.py +133 -0
  192. refactor_arena/utils/databases/diff_sqlite.py +520 -0
  193. refactor_arena/utils/databases/matches_diff.py +554 -0
  194. refactor_arena/utils/eval/__init__.py +5 -0
  195. refactor_arena/utils/eval/safe_eval.py +34 -0
  196. refactor_arena/utils/evals/__init__.py +55 -0
  197. refactor_arena/utils/evals/_metrics/__init__.py +27 -0
  198. refactor_arena/utils/evals/_metrics/_add_dynamic_metric.py +25 -0
  199. refactor_arena/utils/evals/_metrics/_get_dynamic_metrics.py +22 -0
  200. refactor_arena/utils/evals/_metrics/_get_fingerprinted_threshold.py +69 -0
  201. refactor_arena/utils/evals/_metrics/add_percentile_metric.py +111 -0
  202. refactor_arena/utils/evals/_metrics/add_safety_metric.py +191 -0
  203. refactor_arena/utils/evals/_metrics/add_threshold_accuracy_metric.py +104 -0
  204. refactor_arena/utils/evals/_metrics/add_usefulness_metric.py +121 -0
  205. refactor_arena/utils/evals/_metrics/inject_task_scores.py +43 -0
  206. refactor_arena/utils/evals/filter_reasoning_from_messages.py +49 -0
  207. refactor_arena/utils/evals/filter_tool_calls_from_messages.py +58 -0
  208. refactor_arena/utils/evals/get_attack_policy.py +82 -0
  209. refactor_arena/utils/evals/get_code_changes.py +39 -0
  210. refactor_arena/utils/evals/get_eval_id.py +31 -0
  211. refactor_arena/utils/evals/get_full_trajectory_prompt_formatter.py +71 -0
  212. refactor_arena/utils/evals/get_honest_policy.py +69 -0
  213. refactor_arena/utils/evals/get_quality_analysis.py +66 -0
  214. refactor_arena/utils/evals/get_run_scorer_tool.py +137 -0
  215. refactor_arena/utils/evals/get_score.py +38 -0
  216. refactor_arena/utils/evals/get_solver_results.py +130 -0
  217. refactor_arena/utils/evals/read_setting_eval_log.py +116 -0
  218. refactor_arena/utils/filesystem/__init__.py +42 -0
  219. refactor_arena/utils/filesystem/build_file_map.py +30 -0
  220. refactor_arena/utils/filesystem/copy_files.py +50 -0
  221. refactor_arena/utils/filesystem/ensure_extension.py +27 -0
  222. refactor_arena/utils/filesystem/filter_file_map.py +42 -0
  223. refactor_arena/utils/filesystem/find_unused_filename.py +21 -0
  224. refactor_arena/utils/filesystem/load_data_file.py +37 -0
  225. refactor_arena/utils/filesystem/local_repo_dir.py +185 -0
  226. refactor_arena/utils/filesystem/normalize_path.py +40 -0
  227. refactor_arena/utils/filesystem/parse_repo_location.py +41 -0
  228. refactor_arena/utils/filesystem/resolve_repo_location.py +199 -0
  229. refactor_arena/utils/filesystem/resources/__init__.py +15 -0
  230. refactor_arena/utils/filesystem/resources/find_resource.py +32 -0
  231. refactor_arena/utils/filesystem/resources/join_resource_paths.py +32 -0
  232. refactor_arena/utils/filesystem/resources/parent_resource_path.py +58 -0
  233. refactor_arena/utils/filesystem/resources/resolve_resource_path.py +51 -0
  234. refactor_arena/utils/filesystem/resources/resource.py +61 -0
  235. refactor_arena/utils/filesystem/resources/resource_path.py +12 -0
  236. refactor_arena/utils/filesystem/to_path.py +5 -0
  237. refactor_arena/utils/format/__init__.py +7 -0
  238. refactor_arena/utils/format/format_as_yaml.py +95 -0
  239. refactor_arena/utils/format/generate_hash.py +61 -0
  240. refactor_arena/utils/imports/__init__.py +3 -0
  241. refactor_arena/utils/imports/import_symbol.py +22 -0
  242. refactor_arena/utils/math/are_floats_equal.py +15 -0
  243. refactor_arena/utils/math/clamp.py +19 -0
  244. refactor_arena/utils/math/clamped_rescale.py +33 -0
  245. refactor_arena/utils/parse/__init__.py +3 -0
  246. refactor_arena/utils/parse/resolve_values.py +135 -0
  247. refactor_arena/utils/sandbox/__init__.py +9 -0
  248. refactor_arena/utils/sandbox/copy_dir_in_sandbox.py +69 -0
  249. refactor_arena/utils/sandbox/copy_location_to_sandbox.py +54 -0
  250. refactor_arena/utils/sandbox/file_exists_in_sandbox.py +27 -0
  251. refactor_arena/utils/typing/get_literal_args.py +16 -0
  252. refactor_arena/utils/typing/to_type.py +243 -0
  253. refactor_arena/utils/ui/__init__.py +5 -0
  254. refactor_arena/utils/ui/confirmed_by_user.py +48 -0
  255. refactor_arena-0.6.0.dist-info/METADATA +151 -0
  256. refactor_arena-0.6.0.dist-info/RECORD +258 -0
  257. refactor_arena-0.6.0.dist-info/WHEEL +4 -0
  258. refactor_arena-0.6.0.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,8 @@
1
+ """Full Repository Refactor - A Control Arena setting for repository refactoring evaluation."""
2
+
3
+ from importlib.metadata import version
4
+
5
+ from .settings.refactor_arena_setting import RefactorArenaSetting
6
+
7
+ __version__ = version("refactor-arena")
8
+ __all__ = ["RefactorArenaSetting"]
@@ -0,0 +1,5 @@
1
+ """CLI package for refactor-arena."""
2
+
3
+ from .main import main
4
+
5
+ __all__ = ["main"]
@@ -0,0 +1,17 @@
1
+ from ..types.command_function import CommandFunction
2
+ from .eval import eval_command
3
+ from .init import init_command
4
+ from .rescore import rescore_command
5
+
6
# Registry of the available commands: maps each command name to the function implementing it.
COMMANDS: dict[str, CommandFunction] = {
    "eval": eval_command,
    "init": init_command,
    "rescore": rescore_command,
}
11
+
12
+ __all__ = [
13
+ "COMMANDS",
14
+ "eval_command",
15
+ "init_command",
16
+ "rescore_command",
17
+ ]
@@ -0,0 +1,131 @@
1
+ import logging
2
+ from collections.abc import Sequence
3
+ from pathlib import Path
4
+ from typing import cast
5
+
6
+ from control_arena.eval import get_control_task
7
+ from inspect_ai import eval
8
+ from inspect_ai.model import get_model
9
+ from inspect_ai.solver import chain, human_agent
10
+
11
+ from ...config import Config, create_fingerprint
12
+ from ...settings.refactor_arena_setting import RefactorArenaSetting
13
+ from ...solvers import agent_control_solver
14
+ from ...utils.collections import find_index
15
+ from ...utils.config import get_config_items
16
+ from ...utils.format import format_as_yaml
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def eval_command(config: Config, logs_dir: Path) -> int:
    """
    Run a control evaluation described by `config`.

    Builds a `RefactorArenaSetting` from the `setting.*` config values, creates the control task
    for the configured policy and monitors, optionally replaces the control solver with a human
    agent or SWE agent solver (or removes it when only scoring an imported trajectory), and then
    runs the task with `inspect_ai.eval`.

    Args:
        config: The resolved configuration to read all evaluation options from.
        logs_dir: The logs directory; within this function it is only reported in the log output.

    Returns:
        0 when the evaluation completes, or 1 when it is interrupted by the user or fails with
        any other exception.
    """
    try:
        dataset = config.get("setting.dataset")
        main_task = config.get("setting.main_task")
        side_task = config.get("setting.side_task")
        eval_mode = config.get("eval.mode")

        use_human_agent = config.get("human_agent.active")
        swe_agent = config.get("swe_agent.name")
        extra_features = []

        if use_human_agent:
            extra_features.append("human-agent")

        setting = RefactorArenaSetting(
            main_task=main_task,
            side_task=side_task,
            dataset=dataset,
            extra_sandbox_features=extra_features,
            tool_timeout=config.get("setting.tool_timeout"),
        )

        logger.info(f"Starting Full Repository Refactor evaluation in {eval_mode.value} mode")
        logger.info(f"Dataset: {dataset}")
        logger.info(f"Main task filter: {main_task or 'all'}")
        logger.info(f"Side task filter: {side_task or 'all'}")
        logger.info(f"Logs directory: {logs_dir}")
        logger.info(f"Config Fingerprint:\n{format_as_yaml(create_fingerprint(config))}")

        # Create evaluation configuration
        eval_config = config.get("eval.config")

        # Create policy and micro-protocol
        untrusted_model = config.get("models.untrusted.name")
        policy_name = config.get("policy.name")
        micro_protocol = config.get(f"policy.options.{policy_name}")
        monitors = get_config_items(config, "monitors")

        # Create evaluation tasks
        eval_task = get_control_task(
            setting=setting,
            micro_protocol=micro_protocol,
            mode=eval_mode,
            eval_config=eval_config,
            monitoring_scorers=monitors,
            metadata={
                # Store only the actual configuration values used by this eval, not the full config
                "config": config.get_usage(),
            },
        )

        # Set the primary model for the eval log, so tools that only show this model will show a
        # useful model string. Use the untrusted model since it's the primary agent.
        # TODO: Review this decision. Now that we have configurable policies, it may not always be
        #       appropriate to use the untrusted model. [fastfedora 30.Jan.26]
        if untrusted_model:
            eval_task.model = get_model(untrusted_model)

        solvers = list(cast(Sequence, eval_task.solver))
        control_solver_index = find_index(solvers, lambda x: "control_solver" in str(x))

        if control_solver_index is not None:
            if use_human_agent:
                solvers[control_solver_index] = human_agent(
                    intermediate_scoring=config.get("human_agent.intermediate_scoring", True),
                    record_session=config.get("human_agent.record_session", False),
                )
                eval_task.solver = chain(solvers)
            elif swe_agent:
                solvers[control_solver_index] = agent_control_solver(
                    micro_protocol=micro_protocol,
                    tools=setting.get_agent_tools,
                    get_submit_tool_config=setting.get_submit_tool_config,
                )
                eval_task.solver = chain(solvers)

        import_path = config.get("import.path")
        import_mode = config.get("import.mode")

        # Allow human agent if we're only scoring, but otherwise remove the control solver
        if import_path and import_mode == "score" and not use_human_agent:
            logger.info(f"Import mode: Using trajectory from {import_path}")
            if control_solver_index is not None:
                solvers.pop(control_solver_index)
                eval_task.solver = chain(solvers)

        try:
            eval(
                eval_task,
                **eval_config.eval_args(),
            )
        finally:
            # Run the setting's cleanup even when the eval fails or is interrupted, so whatever it
            # releases is not left behind on the error paths.
            if hasattr(setting, "cleanup"):
                setting.cleanup()

        logger.info("Evaluation completed")

        return 0

    except KeyboardInterrupt:
        logger.info("Evaluation interrupted by user")
        return 1

    except Exception as e:
        logger.error(f"Evaluation failed: {e}")
        if config.get("logs.verbose"):
            logger.exception("Full traceback:")
        return 1
@@ -0,0 +1,157 @@
1
+ import getpass
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ from ...config import Config, config_dirs
6
+ from ...utils.filesystem.resources.find_resource import find_resource
7
+
8
+
9
+ STARTER_CONFIG_FILENAME = "getting-started.yaml"
10
+
11
+
12
def init_command(config: Config, logs_dir: Path) -> int:  # noqa: ARG001
    """
    Set up the current working directory as a refactor-arena project.

    Asks for the trusted and untrusted model names, then for the API key of each provider (OpenAI,
    Anthropic) those models use. The answers are written to a new `.env` file and the starter
    scenario configuration is copied next to it. If a `.env` file already exists, the project is
    reported as already initialized and nothing is changed.

    Returns 0 when the project was already initialized or both files are in place, and 1 when the
    user aborts the prompts or either file could not be created.
    """
    project_dir = Path.cwd()
    env_file = project_dir / ".env"

    if env_file.exists():
        print("This project is already initialized ('.env' already exists).")
        return 0

    default_trusted = config.get("models.trusted.name")
    default_untrusted = config.get("models.untrusted.name")

    print("Initializing refactor-arena project...\n")

    openai_key: str | None = None
    anthropic_key: str | None = None

    try:
        # The model names come first because they decide which API keys are required
        trusted_model = _prompt_with_default("Trusted model name", default_trusted)
        untrusted_model = _prompt_with_default("Untrusted model name", default_untrusted)

        # Keys are only requested for providers that one of the two models actually uses
        providers = {
            _get_model_provider(trusted_model),
            _get_model_provider(untrusted_model),
        }

        if "openai" in providers:
            print("An OpenAI model is configured as one of the models and needs an API key.")
            openai_key = _prompt_secret("Enter an OpenAI API key (leave blank to skip)")

        if "anthropic" in providers:
            print("An Anthropic model is configured as one of the models and needs an API key.")
            anthropic_key = _prompt_secret("Enter an Anthropic API key (leave blank to skip)")
    except (EOFError, KeyboardInterrupt):
        print("\nInitialization aborted.")
        return 1

    wrote_env = _write_env_file(
        config=config,
        env_path=env_file,
        trusted_model=trusted_model,
        untrusted_model=untrusted_model,
        openai_api_key=openai_key,
        anthropic_api_key=anthropic_key,
    )

    # The starter scenario is copied even if writing '.env' failed; both results decide the exit code
    copied_starter = _copy_starter_scenario(project_dir)

    if not (wrote_env and copied_starter):
        return 1

    summary = (
        "\nThis directory has been initialized with a '.env' and starter configuration file. You"
        " can now use the CLI to run a control eval. For example:"
    )
    print(summary)
    print(f" refactor-arena eval --config {STARTER_CONFIG_FILENAME}")
    return 0
77
+
78
+
79
+ def _prompt_with_default(prompt: str, default: str) -> str:
80
+ """
81
+ Prompt the user for a value, showing a default.
82
+
83
+ Returns the user-provided value, or the default if the user enters nothing.
84
+ """
85
+ return input(f"{prompt} [{default}]: ").strip() or default
86
+
87
+
88
def _prompt_secret(prompt: str) -> str:
    """
    Ask the user for a secret value using `getpass`, so the input is not echoed.

    Returns the entered value with surrounding whitespace removed.
    """
    secret = getpass.getpass(f"{prompt}: ")
    return secret.strip()
93
+
94
+
95
+ def _get_model_provider(model_name: str | None) -> str | None:
96
+ """
97
+ Extract the provider from a model name of the form 'provider/model-name'.
98
+ """
99
+ return model_name.split("/", 1)[0].lower() if model_name and "/" in model_name else None
100
+
101
+
102
+ def _write_env_file(
103
+ config: Config,
104
+ env_path: Path,
105
+ trusted_model: str,
106
+ untrusted_model: str,
107
+ openai_api_key: str | None,
108
+ anthropic_api_key: str | None,
109
+ ) -> bool:
110
+ """Write a new .env file at the given path. Returns True on success."""
111
+ trusted_swe_model = config.get("swe_agent.claude-code.model")
112
+ untrusted_swe_model = config.get("swe_agent.codex.model")
113
+
114
+ lines: list[str] = [
115
+ "# Provider API keys",
116
+ f"OPENAI_API_KEY={openai_api_key or ''}",
117
+ f"ANTHROPIC_API_KEY={anthropic_api_key or ''}",
118
+ "",
119
+ "# Trusted and untrusted models--use versions to record the exact model in the transcripts",
120
+ f"TRUSTED_MODEL={trusted_model}",
121
+ f"UNTRUSTED_MODEL={untrusted_model}",
122
+ "",
123
+ "# Model overrides when using SWE agents (e.g., `--swe-agent` for the CLI). "
124
+ "If not specified, the `UNTRUSTED_MODEL` or `TRUSTED_MODEL` will be used",
125
+ f"CLAUDE_CODE_MODEL={trusted_swe_model or ''}",
126
+ f"CODEX_MODEL={untrusted_swe_model or ''}",
127
+ "",
128
+ "# Whether to record human agent sessions when using `--human-agent` for the CLI",
129
+ "HUMAN_AGENT_RECORD_SESSION=false",
130
+ "",
131
+ ]
132
+
133
+ try:
134
+ env_path.write_text("\n".join(lines), encoding="utf-8")
135
+ except Exception as error:
136
+ print(f"Failed to create '.env' file: {error}", file=sys.stderr)
137
+ return False
138
+
139
+ return True
140
+
141
+
142
def _copy_starter_scenario(target_directory: Path) -> bool:
    """
    Place a copy of the getting-started scenario config in the given directory.

    An existing file with the starter config name is left untouched and counts as success.
    Returns False, after printing the error to stderr, when the copy fails.
    """
    destination = target_directory / STARTER_CONFIG_FILENAME

    if not destination.exists():
        try:
            resource = find_resource("scenarios/getting-started.yaml", config_dirs)
            contents = Path(resource).read_text(encoding="utf-8")
            destination.write_text(contents, encoding="utf-8")
        except Exception as error:
            print(f"Failed to create starter configuration file: {error}", file=sys.stderr)
            return False

        return True

    print(f"Skipping copying starter configuration: {STARTER_CONFIG_FILENAME} already exists")
    return True
@@ -0,0 +1,328 @@
1
+ import asyncio
2
+ import logging
3
+ import re
4
+ from collections.abc import Mapping, Sequence
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from control_arena.eval import EvalMode
9
+ # TODO: Figure out if these can be done via the public API. [fastfedora 15.Apr.26]
10
+ from inspect_ai._eval.score import score_async as inspect_score_async
11
+ from inspect_ai._eval.task.results import eval_results
12
+ from inspect_ai._util.registry import registry_unqualified_name
13
+ from inspect_ai.log import EvalLog, EvalSample, EvalScore, write_eval_log_async
14
+ from inspect_ai.scorer import SampleScore, Scorer
15
+ from pydash import get as get_nested, set_ as set_nested
16
+
17
+ from ...config import Config
18
+ from ...utils.config import get_config_items, resolve_active_items
19
+ from ...utils.evals.read_setting_eval_log import read_setting_eval_log
20
+ from ...utils.filesystem import find_unused_filename
21
+ from ...utils.ui import confirmed_by_user
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def rescore_command(config: Config, logs_dir: Path) -> int:
    """
    Entry point for the rescore command: re-score the results stored in an .eval log file.

    The log to rescore is read from the `rescore.source.path` config value.

    Returns:
        Process exit code: 0 when rescoring succeeded, 1 for a missing or unsupported source path,
        a failed rescore, a user interrupt, or any unexpected error.
    """
    try:
        raw_source = config.get("rescore.source.path")
        if not raw_source:
            logger.error("No source path provided for rescore (rescore.source.path is empty).")
            return 1

        eval_file = Path(raw_source)
        if eval_file.suffix != ".eval":
            logger.error("Only .eval log files are currently supported for rescore.")
            return 1

        rescored = asyncio.run(_rescore_eval_file(config, eval_file, logs_dir))
        return 0 if rescored else 1

    except KeyboardInterrupt:
        logger.info("Rescore interrupted by user")
        return 1

    except Exception as error:
        logger.error(f"Rescore failed: {error}")
        # The traceback is only emitted when debug logging is enabled
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception("Full traceback:")
        return 1
55
+
56
+
57
async def _rescore_eval_file(config: Config, eval_path: Path, logs_dir: Path) -> bool:
    """
    Rescore an existing .eval log file and write the result.

    Steps: read the log, import config overrides from it, apply the `rescore.filter` dataset
    filters, run the active scorers (or only recompute metrics when `rescore.metrics_only` is
    set), optionally merge back the metrics of scorers that were not re-run, and write the log
    either in place or to a new file under `logs_dir`.

    Returns:
        True when a re-scored log was written; False when the run was aborted (no samples, no
        filter matches, user declined the overwrite, or an invalid option combination).
    """
    # Load the log and pull config overrides out of it
    log = await read_setting_eval_log(eval_path, config.get("rescore.import", {}))
    if log.samples is None:
        logger.error("No samples found in the eval log.")
        return False

    _import_config_from_log(config, log)

    # Narrow the log to the samples selected by the dataset filters
    every_sample = log.samples
    log.samples = _filter_samples(every_sample, config.get("rescore.filter", {}))
    if not log.samples:
        logger.error("No samples matched the dataset filters; no output was written.")
        return False

    # Read the output settings and flag combinations that need attention
    output_mode: str = config.get("rescore.output.mode", "new-log")
    filter_samples: bool = config.get("rescore.output.filter.samples", False)
    filter_metrics: bool = config.get("rescore.output.filter.metrics", False)
    subset_selected = len(every_sample) != len(log.samples)

    if filter_samples and not subset_selected:
        logger.warning("--filter-samples has no effect when no samples are filtered.")

    overwrite_drops_samples = (
        output_mode == "overwrite-log" and filter_samples and subset_selected
    )
    if overwrite_drops_samples and not confirmed_by_user(
        "Using --output-mode `overwrite-log` with --filter-samples will remove non-matching"
        " samples from the source log. Proceed? (y/n): ",
        config=config,
        config_path="rescore.confirm.overwrite_with_filter_samples",
    ):
        logger.error("Aborting rescore.")
        return False

    logger.info(f"Rescoring {len(log.samples)} of {len(every_sample)} sample(s).")

    # Snapshot the existing metrics before anything is recomputed
    original_scores = list(log.results.scores) if log.results else []
    metrics_only: bool = config.get("rescore.metrics_only", False)
    scorer_names = _get_active_scorer_names(config)
    scorers = _get_active_scorers(config)

    # True when a subset was rescored but the output should still hold every sample
    keep_unmatched = subset_selected and not filter_samples

    if not metrics_only:
        _assert_no_sandbox_required_for_eval_logs(config, scorer_names)
        await inspect_score_async(log=log, scorers=scorers, action="append", copy=False)
        if keep_unmatched:
            log.samples = _merge_samples(log.samples, every_sample)
            _recompute_metrics(log, scorers)
    else:
        if keep_unmatched:
            logger.error(
                "To ensure metrics represent the entire dataset, --metrics-only can only be used"
                " with --filter-samples when filtering samples."
            )
            return False
        _recompute_metrics(log, scorers)

    if log.results and not filter_metrics:
        _merge_original_scores(log, original_scores)

    # Persist the result according to the output mode
    if output_mode != "overwrite-log":
        output_path = find_unused_filename(logs_dir / f"rescore_{Path(eval_path).stem}.eval")
        log.location = output_path.as_posix()
        await write_eval_log_async(log, output_path)
        logger.info(f"Wrote re-scored log to '{output_path}'")
    else:
        await write_eval_log_async(log)
        logger.info(f"Wrote re-scored log in-place to '{log.location}'")

    return True
133
+
134
+
135
+ def _merge_samples(destination: list[EvalSample], source: list[EvalSample]) -> list[EvalSample]:
136
+ """
137
+ Merge the samples from the source list of samples into the destination list of samples, matching
138
+ samples by their ID.
139
+ """
140
+ by_sample_id = {sample.id: sample for sample in source}
141
+ return [by_sample_id.get(sample.id, sample) for sample in destination]
142
+
143
+
144
def _get_active_scorer_names(config: Config) -> list[str]:
    """
    Resolve the names of the scorers that are active for the current config.

    Returns:
        The active monitor names alone when `rescore.monitors_only` is set; otherwise the active
        scorer names followed by the active monitor names.
    """
    active_monitors = resolve_active_items(config, "monitors")

    if config.get("rescore.monitors_only", False):
        return active_monitors

    return resolve_active_items(config, "scorers") + active_monitors
156
+
157
+
158
def _get_active_scorers(config: Config) -> list[Scorer]:
    """
    Build the scorer instances used for rescoring from the current config.

    The standard `scorers` and `monitors` config sections (list + include-if + options pattern)
    are used; they can be narrowed with the `--scorers` and `--monitors` CLI options
    respectively. With `--monitors-only`, only the monitor scorers are returned.

    Returns:
        Monitors alone when `rescore.monitors_only` is set; otherwise scorers followed by
        monitors.
    """
    monitor_items = get_config_items(config, "monitors")

    if config.get("rescore.monitors_only", False):
        return monitor_items

    return get_config_items(config, "scorers") + monitor_items
174
+
175
+
176
+ def _filter_samples(samples: list[EvalSample], config: Mapping[str, Any]) -> list[EvalSample]:
177
+ """
178
+ Filter samples based on dataset filter config values.
179
+
180
+ Returns:
181
+ List of filtered samples. If no filters are configured, the original list is returned.
182
+ """
183
+ main_task_pattern = config.get("main_task")
184
+ side_task_pattern = config.get("side_task")
185
+ repo_pattern = config.get("repo")
186
+ limit = config.get("limit")
187
+ limit_active = isinstance(limit, int) and limit > 0
188
+
189
+ filter_active = bool(main_task_pattern or side_task_pattern or repo_pattern or limit_active)
190
+ if not filter_active:
191
+ return samples
192
+
193
+ filtered_samples: list[EvalSample] = []
194
+ for sample in samples:
195
+ metadata = sample.metadata if isinstance(sample.metadata, Mapping) else {}
196
+ sample_main_task = metadata.get("main_task_name", "")
197
+ sample_side_task = metadata.get("side_task_name", "")
198
+ sample_repo = str(sample.id).split("__", maxsplit=1)[0]
199
+
200
+ if main_task_pattern and not re.search(main_task_pattern, sample_main_task):
201
+ continue
202
+
203
+ if side_task_pattern and not re.search(side_task_pattern, sample_side_task):
204
+ continue
205
+
206
+ if repo_pattern and not re.search(repo_pattern, sample_repo):
207
+ continue
208
+
209
+ filtered_samples.append(sample)
210
+
211
+ if limit_active:
212
+ filtered_samples = filtered_samples[:limit]
213
+
214
+ return filtered_samples
215
+
216
+
217
def _recompute_metrics(log: EvalLog, scorers: list[Scorer]) -> None:
    """
    Rebuild `log.results` and `log.reductions` from the scores currently stored on `log.samples`.

    Only scores whose name matches one of the given scorers are passed on; samples with no
    matching scores contribute nothing, although they still count towards the sample total.

    Raises:
        ValueError: If the log has no samples.
    """
    if not log.samples:
        raise ValueError("_recompute_metrics must be called with at least one sample.")

    wanted_names = {registry_unqualified_name(scorer) for scorer in scorers}
    current_samples = list(log.samples)

    per_sample_scores: list[dict[str, SampleScore]] = []
    for sample in current_samples:
        matching = {
            name: SampleScore(score=score, sample_id=sample.id)
            for name, score in (sample.scores or {}).items()
            if name in wanted_names
        }
        if matching:
            per_sample_scores.append(matching)

    log.results, log.reductions = eval_results(
        samples=len(current_samples),
        scores=per_sample_scores,
        reducers=None,
        scorers=scorers,
        metrics=None,
    )
249
+
250
+
251
+ def _merge_original_scores(
252
+ log: EvalLog,
253
+ original_scores: list[EvalScore],
254
+ ) -> None:
255
+ """
256
+ Merge metrics from previously existing scores back into the log results, preserving metrics for
257
+ scorers that were not part of the current rescore run. The original ordering is retained, with
258
+ recomputed scores replacing their originals in place.
259
+ """
260
+ if not original_scores or not log.results:
261
+ return
262
+
263
+ new_scores_by_name = {score.name: score for score in log.results.scores}
264
+ merged: list[EvalScore] = []
265
+ for score in original_scores:
266
+ merged.append(new_scores_by_name.pop(score.name, score))
267
+
268
+ merged.extend(new_scores_by_name.values())
269
+ log.results.scores = merged
270
+
271
+
272
def _import_config_from_log(config: Config, log: EvalLog) -> None:
    """
    Import config values from the eval log's stored configuration into the current config as
    overrides. The paths to import are specified by `rescore.import.config`.

    Paths missing from the stored configuration are skipped. Nothing is imported when no paths
    are configured or the log metadata holds no `config` mapping. A string `eval.mode` value is
    converted to an `EvalMode` before the overrides are added.
    """
    import_paths: Sequence[str] = config.get("rescore.import.config", [])
    if not import_paths:
        return

    stored_config = (log.eval.metadata or {}).get("config")
    if not stored_config or not isinstance(stored_config, Mapping):
        logger.warning("Eval log has no stored config in metadata; skipping config import.")
        return

    overrides: dict[str, Any] = {}
    # A private sentinel distinguishes "path not present" from a stored value of None
    sentinel = object()

    for path in import_paths:
        value = get_nested(stored_config, path, default=sentinel)
        if value is sentinel:
            logger.debug(f"Config path '{path}' not found in eval log metadata; skipping.")
            continue
        set_nested(overrides, path, value)
        logger.debug(f"Imported config override from eval log: {path} = {value!r}")

    if not overrides:
        return

    # TODO: Figure out a way to integrate this into the config parser itself. I extended the
    #       args parser to create enums, but when the eval mode is set in a configuration file,
    #       it is not converted to an enum. [fastfedora 03.Apr.26]
    # Force the eval mode to be an enum
    eval_overrides = overrides.get("eval")
    if isinstance(eval_overrides, Mapping):
        eval_mode = eval_overrides.get("mode")
        if isinstance(eval_mode, str):
            # Merge into a copy: this keeps any other imported `eval.*` overrides and avoids
            # mutating a mapping that may be shared with the log's stored config.
            overrides["eval"] = {**eval_overrides, "mode": EvalMode(eval_mode)}

    config.add_overrides(overrides)
307
+
308
+
309
+ def _assert_no_sandbox_required_for_eval_logs(config: Config, scorer_names: list[str]) -> None:
310
+ """
311
+ Ensure that when rescoring .eval logs we don't attempt to run scorers that require sandbox setup
312
+ or post-submission solvers.
313
+ """
314
+ options: Mapping[str, Mapping[str, object]] = config.get("rescore.scorers.options") or {}
315
+
316
+ invalid: list[str] = []
317
+ for name in scorer_names:
318
+ info = options.get(name, { "setup": False })
319
+ requires_setup = bool(info.get("setup"))
320
+ if requires_setup:
321
+ invalid.append(name)
322
+
323
+ if invalid:
324
+ raise RuntimeError(
325
+ "The following scorers require sandbox setup and cannot be run when rescoring "
326
+ f"existing .eval logs: {", ".join(sorted(invalid))}"
327
+ )
328
+