@elizaos/sweagent-root 2.0.0-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +270 -0
- package/package.json +71 -0
- package/python/LICENSE +21 -0
- package/python/config/README.md +15 -0
- package/python/config/bash_only.yaml +222 -0
- package/python/config/benchmarks/250212_sweagent_heavy_sbl.yaml +188 -0
- package/python/config/benchmarks/250225_anthropic_filemap_simple_review.yaml +75 -0
- package/python/config/benchmarks/250522_anthropic_filemap_simple_review.yaml +92 -0
- package/python/config/benchmarks/250526_anthropic_filemap_simple_review_sbl.yaml +93 -0
- package/python/config/benchmarks/anthropic_filemap_multilingual.yaml +66 -0
- package/python/config/coding_challenge.yaml +104 -0
- package/python/config/default.yaml +69 -0
- package/python/config/default_backticks.yaml +69 -0
- package/python/config/default_mm_no_images.yaml +82 -0
- package/python/config/default_mm_with_images.yaml +83 -0
- package/python/config/demo/default.yaml +80 -0
- package/python/config/demo/no_instructions.yaml +69 -0
- package/python/config/demo/only_bash.yaml +60 -0
- package/python/config/exotic/default_shell.yaml +52 -0
- package/python/config/exotic/windowed_replace.yaml +125 -0
- package/python/config/exotic/windowed_replace_late_repro.yaml +127 -0
- package/python/config/human/human.yaml +24 -0
- package/python/config/human/human_demo.yaml +52 -0
- package/python/config/sweagent_0_7/07.yaml +101 -0
- package/python/config/sweagent_0_7/07_fcalling.yaml +100 -0
- package/python/config/sweagent_0_7/07_from_url.yaml +114 -0
- package/python/config/sweagent_0_7/07_thought_action.yaml +102 -0
- package/python/config/sweagent_0_7/07_thought_action_xml.yaml +96 -0
- package/python/mlc_config.json +44 -0
- package/python/pyproject.toml +262 -0
- package/python/sweagent/__init__.py +114 -0
- package/python/sweagent/__main__.py +4 -0
- package/python/sweagent/agent/__init__.py +0 -0
- package/python/sweagent/agent/action_sampler.py +317 -0
- package/python/sweagent/agent/agents.py +1294 -0
- package/python/sweagent/agent/extra/shell_agent.py +106 -0
- package/python/sweagent/agent/history_processors.py +399 -0
- package/python/sweagent/agent/hooks/__init__.py +0 -0
- package/python/sweagent/agent/hooks/abstract.py +139 -0
- package/python/sweagent/agent/hooks/status.py +34 -0
- package/python/sweagent/agent/models.py +896 -0
- package/python/sweagent/agent/problem_statement.py +312 -0
- package/python/sweagent/agent/reviewer.py +664 -0
- package/python/sweagent/environment/__init__.py +0 -0
- package/python/sweagent/environment/hooks/__init__.py +0 -0
- package/python/sweagent/environment/hooks/abstract.py +60 -0
- package/python/sweagent/environment/hooks/status.py +28 -0
- package/python/sweagent/environment/repo.py +219 -0
- package/python/sweagent/environment/swe_env.py +276 -0
- package/python/sweagent/exceptions.py +54 -0
- package/python/sweagent/inspector/README.md +6 -0
- package/python/sweagent/inspector/__init__.py +0 -0
- package/python/sweagent/inspector/favicon.ico +0 -0
- package/python/sweagent/inspector/fileViewer.js +354 -0
- package/python/sweagent/inspector/icons/computer.png +0 -0
- package/python/sweagent/inspector/icons/edit_icon.svg +11 -0
- package/python/sweagent/inspector/icons/swe-agent-logo-50.png +0 -0
- package/python/sweagent/inspector/icons/swellama_blue.png +0 -0
- package/python/sweagent/inspector/icons/swellama_brown.png +0 -0
- package/python/sweagent/inspector/icons/swellama_grey.png +0 -0
- package/python/sweagent/inspector/icons/swellama_tan.png +0 -0
- package/python/sweagent/inspector/index.html +25 -0
- package/python/sweagent/inspector/server.py +354 -0
- package/python/sweagent/inspector/static.py +169 -0
- package/python/sweagent/inspector/style.css +454 -0
- package/python/sweagent/run/__init__.py +0 -0
- package/python/sweagent/run/_progress.py +158 -0
- package/python/sweagent/run/batch_instances.py +419 -0
- package/python/sweagent/run/common.py +387 -0
- package/python/sweagent/run/compare_runs.py +123 -0
- package/python/sweagent/run/extract_pred.py +19 -0
- package/python/sweagent/run/hooks/__init__.py +0 -0
- package/python/sweagent/run/hooks/abstract.py +67 -0
- package/python/sweagent/run/hooks/apply_patch.py +106 -0
- package/python/sweagent/run/hooks/open_pr.py +244 -0
- package/python/sweagent/run/hooks/swe_bench_evaluate.py +113 -0
- package/python/sweagent/run/inspector_cli.py +493 -0
- package/python/sweagent/run/merge_predictions.py +64 -0
- package/python/sweagent/run/quick_stats.py +96 -0
- package/python/sweagent/run/remove_unfinished.py +63 -0
- package/python/sweagent/run/rich_test.py +91 -0
- package/python/sweagent/run/run.py +147 -0
- package/python/sweagent/run/run_batch.py +442 -0
- package/python/sweagent/run/run_replay.py +219 -0
- package/python/sweagent/run/run_shell.py +155 -0
- package/python/sweagent/run/run_single.py +225 -0
- package/python/sweagent/run/run_traj_to_demo.py +85 -0
- package/python/sweagent/tools/__init__.py +0 -0
- package/python/sweagent/tools/bundle.py +57 -0
- package/python/sweagent/tools/commands.py +220 -0
- package/python/sweagent/tools/parsing.py +619 -0
- package/python/sweagent/tools/tools.py +430 -0
- package/python/sweagent/tools/utils.py +108 -0
- package/python/sweagent/types.py +102 -0
- package/python/sweagent/utils/__init__.py +0 -0
- package/python/sweagent/utils/config.py +80 -0
- package/python/sweagent/utils/files.py +27 -0
- package/python/sweagent/utils/github.py +118 -0
- package/python/sweagent/utils/jinja_warnings.py +14 -0
- package/python/sweagent/utils/log.py +175 -0
- package/python/sweagent/utils/patch_formatter.py +152 -0
- package/python/sweagent/utils/serialization.py +45 -0
- package/python/tests/__init__.py +0 -0
- package/python/tests/conftest.py +191 -0
- package/python/tests/test_agent.py +258 -0
- package/python/tests/test_batch_instance.py +43 -0
- package/python/tests/test_commands/_interactive_dummy.py +35 -0
- package/python/tests/test_commands/interactive_dummy_wrapper.sh +29 -0
- package/python/tests/test_data/config_files/dummy_interactive.yaml +62 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/README.md +13 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/customrandom.c +50 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/release +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/server +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/solver.py +12 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/README.md +16 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/challenge.json +9 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/flash_c8429a430278283c0e571baebca3d139.zip +0 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/README.md +15 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/challenge.json +10 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/networking.pcap +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/Dockerfile +28 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/challenge.json +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/flag.txt +1 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.c +26 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.py +9 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/challenge.json +8 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock +0 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock.cpp +167 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/solution.cpp +24 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/solution.py +6 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/test.sh +10 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/000-default.conf +18 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/file.pl +38 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/forms.pl +40 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/hello.pl +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/flag +1 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/index.html +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/solution.txt +1 -0
- package/python/tests/test_data/data_sources/debug_20240322.json +1 -0
- package/python/tests/test_data/data_sources/expert_instances.yaml +16 -0
- package/python/tests/test_data/data_sources/human_eval.json +1 -0
- package/python/tests/test_data/data_sources/simple_instances.yaml +3 -0
- package/python/tests/test_data/data_sources/simple_instances_long.yaml +30 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy_first_only.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-lite-test.json +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/6e44b9__sweagenttestrepo-1c2844.traj +342 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/solution_missing_colon.py +15 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/args.yaml +518 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/swe-agent__test-repo-i1.traj +124 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/all_preds.jsonl +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/args.yaml +520 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/patches/pydicom__pydicom-1458.patch +18 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/pydicom__pydicom-1458.traj +257 -0
- package/python/tests/test_env.py +66 -0
- package/python/tests/test_env_utils.py +129 -0
- package/python/tests/test_history_processors.py +40 -0
- package/python/tests/test_models.py +23 -0
- package/python/tests/test_openai_live.py +164 -0
- package/python/tests/test_packaging.py +7 -0
- package/python/tests/test_parsing.py +131 -0
- package/python/tests/test_problem_statement_multimodal.py +111 -0
- package/python/tests/test_quick_stats.py +42 -0
- package/python/tests/test_run.py +37 -0
- package/python/tests/test_run_batch.py +110 -0
- package/python/tests/test_run_hooks.py +114 -0
- package/python/tests/test_run_replay.py +33 -0
- package/python/tests/test_run_single.py +125 -0
- package/python/tests/test_tools_command_parsing.py +193 -0
- package/python/tests/test_utils.py +15 -0
- package/python/tests/tools/__init__.py +0 -0
- package/python/tests/tools/conftest.py +12 -0
- package/python/tests/tools/test_default_utils.py +153 -0
- package/python/tests/tools/test_edit_replace.py +0 -0
- package/python/tests/tools/test_split_string.py +82 -0
- package/python/tests/utils.py +29 -0
- package/python/tools/diff_state/bin/_state_diff_state +52 -0
- package/python/tools/diff_state/config.yaml +2 -0
- package/python/tools/edit_anthropic/bin/_state_anthropic +21 -0
- package/python/tools/edit_anthropic/bin/str_replace_editor +710 -0
- package/python/tools/edit_anthropic/config.yaml +56 -0
- package/python/tools/edit_anthropic/install.sh +3 -0
- package/python/tools/filemap/bin/filemap +45 -0
- package/python/tools/filemap/config.yaml +9 -0
- package/python/tools/filemap/install.sh +2 -0
- package/python/tools/forfeit/bin/exit_forfeit +5 -0
- package/python/tools/forfeit/config.yaml +5 -0
- package/python/tools/image_tools/bin/view_image +36 -0
- package/python/tools/image_tools/config.yaml +9 -0
- package/python/tools/multilingual_setup/bin/do_nothing +2 -0
- package/python/tools/multilingual_setup/config.yaml +1 -0
- package/python/tools/multilingual_setup/install.sh +45 -0
- package/python/tools/registry/bin/_read_env +10 -0
- package/python/tools/registry/bin/_write_env +10 -0
- package/python/tools/registry/config.yaml +1 -0
- package/python/tools/registry/install.sh +6 -0
- package/python/tools/registry/lib/__init__.py +0 -0
- package/python/tools/registry/lib/registry.py +56 -0
- package/python/tools/review_on_submit_m/README.md +6 -0
- package/python/tools/review_on_submit_m/bin/submit +54 -0
- package/python/tools/review_on_submit_m/config.yaml +6 -0
- package/python/tools/review_on_submit_m/install.sh +0 -0
- package/python/tools/search/bin/find_file +31 -0
- package/python/tools/search/bin/search_dir +39 -0
- package/python/tools/search/bin/search_file +55 -0
- package/python/tools/search/config.yaml +37 -0
- package/python/tools/search/install.sh +3 -0
- package/python/tools/submit/bin/submit +17 -0
- package/python/tools/submit/config.yaml +5 -0
- package/python/tools/web_browser/bin/click_mouse +41 -0
- package/python/tools/web_browser/bin/close_site +28 -0
- package/python/tools/web_browser/bin/double_click_mouse +37 -0
- package/python/tools/web_browser/bin/drag_mouse +46 -0
- package/python/tools/web_browser/bin/execute_script_on_page +39 -0
- package/python/tools/web_browser/bin/get_console_output +48 -0
- package/python/tools/web_browser/bin/move_mouse +35 -0
- package/python/tools/web_browser/bin/navigate_back +33 -0
- package/python/tools/web_browser/bin/navigate_forward +33 -0
- package/python/tools/web_browser/bin/open_site +36 -0
- package/python/tools/web_browser/bin/press_keys_on_page +51 -0
- package/python/tools/web_browser/bin/reload_page +33 -0
- package/python/tools/web_browser/bin/run_web_browser_server +394 -0
- package/python/tools/web_browser/bin/screenshot_site +38 -0
- package/python/tools/web_browser/bin/scroll_on_page +40 -0
- package/python/tools/web_browser/bin/set_browser_window_size +40 -0
- package/python/tools/web_browser/bin/type_text +34 -0
- package/python/tools/web_browser/bin/wait_time +39 -0
- package/python/tools/web_browser/config.yaml +155 -0
- package/python/tools/web_browser/install.sh +22 -0
- package/python/tools/web_browser/lib/browser_manager.py +404 -0
- package/python/tools/web_browser/lib/web_browser_config.py +33 -0
- package/python/tools/web_browser/lib/web_browser_utils.py +126 -0
- package/python/tools/web_browser/test_console.html +1 -0
- package/python/tools/windowed/bin/_state +25 -0
- package/python/tools/windowed/bin/create +29 -0
- package/python/tools/windowed/bin/goto +37 -0
- package/python/tools/windowed/bin/open +49 -0
- package/python/tools/windowed/bin/scroll_down +12 -0
- package/python/tools/windowed/bin/scroll_up +13 -0
- package/python/tools/windowed/config.yaml +38 -0
- package/python/tools/windowed/install.sh +15 -0
- package/python/tools/windowed/lib/__init__.py +0 -0
- package/python/tools/windowed/lib/flake8_utils.py +147 -0
- package/python/tools/windowed/lib/windowed_file.py +312 -0
- package/python/tools/windowed_edit_linting/bin/edit +128 -0
- package/python/tools/windowed_edit_linting/config.yaml +31 -0
- package/python/tools/windowed_edit_linting/install.sh +5 -0
- package/python/tools/windowed_edit_replace/bin/edit +172 -0
- package/python/tools/windowed_edit_replace/bin/insert +77 -0
- package/python/tools/windowed_edit_replace/config.yaml +60 -0
- package/python/tools/windowed_edit_replace/install.sh +5 -0
- package/python/tools/windowed_edit_rewrite/bin/edit +78 -0
- package/python/tools/windowed_edit_rewrite/config.yaml +11 -0
- package/python/tools/windowed_edit_rewrite/install.sh +5 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyEncryption.traj +318 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyTimeCapsule.traj +197 -0
- package/python/trajectories/demonstrations/ctf/crypto/eps.traj +289 -0
- package/python/trajectories/demonstrations/ctf/crypto/katy.traj +368 -0
- package/python/trajectories/demonstrations/ctf/forensics/flash.traj +102 -0
- package/python/trajectories/demonstrations/ctf/misc/networking_1.traj +102 -0
- package/python/trajectories/demonstrations/ctf/pwn/warmup.traj +159 -0
- package/python/trajectories/demonstrations/ctf/rev/rock.traj +251 -0
- package/python/trajectories/demonstrations/ctf/web/i_got_id_demo.traj +422 -0
- package/python/trajectories/demonstrations/function_calling_simple.traj +151 -0
- package/python/trajectories/demonstrations/human_thought__swe-bench-HumanEvalFix-python__lcb__t-0.00__p-0.95__c-4.00__install-0/humanevalfix-python-0.traj +129 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default__t-0.20__p-0.95__c-2.00__install-1___install_from_source/marshmallow-code__marshmallow-1867.traj +318 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling__install-1/marshmallow-code__marshmallow-1867.traj +594 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace__install-1/marshmallow-code__marshmallow-1867.traj +592 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace_from_source/marshmallow-code__marshmallow-1867.traj +3316 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/str_replace_anthropic_demo.yaml +432 -0
- package/rust/Cargo.toml +100 -0
- package/rust/README.md +49 -0
- package/rust/src/agent/action_sampler.rs +130 -0
- package/rust/src/agent/agents.rs +1029 -0
- package/rust/src/agent/history_processors.rs +277 -0
- package/rust/src/agent/hooks/mod.rs +208 -0
- package/rust/src/agent/mod.rs +24 -0
- package/rust/src/agent/models.rs +837 -0
- package/rust/src/agent/problem_statement.rs +355 -0
- package/rust/src/agent/reviewer.rs +505 -0
- package/rust/src/bin/sweagent.rs +784 -0
- package/rust/src/environment/deployment.rs +631 -0
- package/rust/src/environment/hooks/mod.rs +114 -0
- package/rust/src/environment/mod.rs +16 -0
- package/rust/src/environment/repo.rs +265 -0
- package/rust/src/environment/runtime.rs +237 -0
- package/rust/src/environment/swe_env.rs +248 -0
- package/rust/src/exceptions.rs +228 -0
- package/rust/src/lib.rs +68 -0
- package/rust/src/monitoring.rs +482 -0
- package/rust/src/run/hooks/mod.rs +134 -0
- package/rust/src/run/mod.rs +12 -0
- package/rust/src/run/run_batch.rs +563 -0
- package/rust/src/run/run_single.rs +196 -0
- package/rust/src/tools/bundle.rs +224 -0
- package/rust/src/tools/commands.rs +173 -0
- package/rust/src/tools/mod.rs +295 -0
- package/rust/src/tools/parsing.rs +354 -0
- package/rust/src/tools/registry.rs +143 -0
- package/rust/src/types.rs +554 -0
- package/rust/src/utils/config.rs +105 -0
- package/rust/src/utils/files.rs +137 -0
- package/rust/src/utils/github.rs +171 -0
- package/rust/src/utils/log.rs +65 -0
- package/rust/src/utils/mod.rs +17 -0
- package/rust/src/utils/serialization.rs +181 -0
- package/rust/src/utils/template.rs +173 -0
- package/typescript/README.md +335 -0
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Run on a batch of instances/issues, e.g., SWE-bench.
|
|
3
|
+
|
|
4
|
+
[cyan][bold]=== BASIC OPTIONS ===[/bold][/cyan]
|
|
5
|
+
|
|
6
|
+
-h --help Show help text and exit
|
|
7
|
+
--help_option Print specific help text and exit
|
|
8
|
+
|
|
9
|
+
[cyan][bold]=== EXAMPLES ===[/bold][/cyan]
|
|
10
|
+
|
|
11
|
+
Basic usage: Run over a [bold][cyan]SWE-bench lite[/bold][/cyan][green]:
|
|
12
|
+
|
|
13
|
+
sweagent run-batch \\
|
|
14
|
+
--instances.type swe_bench \\ # configure instances
|
|
15
|
+
--instances.subset lite \\
|
|
16
|
+
--instances.split dev \\
|
|
17
|
+
--instances.slice :50 \\ # first 50 instances
|
|
18
|
+
--instances.shuffle=True \\ # shuffle instances (with fixed seed)
|
|
19
|
+
--config config/default.yaml \\
|
|
20
|
+
--agent.model.name gpt-4o # configure model
|
|
21
|
+
[/green]
|
|
22
|
+
|
|
23
|
+
[cyan][bold]=== LOADING INSTANCES ===[/bold][/cyan]
|
|
24
|
+
|
|
25
|
+
[cyan][bold]From a file[/bold][/cyan] [green]--instances.type file --instances.path /path/to/file[/green].
|
|
26
|
+
[cyan][bold]From huggingface[/bold][/cyan] [green]--instances.type huggingface --instances.dataset_name=SWE_Bench_lite --instances.split=dev[/green].
|
|
27
|
+
|
|
28
|
+
All instance specifications support the [green]filter[/green], [green]slice[/green], and [green]shuffle[/green] options.
|
|
29
|
+
With [green]filter[/green], you can select specific instances, e.g., [green]--instances.filter='instance_id_1|instance_id_2'[/green].
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
import getpass
|
|
33
|
+
import json
|
|
34
|
+
import logging
|
|
35
|
+
import random
|
|
36
|
+
import sys
|
|
37
|
+
import time
|
|
38
|
+
import traceback
|
|
39
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
40
|
+
from contextlib import ExitStack
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from typing import Self
|
|
43
|
+
|
|
44
|
+
import yaml
|
|
45
|
+
from pydantic import Field, model_validator
|
|
46
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
47
|
+
from rich.live import Live
|
|
48
|
+
from swerex.deployment.hooks.status import SetStatusDeploymentHook
|
|
49
|
+
|
|
50
|
+
from sweagent import TRAJECTORY_DIR
|
|
51
|
+
from sweagent.agent.agents import AgentConfig, get_agent_from_config
|
|
52
|
+
from sweagent.agent.hooks.status import SetStatusAgentHook
|
|
53
|
+
from sweagent.environment.hooks.status import SetStatusEnvironmentHook
|
|
54
|
+
from sweagent.environment.swe_env import SWEEnv
|
|
55
|
+
from sweagent.exceptions import ModelConfigurationError, TotalCostLimitExceededError
|
|
56
|
+
from sweagent.run._progress import RunBatchProgressManager
|
|
57
|
+
from sweagent.run.batch_instances import BatchInstance, BatchInstanceSourceConfig, SWEBenchInstances
|
|
58
|
+
from sweagent.run.common import BasicCLI, ConfigHelper, save_predictions
|
|
59
|
+
from sweagent.run.hooks.abstract import CombinedRunHooks, RunHook
|
|
60
|
+
from sweagent.run.hooks.apply_patch import SaveApplyPatchHook
|
|
61
|
+
from sweagent.run.merge_predictions import merge_predictions
|
|
62
|
+
from sweagent.run.run_single import RunSingleConfig
|
|
63
|
+
from sweagent.types import AgentRunResult
|
|
64
|
+
from sweagent.utils.config import load_environment_variables
|
|
65
|
+
from sweagent.utils.log import (
|
|
66
|
+
add_file_handler,
|
|
67
|
+
add_logger_names_to_stream_handlers,
|
|
68
|
+
get_logger,
|
|
69
|
+
register_thread_name,
|
|
70
|
+
remove_file_handler,
|
|
71
|
+
set_stream_handler_levels,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class RunBatchConfig(BaseSettings, cli_implicit_flags=False):
    # Settings object describing one batch run: which instances to run, which
    # agent configuration to use, where to write output and how to schedule work.
    # (Kept as comments rather than a class docstring so the model's `__doc__`
    # is left untouched.)
    instances: BatchInstanceSourceConfig = Field(description="Instances to run.")
    agent: AgentConfig = Field(description="Agent options.")
    output_dir: Path = Field(default=Path("DEFAULT"), description="Output directory.")
    suffix: str = ""
    """Suffix to add to the output directory. Only used if `output_dir` is `DEFAULT`."""
    raise_exceptions: bool = False
    """Raise exceptions instead of skipping instances."""
    redo_existing: bool = False
    """Do not skip instances that already have a trajectory."""
    env_var_path: Path | None = None
    """Path to a .env file to load environment variables from."""
    num_workers: int = Field(default=1)
    """Number of parallel workers to use."""
    random_delay_multiplier: float = 0.3
    """We will wait for a random amount of time between 0 and `random_delay_multiplier`
    times the number of workers at the start of each instance. This is to avoid any
    potential race condition or issues with bottlenecks, e.g., when running on a platform
    with few CPUs that cannot handle the startup of all containers in time.
    """
    progress_bar: bool = True
    """Whether to show a progress bar. Progress bar is never shown for human models.
    Progress bar is always shown for multi-worker runs.
    """

    # pydantic config
    model_config = SettingsConfigDict(extra="forbid", env_prefix="SWE_AGENT_")

    def set_default_output_dir(self) -> None:
        """Replace the ``DEFAULT`` placeholder in ``output_dir`` with a derived path.

        The derived path is
        ``TRAJECTORY_DIR / <user> / "<config stem>__<model id>___<source id><suffix>"``.
        Nothing happens if ``output_dir`` was set to anything other than
        ``Path("DEFAULT")``.
        """
        # Needs to be called explicitly, because self._config_files will be setup
        # post-init.
        if self.output_dir != Path("DEFAULT"):
            return
        user_id = getpass.getuser()
        source_id = self.instances.id
        try:
            model_id = self.agent.model.id  # type: ignore[attr-defined]
        except AttributeError:
            # Agent configs without a `model.id` attribute still get a usable directory name.
            model_id = "unknown"
        # Fall back to the placeholder both when the attribute is missing and when
        # it is present but empty (indexing an empty list would raise IndexError).
        config_files = getattr(self, "_config_files", None) or ["no_config"]
        config_file = config_files[0]
        if config_file != "no_config":
            config_file = Path(config_file).stem
        suffix = f"__{self.suffix}" if self.suffix else ""
        self.output_dir = TRAJECTORY_DIR / user_id / f"{config_file}__{model_id}___{source_id}{suffix}"

    @model_validator(mode="after")
    def evaluate_and_redo_existing(self) -> Self:
        """Reject the combination of SWE-bench evaluation and ``redo_existing``.

        Only applies when ``instances`` is a ``SWEBenchInstances``; any other
        instance source is accepted unchanged.

        Raises:
            ValueError: If both ``instances.evaluate`` and ``redo_existing`` are set.
        """
        if not isinstance(self.instances, SWEBenchInstances):
            return self
        if self.instances.evaluate and self.redo_existing:
            # Adjacent literals are concatenated verbatim, so each fragment must
            # carry its own trailing space.
            msg = (
                "Cannot evaluate and redo existing at the same time. This would cause invalid results, because "
                "after the first merge_preds gives you a preds.json, this file would be submitted to SB-CLI, causing "
                "evaluation of old instances, which could then not be overwritten by the new ones."
            )
            raise ValueError(msg)
        return self
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class _BreakLoop(Exception):
    """Internal control-flow signal; caught by the instance loop to stop iterating."""
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class RunBatch:
|
|
138
|
+
def __init__(
    self,
    instances: list[BatchInstance],
    agent_config: AgentConfig,
    *,
    output_dir: Path = Path("."),
    hooks: list[RunHook] | None = None,
    raise_exceptions: bool = False,
    redo_existing: bool = False,
    num_workers: int = 1,
    progress_bar: bool = True,
    random_delay_multiplier: float = 0.3,
):
    """Note: When initializing this class, make sure to add the hooks that are required by your actions.
    See `from_config` for an example.

    Args:
        instances: The instances to run.
        agent_config: Agent configuration; its `model.id` (if present) is used to
            detect human models.
        output_dir: Directory that receives `run_batch.log` and
            `run_batch_exit_statuses.yaml`.
        hooks: If not specified (or empty), the default hooks will be used.
        raise_exceptions: Stored as `_raise_exceptions`.
        redo_existing: Stored as `_redo_existing`.
        num_workers: Number of parallel workers to use. Default is 1 (sequential execution).
            Capped at the number of instances.
        progress_bar: Whether to show a progress bar. Progress bar is never shown for human models.
            Progress bar is always shown for multi-worker runs.
        random_delay_multiplier: We will wait for a random amount of time between 0 and `random_delay_multiplier`
            times the number of workers at the start of each instance. This is to avoid any
            potential race conditions.

    Raises:
        ValueError: If a human model is combined with more than one worker.
    """
    # `_model_id` reads `self.agent_config` and silently falls back to "unknown"
    # on AttributeError, so the config has to be stored *before* the guard below;
    # otherwise the guard can never trigger.
    self.agent_config = agent_config
    if self._model_id in ["human", "human_thought"] and num_workers > 1:
        msg = "Cannot run with human model in parallel"
        raise ValueError(msg)

    self.logger = get_logger("swea-run", emoji="🏃")
    add_file_handler(
        output_dir / "run_batch.log",
        id_="progress",
        filter=lambda name: "swea-run" in name or "config" in name,
    )
    self.instances = instances
    self.output_dir = output_dir
    self._raise_exceptions = raise_exceptions
    self._chooks = CombinedRunHooks()
    self._redo_existing = redo_existing
    # Never spawn more workers than there are instances to process.
    self._num_workers = min(num_workers, len(instances))
    for hook in hooks or [SaveApplyPatchHook(show_success_message=False)]:
        self.add_hook(hook)
    self._progress_manager = RunBatchProgressManager(
        num_instances=len(instances), yaml_report_path=output_dir / "run_batch_exit_statuses.yaml"
    )
    self._show_progress_bar = progress_bar
    self._random_delay_multiplier = random_delay_multiplier
|
|
187
|
+
|
|
188
|
+
@property
def _model_id(self) -> str:
    """Return `agent_config.model.id`, or "unknown" if any attribute on that path is missing."""
    try:
        model = self.agent_config.model
        identifier = model.id  # type: ignore[attr-defined]
    except AttributeError:
        identifier = "unknown"
    return identifier
|
|
194
|
+
|
|
195
|
+
@classmethod
def from_config(cls, config: RunBatchConfig) -> Self:
    """Build a runner from a `RunBatchConfig`.

    Loads environment variables, resolves and creates the output directory,
    writes `run_batch.config.yaml` into it, loads the instances and constructs
    the runner. When the instances are `SWEBenchInstances` with `evaluate` set,
    a `SweBenchEvaluate` hook is added as well.

    Raises:
        ValueError: If the instance source yields no instances.
    """
    load_environment_variables(config.env_var_path)
    config.set_default_output_dir()
    out_dir = config.output_dir
    out_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): this dumps the JSON *string* through yaml.dump, so the file
    # holds one YAML string scalar rather than a mapping - confirm that is intended.
    serialized_config = yaml.dump(config.model_dump_json(), indent=2)
    (out_dir / "run_batch.config.yaml").write_text(serialized_config)

    log = get_logger("run", emoji="🏃")
    log.debug("Loading instances from %s", f"{config.instances!r}")
    loaded = config.instances.get_instance_configs()
    log.info("Loaded %d instances", len(loaded))
    if not loaded:
        msg = (
            "No instances to run. Here are a few things to check:\n"
            "- With huggingface data: Check that you have the right split (test or dev)\n"
            "- Check your filter does not exclude all instances (check the info log messages)"
        )
        raise ValueError(msg)
    log.debug("The first instance is %s", f"{loaded[0]!r}")

    runner = cls(
        instances=loaded,
        agent_config=config.agent,
        output_dir=out_dir,
        raise_exceptions=config.raise_exceptions,
        redo_existing=config.redo_existing,
        num_workers=config.num_workers,
        progress_bar=config.progress_bar,
        random_delay_multiplier=config.random_delay_multiplier,
    )

    wants_evaluation = isinstance(config.instances, SWEBenchInstances) and config.instances.evaluate
    if not wants_evaluation:
        return runner

    # Imported lazily: only needed when evaluation is requested.
    from sweagent.run.hooks.swe_bench_evaluate import SweBenchEvaluate

    evaluate_hook = SweBenchEvaluate(
        output_dir=out_dir,
        subset=config.instances.subset,
        split=config.instances.split,
        continuous_submission_every=30,
    )
    runner.add_hook(evaluate_hook)
    return runner
|
|
235
|
+
|
|
236
|
+
def add_hook(self, hook: RunHook) -> None:
    """Register ``hook`` with this run.

    The hook's ``on_init`` is called with this run first; afterwards the
    hook is added to the combined run hooks.
    """
    hook.on_init(run=self)
    combined_hooks = self._chooks
    combined_hooks.add_hook(hook)
|
|
239
|
+
|
|
240
|
+
def main(self) -> None:
    """Run all instances, merge their predictions and fire the start/end hooks."""
    self.logger.info("Starting run. Find output files at %s", self.output_dir)
    self._chooks.on_start()

    # One worker (or fewer) runs sequentially; otherwise use the multi-worker path.
    run_all = self.main_single_worker if self._num_workers <= 1 else self.main_multi_worker
    run_all()

    # Collect the per-instance output directories and merge them into preds.json.
    instance_dirs = [self.output_dir / inst.problem_statement.id for inst in self.instances]
    merge_predictions(instance_dirs, self.output_dir / "preds.json")

    self._chooks.on_end()
|
|
255
|
+
|
|
256
|
+
def main_single_worker(self) -> None:
    """Run the instances one after another in the calling thread.

    The live progress display is only entered when the progress bar is
    enabled and the model id is neither ``"human"`` nor ``"human_thought"``.
    A ``_BreakLoop`` raised by ``run_instance`` stops the loop.
    """
    with ExitStack() as resources:
        is_human_model = self._model_id in ["human", "human_thought"]
        if not is_human_model and self._show_progress_bar:
            resources.enter_context(Live(self._progress_manager.render_group))
        for current in self.instances:
            try:
                self.run_instance(current)
            except _BreakLoop:
                self.logger.info("Stopping loop over instances")
                break
|
|
267
|
+
|
|
268
|
+
def main_multi_worker(self) -> None:
    """Run the instances concurrently on a thread pool of ``_num_workers`` threads.

    On ``KeyboardInterrupt`` or ``_BreakLoop`` the not-yet-started futures are
    cancelled. The progress report is printed in every case.
    """
    add_logger_names_to_stream_handlers()
    # Quieten all stream handlers to WARNING; anything that should be more
    # verbose has to be raised explicitly afterwards.
    set_stream_handler_levels(logging.WARNING)
    self.logger.setLevel(logging.TRACE)  # type: ignore

    with Live(self._progress_manager.render_group), ThreadPoolExecutor(
        max_workers=self._num_workers
    ) as executor:
        pending = []
        for instance in self.instances:
            pending.append(executor.submit(self.run_instance, instance))
        try:
            for finished in as_completed(pending):
                # Re-raises whatever the worker raised.
                finished.result()
        except (KeyboardInterrupt, _BreakLoop):
            self.logger.info(
                "Received keyboard interrupt, waiting for running instances "
                "to finish, but cancelled everything else"
            )
            executor.shutdown(wait=False, cancel_futures=True)
        finally:
            self._progress_manager.print_report()
|
|
290
|
+
|
|
291
|
+
def run_instance(self, instance: BatchInstance) -> None:
    """Run a single instance with logging, progress tracking and error handling.

    Exceptions are either logged and swallowed, re-raised (when
    ``_raise_exceptions`` is set), or converted into ``_BreakLoop`` to stop
    the loop over all instances.
    """
    instance_id = instance.problem_statement.id
    progress = self._progress_manager

    self.logger.info("Running on instance %s", instance_id)
    register_thread_name(instance_id)
    self._add_instance_log_file_handlers(instance_id, multi_worker=self._num_workers > 1)

    # Let's add some randomness to avoid any potential race conditions or thundering herd
    if progress.n_completed < self._num_workers:
        time.sleep(random.random() * self._random_delay_multiplier * (self._num_workers - 1))

    progress.on_instance_start(instance_id)

    previous_exit_status = self.should_skip(instance)
    if previous_exit_status:
        progress.on_instance_end(instance_id, exit_status=f"skipped ({previous_exit_status})")
        self._remove_instance_log_file_handlers(instance_id)
        return

    # Either catch and silence the exception, or raise _BreakLoop to stop the
    # loop over the instances.
    try:
        result = self._run_instance(instance)
    except KeyboardInterrupt:
        raise _BreakLoop
    except (SystemExit, ModelConfigurationError, TotalCostLimitExceededError) as exc:
        if self._raise_exceptions:
            raise
        self.logger.critical(f"❌ Exiting because {exc.__class__.__name__} was called")
        raise _BreakLoop
    except Exception as exc:
        self.logger.error(traceback.format_exc())
        self.logger.error(f"❌ Failed on {instance_id}: {exc}")
        progress.on_uncaught_exception(instance_id, exc)
        if self._raise_exceptions:
            raise
    else:
        final_status = result.info.get("exit_status", "unknown_exit")
        progress.on_instance_end(instance_id, exit_status=final_status)
    finally:
        progress.update_exit_status_table()
        self._remove_instance_log_file_handlers(instance_id)
|
|
332
|
+
|
|
333
|
+
def _run_instance(self, instance: BatchInstance) -> AgentRunResult:
    """Set up agent and environment for ``instance``, run the agent and save the result.

    The environment is always closed, whether or not the run succeeded.
    Exceptions are logged to the agent logger and re-raised; the caller
    (``run_instance``) decides how to handle them.
    """
    instance_id = instance.problem_statement.id
    report_status = self._progress_manager.update_instance_status

    instance_dir = Path(self.output_dir) / instance_id
    instance_dir.mkdir(parents=True, exist_ok=True)

    self.agent_config.name = f"{instance_id}"
    agent = get_agent_from_config(self.agent_config)

    # Store everything needed to replay this single run next to its output.
    replay_config = RunSingleConfig(
        agent=self.agent_config,
        problem_statement=instance.problem_statement,
        env=instance.env,
    )
    replay_config_file = instance_dir / f"{instance_id}.config.yaml"
    replay_config_file.write_text(yaml.dump(replay_config.model_dump_json(), indent=2))
    agent.replay_config = replay_config  # type: ignore[attr-defined]
    agent.add_hook(SetStatusAgentHook(instance_id, report_status))

    report_status(instance_id, "Starting environment")
    instance.env.name = f"{instance_id}"
    env = SWEEnv.from_config(instance.env)
    env.add_hook(SetStatusEnvironmentHook(instance_id, report_status))
    env.deployment.add_hook(SetStatusDeploymentHook(instance_id, report_status))

    try:
        env.start()
        self._chooks.on_instance_start(index=0, env=env, problem_statement=instance.problem_statement)
        result = agent.run(
            problem_statement=instance.problem_statement,
            env=env,
            output_dir=instance_dir,
        )
    except Exception:
        # The actual handling happens in `run_instance`; here we only make
        # sure the traceback also reaches the agent specific logger.
        agent.logger.error(traceback.format_exc())  # type: ignore[attr-defined]
        raise
    finally:
        env.close()

    save_predictions(self.output_dir, instance_id, result)
    self._chooks.on_instance_completed(result=result)
    return result
|
|
375
|
+
|
|
376
|
+
def should_skip(self, instance: BatchInstance) -> bool | str:
    """Decide whether ``instance`` already has a usable trajectory.

    Returns ``False`` when the instance has to be (re)run, otherwise the exit
    status stored in the existing trajectory. Empty, incomplete
    (``early_exit`` or missing exit status) and unreadable trajectory files
    are deleted so that the instance is run again.
    """
    if self._redo_existing:
        return False

    # Look for a trajectory left behind by a previous run.
    instance_id = instance.problem_statement.id
    log_path = self.output_dir / instance_id / (instance_id + ".traj")
    if not log_path.exists():
        return False

    raw_text = log_path.read_text()
    if not raw_text.strip():
        self.logger.warning("Found empty trajectory: %s. Removing.", log_path)
        log_path.unlink()
        return False

    try:
        trajectory = json.loads(raw_text)
        # A trajectory without an exit status is incomplete and gets redone.
        exit_status = trajectory["info"].get("exit_status", None)
        if exit_status is None or exit_status == "early_exit":
            self.logger.warning(f"Found existing trajectory with no exit status: {log_path}. Removing.")
            log_path.unlink()
            return False
    except Exception as e:
        # If the trajectory cannot be inspected, redo the instance.
        self.logger.error(f"Failed to check existing trajectory: {log_path}: {e}. Removing.")
        log_path.unlink()
        return False

    # A complete trajectory exists: skip the instance.
    self.logger.info(f"⏭️ Skipping existing trajectory: {log_path}")
    return exit_status
|
|
410
|
+
|
|
411
|
+
def _add_instance_log_file_handlers(self, instance_id: str, multi_worker: bool = False) -> None:
    """Add one log file handler per level (trace, debug, info) for ``instance_id``.

    Each handler writes to ``<output_dir>/<instance_id>/<instance_id>.<level>.log``
    and is registered under the id ``<instance_id>-<level>``, which is the id
    ``_remove_instance_log_file_handlers`` removes again.

    Args:
        instance_id: Id of the instance the log files belong to.
        multi_worker: When true, ``instance_id`` is passed as the handler
            filter; otherwise an empty filter string is passed.
    """
    # Loop-invariant, and named so that it does not shadow the builtin `filter`.
    name_filter = instance_id if multi_worker else ""
    instance_dir = self.output_dir / instance_id
    for level in ("trace", "debug", "info"):
        # Build the file name directly instead of running the instance id
        # through str.format, which breaks on ids containing `{` or `}`.
        add_file_handler(
            instance_dir / f"{instance_id}.{level}.log",
            filter=name_filter,
            level=level,
            id_=f"{instance_id}-{level}",
        )
|
|
421
|
+
|
|
422
|
+
def _remove_instance_log_file_handlers(self, instance_id: str) -> None:
    """Remove the trace, debug and info log file handlers registered for ``instance_id``."""
    handler_ids = [f"{instance_id}-{level}" for level in ("trace", "debug", "info")]
    for handler_id in handler_ids:
        remove_file_handler(handler_id)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def run_from_config(config: RunBatchConfig):
    """Create a ``RunBatch`` from ``config`` and execute it."""
    runner = RunBatch.from_config(config)
    runner.main()
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def run_from_cli(args: list[str] | None = None):
    """Parse command line arguments into a ``RunBatchConfig`` and run the batch.

    Args:
        args: Command line arguments without the program name. Defaults to
            ``sys.argv[1:]`` when ``None``.
    """
    if args is None:
        args = sys.argv[1:]
    # The module docstring is None when docstrings are stripped (python -OO),
    # and an `assert` guard is stripped by -O as well, so fall back to an
    # empty header instead of failing on `None + str`.
    header = __doc__ or ""
    help_text = (  # type: ignore
        header + "\n[cyan][bold]=== ALL THE OPTIONS ===[/bold][/cyan]\n\n" + ConfigHelper().get_help(RunBatchConfig)
    )
    run_from_config(BasicCLI(RunBatchConfig, help_text=help_text).get_config(args))  # type: ignore
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
if __name__ == "__main__":
    # Executed as a script: hand over to the command line entry point.
    run_from_cli()
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""[cyan][bold]Replay a trajectory file.[/bold][/cyan]
|
|
2
|
+
|
|
3
|
+
[cyan][bold]=== DESCRIPTION ===[/bold][/cyan]
|
|
4
|
+
|
|
5
|
+
We will take all actions in the trajectory and execute them in an environment.
|
|
6
|
+
|
|
7
|
+
This has two main use cases:
|
|
8
|
+
|
|
9
|
+
1. Create a demo from a yaml file containing actions (can also be created from a trajectory file with [green]sweagent run traj-to-demo[/green]).
|
|
10
|
+
[green]run-replay[/green] will execute the actions to get the environment output and produce a full trajectory to be used as a demo.
|
|
11
|
+
2. Debugging and testing of tools and environment behavior.
|
|
12
|
+
|
|
13
|
+
[cyan][bold]=== EXAMPLES ===[/bold][/cyan]
|
|
14
|
+
|
|
15
|
+
Replay a trajectory file:
|
|
16
|
+
|
|
17
|
+
[green]sweagent run replay --traj_path mytraj.traj[/green]
|
|
18
|
+
|
|
19
|
+
Replay a demo file:
|
|
20
|
+
|
|
21
|
+
[green]sweagent run replay --traj_path mydemo.demo.yaml[/green]
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import json
|
|
25
|
+
import sys
|
|
26
|
+
import tempfile
|
|
27
|
+
from getpass import getuser
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Any
|
|
30
|
+
|
|
31
|
+
import yaml
|
|
32
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
33
|
+
from swerex.deployment.abstract import AbstractDeployment
|
|
34
|
+
from swerex.deployment.config import DeploymentConfig, get_deployment
|
|
35
|
+
from typing_extensions import Self
|
|
36
|
+
|
|
37
|
+
from sweagent.agent.agents import DefaultAgent
|
|
38
|
+
from sweagent.agent.models import ReplayModelConfig
|
|
39
|
+
from sweagent.environment.swe_env import SWEEnv
|
|
40
|
+
from sweagent.run.common import BasicCLI, ConfigHelper
|
|
41
|
+
from sweagent.run.run_single import RunSingle, RunSingleConfig
|
|
42
|
+
from sweagent.utils.config import load_environment_variables
|
|
43
|
+
from sweagent.utils.log import get_logger
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class RunReplayConfig(BaseSettings, cli_implicit_flags=False):
    # Settings for replaying a trajectory file; values can also come from
    # environment variables prefixed with SWE_AGENT_ (see model_config).
    traj_path: Path
    deployment: DeploymentConfig | None = None
    """Override the deployment in the trajectory."""
    output_dir: Path = Path("DEFAULT")
    env_var_path: Path | None = None
    """Path to a .env file to load environment variables from."""
    update_config: list[Path] = []
    """Additional config files to merge with the replay config."""

    # pydantic config
    model_config = SettingsConfigDict(extra="forbid", env_prefix="SWE_AGENT_")

    def model_post_init(self, __context: Any) -> None:
        # Replace the "DEFAULT" placeholder by a per-user directory under the
        # current working directory, then make sure the directory exists.
        uses_placeholder = self.output_dir == Path("DEFAULT")
        if uses_placeholder:
            owner = getuser()
            replay_name = f"replay___{self.traj_path.stem}"
            self.output_dir = Path.cwd() / "trajectories" / owner / replay_name
        self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class RunReplay:
    """Replay the assistant actions stored in a trajectory (or demo) file.

    The actions are extracted from the trajectory history, written to a
    temporary JSON file and fed to the agent through ``ReplayModelConfig``.
    """

    def __init__(
        self,
        *,
        traj_path: Path,
        deployment: AbstractDeployment | None,
        output_dir: Path,
        update_config: list[Path] | None = None,
        _catch_errors: bool = False,
        _require_zero_exit_code: bool = False,
    ):
        """Load the trajectory and derive the replay configuration from it.

        Args:
            traj_path: Trajectory file; parsed as YAML when the suffix is
                ``.yaml`` and as JSON otherwise.
            deployment: Deployment to use. When ``None`` the deployment is
                created from the configuration stored in the trajectory.
            output_dir: Directory handed to the single run as output directory.
            update_config: YAML files merged (in order) on top of the replay
                configuration found in the trajectory.
            _catch_errors: Copied to the agent's ``_catch_errors``.
            _require_zero_exit_code: Copied to the agent's
                ``_always_require_zero_exit_code``.

        Raises:
            ValueError: If the trajectory contains no replay configuration.
        """
        self.traj_path = traj_path
        self.output_dir = output_dir
        self._replay_action_trajs_path = self._reserve_actions_path()
        self.logger = get_logger("swea-run", emoji="🏃")
        self._catch_errors = _catch_errors
        self._require_zero_exit_code = _require_zero_exit_code
        self._update_config = update_config if update_config is not None else []

        if traj_path.suffix == ".yaml":
            self._traj_data = yaml.safe_load(traj_path.read_text())
        else:
            self._traj_data = json.loads(traj_path.read_text())
        self.config = self._get_config_from_agent(self._traj_data)

        if deployment is None:
            self.deployment = get_deployment(self.config.env.deployment)
        else:
            self.deployment = deployment

    @staticmethod
    def _reserve_actions_path() -> Path:
        """Create an empty temporary ``.json`` file and return its path.

        The file is kept on disk (``delete=False``) so that the path stays
        reserved until the actions are written. Taking only the name of a
        discarded ``NamedTemporaryFile`` would delete the file right away and
        leave a path that anything else could claim in the meantime.
        """
        with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as handle:
            return Path(handle.name)

    def _get_config_from_agent(self, traj_data):
        """Build the ``RunSingleConfig`` used for the replay from ``traj_data``.

        A ``replay_config`` stored as a JSON string is decoded and written
        back into ``traj_data``. The files in ``self._update_config`` are then
        merged on top (top-level keys replace the existing ones), and finally
        the model is replaced by a replay model reading the actions file.

        Raises:
            ValueError: If ``traj_data`` has no ``replay_config`` entry.
        """
        # Only the lookup itself signals a missing replay config; keep the
        # try minimal so that unrelated KeyErrors are not misreported.
        try:
            raw_config = traj_data["replay_config"]
        except KeyError as err:
            msg = "Replay config not found in trajectory. Are you running on an old trajectory?"
            raise ValueError(msg) from err
        if isinstance(raw_config, str):
            raw_config = json.loads(raw_config)
            traj_data["replay_config"] = raw_config
        config = RunSingleConfig.model_validate(raw_config)

        # Merge any additional config files
        for config_path in self._update_config:
            # An empty YAML document loads as None; treat it as "no overrides".
            update_data = yaml.safe_load(config_path.read_text()) or {}
            # Remember the current model config before merging
            current_model = config.agent.model
            merged_dict = config.model_dump(mode="json") | update_data

            # Ensure agent.model is preserved if not explicitly updated
            if "agent" in merged_dict and "model" not in merged_dict["agent"]:
                merged_dict["agent"]["model"] = current_model.model_dump(mode="json")

            config = RunSingleConfig.model_validate(merged_dict)

        config.agent.model = ReplayModelConfig(replay_path=self._replay_action_trajs_path)
        return config

    @property
    def instance_id(self) -> str:
        """Stem of the trajectory file name, used as key in the actions file."""
        return Path(self.traj_path).stem

    @classmethod
    def from_config(cls, config: RunReplayConfig, **kwargs) -> Self:
        """Create a ``RunReplay`` from ``config``; ``kwargs`` are forwarded to ``__init__``."""
        load_environment_variables(config.env_var_path)
        return cls(
            traj_path=config.traj_path,
            deployment=get_deployment(config.deployment) if config.deployment else None,
            output_dir=config.output_dir,
            update_config=config.update_config,
            **kwargs,
        )

    def _create_actions_file(self) -> None:
        """Write the assistant actions of the trajectory to the actions file.

        The file contains a JSON object mapping ``instance_id`` to the list of
        actions. Each action carries the assistant message and, when the
        configuration uses function calling, the recorded tool calls.

        Raises:
            ValueError: If the trajectory contains tool calls but the
                configuration does not use function calling, or if the
                trajectory contains no assistant actions.
        """
        assistant_items = [
            (ix, item) for ix, item in enumerate(self._traj_data["history"]) if item["role"] == "assistant"
        ]

        # Verify config compatibility with tool calls
        has_tool_calls = any(item.get("tool_calls") is not None for _, item in assistant_items)
        parse_function = self.config.agent.tools.parse_function.type
        use_function_calling = parse_function == "function_calling"

        if has_tool_calls and not use_function_calling:
            msg = (
                "Trajectory contains tool calls but config is not set up for function calling. "
                "Check that the config you want to use has agent.tools.parse_function.type set to 'function_calling'."
            )
            raise ValueError(msg)

        actions = []
        for ix, item in assistant_items:
            action = {"message": item["content"]}
            if use_function_calling:
                assert "tool_calls" in item and item["tool_calls"] is not None, (
                    f"Config is set to use `function_calling` but trajectory item {ix} is missing a tool call "
                    f"or has tool_calls set to None"
                )
                action["tool_calls"] = item["tool_calls"]
            actions.append(action)
        if not actions:
            msg = "No actions found in trajectory"
            raise ValueError(msg)
        self._replay_action_trajs_path.write_text(json.dumps({self.instance_id: actions}))

    def _get_env(self) -> SWEEnv:
        """Create the environment from the deployment and the configured repo."""
        return SWEEnv(
            deployment=self.deployment,
            repo=self.config.env.repo,
            post_startup_commands=[],
        )

    def _get_agent(self) -> DefaultAgent:
        """Create the agent from the replay configuration and apply the error-handling flags."""
        agent = DefaultAgent.from_config(self.config.agent)
        agent._catch_errors = self._catch_errors
        agent._always_require_zero_exit_code = self._require_zero_exit_code
        return agent

    def _get_run_single(self) -> RunSingle:
        """Assemble the ``RunSingle`` that performs the replay."""
        return RunSingle(
            self._get_env(),
            self._get_agent(),
            problem_statement=self.config.problem_statement,
            output_dir=Path(self.output_dir),
        )

    def main(self):
        """Write the actions file, then run the replay as a single run."""
        self._create_actions_file()
        run_single = self._get_run_single()
        run_single.agent.replay_config = RunSingleConfig(
            agent=self.config.agent,
            problem_statement=run_single.problem_statement,
            env=self.config.env,
        )
        run_single.run()
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def run_from_config(config: RunReplayConfig):
    """Create a ``RunReplay`` from ``config`` and execute it."""
    replay = RunReplay.from_config(config)
    replay.main()
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def run_from_cli(args: list[str] | None = None):
    """Parse command line arguments into a ``RunReplayConfig`` and run the replay.

    Args:
        args: Command line arguments without the program name. Defaults to
            ``sys.argv[1:]`` when ``None``.
    """
    if args is None:
        args = sys.argv[1:]
    # The module docstring is None when docstrings are stripped (python -OO);
    # fall back to an empty header instead of failing on `None + str`.
    header = __doc__ or ""
    help_text = (  # type: ignore
        header + "\n[cyan][bold]=== ALL THE OPTIONS ===[/bold][/cyan]\n\n" + ConfigHelper().get_help(RunReplayConfig)
    )
    run_from_config(BasicCLI(RunReplayConfig, help_text=help_text, default_settings=False).get_config(args))  # type: ignore
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
if __name__ == "__main__":
    # Executed as a script: hand over to the command line entry point.
    run_from_cli()
|