@elizaos/sweagent-root 2.0.0-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +270 -0
- package/package.json +71 -0
- package/python/LICENSE +21 -0
- package/python/config/README.md +15 -0
- package/python/config/bash_only.yaml +222 -0
- package/python/config/benchmarks/250212_sweagent_heavy_sbl.yaml +188 -0
- package/python/config/benchmarks/250225_anthropic_filemap_simple_review.yaml +75 -0
- package/python/config/benchmarks/250522_anthropic_filemap_simple_review.yaml +92 -0
- package/python/config/benchmarks/250526_anthropic_filemap_simple_review_sbl.yaml +93 -0
- package/python/config/benchmarks/anthropic_filemap_multilingual.yaml +66 -0
- package/python/config/coding_challenge.yaml +104 -0
- package/python/config/default.yaml +69 -0
- package/python/config/default_backticks.yaml +69 -0
- package/python/config/default_mm_no_images.yaml +82 -0
- package/python/config/default_mm_with_images.yaml +83 -0
- package/python/config/demo/default.yaml +80 -0
- package/python/config/demo/no_instructions.yaml +69 -0
- package/python/config/demo/only_bash.yaml +60 -0
- package/python/config/exotic/default_shell.yaml +52 -0
- package/python/config/exotic/windowed_replace.yaml +125 -0
- package/python/config/exotic/windowed_replace_late_repro.yaml +127 -0
- package/python/config/human/human.yaml +24 -0
- package/python/config/human/human_demo.yaml +52 -0
- package/python/config/sweagent_0_7/07.yaml +101 -0
- package/python/config/sweagent_0_7/07_fcalling.yaml +100 -0
- package/python/config/sweagent_0_7/07_from_url.yaml +114 -0
- package/python/config/sweagent_0_7/07_thought_action.yaml +102 -0
- package/python/config/sweagent_0_7/07_thought_action_xml.yaml +96 -0
- package/python/mlc_config.json +44 -0
- package/python/pyproject.toml +262 -0
- package/python/sweagent/__init__.py +114 -0
- package/python/sweagent/__main__.py +4 -0
- package/python/sweagent/agent/__init__.py +0 -0
- package/python/sweagent/agent/action_sampler.py +317 -0
- package/python/sweagent/agent/agents.py +1294 -0
- package/python/sweagent/agent/extra/shell_agent.py +106 -0
- package/python/sweagent/agent/history_processors.py +399 -0
- package/python/sweagent/agent/hooks/__init__.py +0 -0
- package/python/sweagent/agent/hooks/abstract.py +139 -0
- package/python/sweagent/agent/hooks/status.py +34 -0
- package/python/sweagent/agent/models.py +896 -0
- package/python/sweagent/agent/problem_statement.py +312 -0
- package/python/sweagent/agent/reviewer.py +664 -0
- package/python/sweagent/environment/__init__.py +0 -0
- package/python/sweagent/environment/hooks/__init__.py +0 -0
- package/python/sweagent/environment/hooks/abstract.py +60 -0
- package/python/sweagent/environment/hooks/status.py +28 -0
- package/python/sweagent/environment/repo.py +219 -0
- package/python/sweagent/environment/swe_env.py +276 -0
- package/python/sweagent/exceptions.py +54 -0
- package/python/sweagent/inspector/README.md +6 -0
- package/python/sweagent/inspector/__init__.py +0 -0
- package/python/sweagent/inspector/favicon.ico +0 -0
- package/python/sweagent/inspector/fileViewer.js +354 -0
- package/python/sweagent/inspector/icons/computer.png +0 -0
- package/python/sweagent/inspector/icons/edit_icon.svg +11 -0
- package/python/sweagent/inspector/icons/swe-agent-logo-50.png +0 -0
- package/python/sweagent/inspector/icons/swellama_blue.png +0 -0
- package/python/sweagent/inspector/icons/swellama_brown.png +0 -0
- package/python/sweagent/inspector/icons/swellama_grey.png +0 -0
- package/python/sweagent/inspector/icons/swellama_tan.png +0 -0
- package/python/sweagent/inspector/index.html +25 -0
- package/python/sweagent/inspector/server.py +354 -0
- package/python/sweagent/inspector/static.py +169 -0
- package/python/sweagent/inspector/style.css +454 -0
- package/python/sweagent/run/__init__.py +0 -0
- package/python/sweagent/run/_progress.py +158 -0
- package/python/sweagent/run/batch_instances.py +419 -0
- package/python/sweagent/run/common.py +387 -0
- package/python/sweagent/run/compare_runs.py +123 -0
- package/python/sweagent/run/extract_pred.py +19 -0
- package/python/sweagent/run/hooks/__init__.py +0 -0
- package/python/sweagent/run/hooks/abstract.py +67 -0
- package/python/sweagent/run/hooks/apply_patch.py +106 -0
- package/python/sweagent/run/hooks/open_pr.py +244 -0
- package/python/sweagent/run/hooks/swe_bench_evaluate.py +113 -0
- package/python/sweagent/run/inspector_cli.py +493 -0
- package/python/sweagent/run/merge_predictions.py +64 -0
- package/python/sweagent/run/quick_stats.py +96 -0
- package/python/sweagent/run/remove_unfinished.py +63 -0
- package/python/sweagent/run/rich_test.py +91 -0
- package/python/sweagent/run/run.py +147 -0
- package/python/sweagent/run/run_batch.py +442 -0
- package/python/sweagent/run/run_replay.py +219 -0
- package/python/sweagent/run/run_shell.py +155 -0
- package/python/sweagent/run/run_single.py +225 -0
- package/python/sweagent/run/run_traj_to_demo.py +85 -0
- package/python/sweagent/tools/__init__.py +0 -0
- package/python/sweagent/tools/bundle.py +57 -0
- package/python/sweagent/tools/commands.py +220 -0
- package/python/sweagent/tools/parsing.py +619 -0
- package/python/sweagent/tools/tools.py +430 -0
- package/python/sweagent/tools/utils.py +108 -0
- package/python/sweagent/types.py +102 -0
- package/python/sweagent/utils/__init__.py +0 -0
- package/python/sweagent/utils/config.py +80 -0
- package/python/sweagent/utils/files.py +27 -0
- package/python/sweagent/utils/github.py +118 -0
- package/python/sweagent/utils/jinja_warnings.py +14 -0
- package/python/sweagent/utils/log.py +175 -0
- package/python/sweagent/utils/patch_formatter.py +152 -0
- package/python/sweagent/utils/serialization.py +45 -0
- package/python/tests/__init__.py +0 -0
- package/python/tests/conftest.py +191 -0
- package/python/tests/test_agent.py +258 -0
- package/python/tests/test_batch_instance.py +43 -0
- package/python/tests/test_commands/_interactive_dummy.py +35 -0
- package/python/tests/test_commands/interactive_dummy_wrapper.sh +29 -0
- package/python/tests/test_data/config_files/dummy_interactive.yaml +62 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/README.md +13 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/customrandom.c +50 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/release +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/server +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/solver.py +12 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/README.md +16 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/challenge.json +9 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/flash_c8429a430278283c0e571baebca3d139.zip +0 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/README.md +15 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/challenge.json +10 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/networking.pcap +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/Dockerfile +28 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/challenge.json +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/flag.txt +1 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.c +26 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.py +9 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/challenge.json +8 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock +0 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock.cpp +167 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/solution.cpp +24 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/solution.py +6 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/test.sh +10 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/000-default.conf +18 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/file.pl +38 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/forms.pl +40 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/hello.pl +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/flag +1 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/index.html +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/solution.txt +1 -0
- package/python/tests/test_data/data_sources/debug_20240322.json +1 -0
- package/python/tests/test_data/data_sources/expert_instances.yaml +16 -0
- package/python/tests/test_data/data_sources/human_eval.json +1 -0
- package/python/tests/test_data/data_sources/simple_instances.yaml +3 -0
- package/python/tests/test_data/data_sources/simple_instances_long.yaml +30 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy_first_only.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-lite-test.json +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/6e44b9__sweagenttestrepo-1c2844.traj +342 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/solution_missing_colon.py +15 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/args.yaml +518 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/swe-agent__test-repo-i1.traj +124 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/all_preds.jsonl +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/args.yaml +520 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/patches/pydicom__pydicom-1458.patch +18 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/pydicom__pydicom-1458.traj +257 -0
- package/python/tests/test_env.py +66 -0
- package/python/tests/test_env_utils.py +129 -0
- package/python/tests/test_history_processors.py +40 -0
- package/python/tests/test_models.py +23 -0
- package/python/tests/test_openai_live.py +164 -0
- package/python/tests/test_packaging.py +7 -0
- package/python/tests/test_parsing.py +131 -0
- package/python/tests/test_problem_statement_multimodal.py +111 -0
- package/python/tests/test_quick_stats.py +42 -0
- package/python/tests/test_run.py +37 -0
- package/python/tests/test_run_batch.py +110 -0
- package/python/tests/test_run_hooks.py +114 -0
- package/python/tests/test_run_replay.py +33 -0
- package/python/tests/test_run_single.py +125 -0
- package/python/tests/test_tools_command_parsing.py +193 -0
- package/python/tests/test_utils.py +15 -0
- package/python/tests/tools/__init__.py +0 -0
- package/python/tests/tools/conftest.py +12 -0
- package/python/tests/tools/test_default_utils.py +153 -0
- package/python/tests/tools/test_edit_replace.py +0 -0
- package/python/tests/tools/test_split_string.py +82 -0
- package/python/tests/utils.py +29 -0
- package/python/tools/diff_state/bin/_state_diff_state +52 -0
- package/python/tools/diff_state/config.yaml +2 -0
- package/python/tools/edit_anthropic/bin/_state_anthropic +21 -0
- package/python/tools/edit_anthropic/bin/str_replace_editor +710 -0
- package/python/tools/edit_anthropic/config.yaml +56 -0
- package/python/tools/edit_anthropic/install.sh +3 -0
- package/python/tools/filemap/bin/filemap +45 -0
- package/python/tools/filemap/config.yaml +9 -0
- package/python/tools/filemap/install.sh +2 -0
- package/python/tools/forfeit/bin/exit_forfeit +5 -0
- package/python/tools/forfeit/config.yaml +5 -0
- package/python/tools/image_tools/bin/view_image +36 -0
- package/python/tools/image_tools/config.yaml +9 -0
- package/python/tools/multilingual_setup/bin/do_nothing +2 -0
- package/python/tools/multilingual_setup/config.yaml +1 -0
- package/python/tools/multilingual_setup/install.sh +45 -0
- package/python/tools/registry/bin/_read_env +10 -0
- package/python/tools/registry/bin/_write_env +10 -0
- package/python/tools/registry/config.yaml +1 -0
- package/python/tools/registry/install.sh +6 -0
- package/python/tools/registry/lib/__init__.py +0 -0
- package/python/tools/registry/lib/registry.py +56 -0
- package/python/tools/review_on_submit_m/README.md +6 -0
- package/python/tools/review_on_submit_m/bin/submit +54 -0
- package/python/tools/review_on_submit_m/config.yaml +6 -0
- package/python/tools/review_on_submit_m/install.sh +0 -0
- package/python/tools/search/bin/find_file +31 -0
- package/python/tools/search/bin/search_dir +39 -0
- package/python/tools/search/bin/search_file +55 -0
- package/python/tools/search/config.yaml +37 -0
- package/python/tools/search/install.sh +3 -0
- package/python/tools/submit/bin/submit +17 -0
- package/python/tools/submit/config.yaml +5 -0
- package/python/tools/web_browser/bin/click_mouse +41 -0
- package/python/tools/web_browser/bin/close_site +28 -0
- package/python/tools/web_browser/bin/double_click_mouse +37 -0
- package/python/tools/web_browser/bin/drag_mouse +46 -0
- package/python/tools/web_browser/bin/execute_script_on_page +39 -0
- package/python/tools/web_browser/bin/get_console_output +48 -0
- package/python/tools/web_browser/bin/move_mouse +35 -0
- package/python/tools/web_browser/bin/navigate_back +33 -0
- package/python/tools/web_browser/bin/navigate_forward +33 -0
- package/python/tools/web_browser/bin/open_site +36 -0
- package/python/tools/web_browser/bin/press_keys_on_page +51 -0
- package/python/tools/web_browser/bin/reload_page +33 -0
- package/python/tools/web_browser/bin/run_web_browser_server +394 -0
- package/python/tools/web_browser/bin/screenshot_site +38 -0
- package/python/tools/web_browser/bin/scroll_on_page +40 -0
- package/python/tools/web_browser/bin/set_browser_window_size +40 -0
- package/python/tools/web_browser/bin/type_text +34 -0
- package/python/tools/web_browser/bin/wait_time +39 -0
- package/python/tools/web_browser/config.yaml +155 -0
- package/python/tools/web_browser/install.sh +22 -0
- package/python/tools/web_browser/lib/browser_manager.py +404 -0
- package/python/tools/web_browser/lib/web_browser_config.py +33 -0
- package/python/tools/web_browser/lib/web_browser_utils.py +126 -0
- package/python/tools/web_browser/test_console.html +1 -0
- package/python/tools/windowed/bin/_state +25 -0
- package/python/tools/windowed/bin/create +29 -0
- package/python/tools/windowed/bin/goto +37 -0
- package/python/tools/windowed/bin/open +49 -0
- package/python/tools/windowed/bin/scroll_down +12 -0
- package/python/tools/windowed/bin/scroll_up +13 -0
- package/python/tools/windowed/config.yaml +38 -0
- package/python/tools/windowed/install.sh +15 -0
- package/python/tools/windowed/lib/__init__.py +0 -0
- package/python/tools/windowed/lib/flake8_utils.py +147 -0
- package/python/tools/windowed/lib/windowed_file.py +312 -0
- package/python/tools/windowed_edit_linting/bin/edit +128 -0
- package/python/tools/windowed_edit_linting/config.yaml +31 -0
- package/python/tools/windowed_edit_linting/install.sh +5 -0
- package/python/tools/windowed_edit_replace/bin/edit +172 -0
- package/python/tools/windowed_edit_replace/bin/insert +77 -0
- package/python/tools/windowed_edit_replace/config.yaml +60 -0
- package/python/tools/windowed_edit_replace/install.sh +5 -0
- package/python/tools/windowed_edit_rewrite/bin/edit +78 -0
- package/python/tools/windowed_edit_rewrite/config.yaml +11 -0
- package/python/tools/windowed_edit_rewrite/install.sh +5 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyEncryption.traj +318 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyTimeCapsule.traj +197 -0
- package/python/trajectories/demonstrations/ctf/crypto/eps.traj +289 -0
- package/python/trajectories/demonstrations/ctf/crypto/katy.traj +368 -0
- package/python/trajectories/demonstrations/ctf/forensics/flash.traj +102 -0
- package/python/trajectories/demonstrations/ctf/misc/networking_1.traj +102 -0
- package/python/trajectories/demonstrations/ctf/pwn/warmup.traj +159 -0
- package/python/trajectories/demonstrations/ctf/rev/rock.traj +251 -0
- package/python/trajectories/demonstrations/ctf/web/i_got_id_demo.traj +422 -0
- package/python/trajectories/demonstrations/function_calling_simple.traj +151 -0
- package/python/trajectories/demonstrations/human_thought__swe-bench-HumanEvalFix-python__lcb__t-0.00__p-0.95__c-4.00__install-0/humanevalfix-python-0.traj +129 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default__t-0.20__p-0.95__c-2.00__install-1___install_from_source/marshmallow-code__marshmallow-1867.traj +318 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling__install-1/marshmallow-code__marshmallow-1867.traj +594 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace__install-1/marshmallow-code__marshmallow-1867.traj +592 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace_from_source/marshmallow-code__marshmallow-1867.traj +3316 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/str_replace_anthropic_demo.yaml +432 -0
- package/rust/Cargo.toml +100 -0
- package/rust/README.md +49 -0
- package/rust/src/agent/action_sampler.rs +130 -0
- package/rust/src/agent/agents.rs +1029 -0
- package/rust/src/agent/history_processors.rs +277 -0
- package/rust/src/agent/hooks/mod.rs +208 -0
- package/rust/src/agent/mod.rs +24 -0
- package/rust/src/agent/models.rs +837 -0
- package/rust/src/agent/problem_statement.rs +355 -0
- package/rust/src/agent/reviewer.rs +505 -0
- package/rust/src/bin/sweagent.rs +784 -0
- package/rust/src/environment/deployment.rs +631 -0
- package/rust/src/environment/hooks/mod.rs +114 -0
- package/rust/src/environment/mod.rs +16 -0
- package/rust/src/environment/repo.rs +265 -0
- package/rust/src/environment/runtime.rs +237 -0
- package/rust/src/environment/swe_env.rs +248 -0
- package/rust/src/exceptions.rs +228 -0
- package/rust/src/lib.rs +68 -0
- package/rust/src/monitoring.rs +482 -0
- package/rust/src/run/hooks/mod.rs +134 -0
- package/rust/src/run/mod.rs +12 -0
- package/rust/src/run/run_batch.rs +563 -0
- package/rust/src/run/run_single.rs +196 -0
- package/rust/src/tools/bundle.rs +224 -0
- package/rust/src/tools/commands.rs +173 -0
- package/rust/src/tools/mod.rs +295 -0
- package/rust/src/tools/parsing.rs +354 -0
- package/rust/src/tools/registry.rs +143 -0
- package/rust/src/types.rs +554 -0
- package/rust/src/utils/config.rs +105 -0
- package/rust/src/utils/files.rs +137 -0
- package/rust/src/utils/github.rs +171 -0
- package/rust/src/utils/log.rs +65 -0
- package/rust/src/utils/mod.rs +17 -0
- package/rust/src/utils/serialization.rs +181 -0
- package/rust/src/utils/template.rs +173 -0
- package/typescript/README.md +335 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""Common functionality for the run scripts."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from argparse import ArgumentParser
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from types import UnionType
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
from pydantic import ValidationError
|
|
14
|
+
from pydantic_settings import BaseSettings, CliApp, SettingsError
|
|
15
|
+
from rich import print as rich_print
|
|
16
|
+
from rich.panel import Panel
|
|
17
|
+
|
|
18
|
+
from sweagent import CONFIG_DIR
|
|
19
|
+
from sweagent.types import AgentInfo, AgentRunResult
|
|
20
|
+
from sweagent.utils.log import get_logger
|
|
21
|
+
from sweagent.utils.serialization import merge_nested_dicts
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _shorten_strings(data, *, max_length=30):
|
|
25
|
+
"""
|
|
26
|
+
Recursively shortens all strings in a nested data structure to a maximum length.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
data: The nested data structure (dicts, lists, and strings).
|
|
30
|
+
max_length: The maximum length for strings.
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
The modified data structure with shortened strings.
|
|
34
|
+
"""
|
|
35
|
+
if isinstance(data, str):
|
|
36
|
+
# Shorten the string if it exceeds the max length
|
|
37
|
+
data = data.replace("\n", "\\n")
|
|
38
|
+
return data[: max_length - 3] + "..."
|
|
39
|
+
elif isinstance(data, list):
|
|
40
|
+
# Recursively process each item in the list
|
|
41
|
+
return [_shorten_strings(item, max_length=max_length) for item in data]
|
|
42
|
+
elif isinstance(data, dict):
|
|
43
|
+
# Recursively process each value in the dictionary
|
|
44
|
+
return {key: _shorten_strings(value, max_length=max_length) for key, value in data.items()}
|
|
45
|
+
else:
|
|
46
|
+
# Return the data as is if it's neither a string, list, nor dict
|
|
47
|
+
return data
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Explanatory text (with rich markup such as [bold] and [link=...]) describing how to
# read pydantic validation errors. Presumably printed together with those errors when
# building the configuration fails -- the usage is not visible in this section.
_VALIDATION_ERROR_HELP_TEXT = """
The following errors are raised by Pydantic, trying to instantiate the configuration based on
the merged configuration dictionary [bold](see above)[/bold].

Every new indented block corresponds to a different error from Pydantic.
The first line of each block is the attribute that failed validation, the following lines are the error messages.

If you see many lines of errors, there are probably different ways to instantiate the same object (a union type).
For example, there are different deployments with different options each. Pydantic is then trying
one after the other and reporting the failures for each of them.
More on union types: [link=https://swe-agent.com/latest/usage/cl_tutorial/#union-types]https://swe-agent.com/latest/usage/cl_tutorial/#union-types[/link]
"""

# Hints (rich markup) listing common command-line mistakes. Presumably shown when the
# command-line settings cannot be parsed -- the usage is not visible in this section.
_SETTING_ERROR_HINTS = """
[red][bold]Hints:[/bold][/red]
Run `sweagent <subcommand> --help` for usage examples.

[red][bold]Common mistakes:[/bold][/red]
- You used dashes instead of underscores (wrong: `--num-workers`, correct: `--num_workers`).
- You forgot about part of the hierarchy (wrong: `--model.name`, correct: `--agent.model.name`).
"""
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class AutoCorrectSuggestion:
|
|
74
|
+
def __init__(
|
|
75
|
+
self, original: str, alternative: str = "", *, condition: Callable | None = None, help: str | None = None
|
|
76
|
+
):
|
|
77
|
+
self.original = original
|
|
78
|
+
self.alternative = alternative
|
|
79
|
+
self.condition = condition
|
|
80
|
+
self.help = help
|
|
81
|
+
if self.help and self.alternative:
|
|
82
|
+
msg = "Cannot set both help and alternative"
|
|
83
|
+
raise ValueError(msg)
|
|
84
|
+
|
|
85
|
+
def show(self, args: list[str]) -> bool:
|
|
86
|
+
no_equal = []
|
|
87
|
+
for arg in args:
|
|
88
|
+
if "=" in arg:
|
|
89
|
+
no_equal.extend(arg.split("="))
|
|
90
|
+
else:
|
|
91
|
+
no_equal.append(arg)
|
|
92
|
+
if self.condition is not None:
|
|
93
|
+
return self.condition(no_equal)
|
|
94
|
+
return f"--{self.original}" in no_equal
|
|
95
|
+
|
|
96
|
+
def format(self) -> str:
|
|
97
|
+
if self.help:
|
|
98
|
+
return self.help
|
|
99
|
+
return f"You wrote [red]--{self.original}[/red]. Did you mean [green]--{self.alternative}[/green]?"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class ConfigHelper:
    """Produce easy-to-read help text from pydantic setting objects."""

    def _get_type_name(self, item: Any, full: bool = False):
        """Given a config type, return a string that is either the full name or just the class name.

        Args:
            item: The type or annotation to name; its ``str()`` form is used.
            full: If true, return the whole dotted name, otherwise only the part
                after the last dot.
        """
        # str(SomeClass) looks like "<class 'pkg.mod.SomeClass'>"; strip the wrapper.
        full_name = str(item).removeprefix("<class '").removesuffix("'>")
        if full:
            return full_name
        return full_name.split(".")[-1]

    def _get_value_help_string(self, item: Any, description: str | None):
        """Return the help text for a single annotation.

        Args:
            item: The annotation of the field to document.
            description: Optional description of the field.

        Returns:
            A rich-markup string: a pointer to ``--help_option`` for objects that
            expose ``model_fields``, the list of alternatives for ``X | Y``
            unions, or the plain type name for everything else.
        """
        if hasattr(item, "model_fields"):
            # It's a pydantic config class
            full_name = self._get_type_name(item, full=True)
            name = self._get_type_name(item)
            out = f"[green]{name}[/green]\n"
            if description:
                out += f" {description}\n"
            out += f" Run [green]--help_option {full_name}[/green] for more info"
            return out
        if isinstance(item, UnionType):
            parts = []
            if description:
                parts.append(f" {description}\n")
            parts.append(
                " This config item can be one of the following things (run [green]--help_option <name>[/green] for more info):\n"
            )
            # The string form of a union is "A | B | C"; list every alternative on its own line.
            parts.extend(f" [green]{thing.strip()}[/green]\n" for thing in str(item).split("|"))
            return "".join(parts).strip()
        return self._get_type_name(item)

    def get_help(self, config_type: type[BaseSettings]) -> str:
        """Return the help text for all fields of ``config_type``, separated by blank lines."""
        lines = [
            f"[green][bold]{name}[/bold][/green]: "
            + self._get_value_help_string(field_info.annotation, field_info.description)
            for name, field_info in config_type.model_fields.items()
        ]
        return "\n\n".join(lines)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _nested_dict():
|
|
145
|
+
"""Helper function to create nested dictionaries."""
|
|
146
|
+
return defaultdict(_nested_dict)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _parse_args_to_nested_dict(args):
    """Parse the command-line arguments into a nested dictionary.

    Both ``--key=value`` and ``--key value`` are understood. Dots in the key
    create nesting, e.g. ``--a.b 1`` becomes ``{"a": {"b": 1}}``. Arguments that
    do not start with ``--`` (and were not consumed as a value) are ignored, and
    a trailing ``--key`` without a value is dropped.

    Args:
        args: The list of command-line argument strings.

    Returns:
        A nested dictionary (as created by ``_nested_dict``). Values consisting
        only of decimal digits are converted to ``int``; all others stay strings.
    """
    result = _nested_dict()

    i = 0
    while i < len(args):
        arg = args[i]
        if not arg.startswith("--"):
            i += 1
            continue

        if "=" in arg:
            # --key=value format
            key, value = arg[2:].split("=", 1)
        else:
            # --key value format: the value is the next argument
            key = arg[2:]
            i += 1
            if i >= len(args):
                break
            value = args[i]

        # isdecimal() (unlike isdigit()) only accepts characters that int() can
        # parse; e.g. "²" is a digit but int("²") raises ValueError.
        if value.isdecimal():
            value = int(value)

        # Build nested dict structure
        keys = key.split(".")
        current = result
        for k in keys[:-1]:
            child = current[k]
            if not isinstance(child, dict):
                # An earlier argument stored a plain value here (e.g. `--a 1`
                # followed by `--a.b 2`); the later, nested argument wins.
                child = current[k] = _nested_dict()
            current = child
        current[keys[-1]] = value

        i += 1

    return result
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# todo: Parameterize type hints
|
|
187
|
+
class BasicCLI:
|
|
188
|
+
def __init__(
    self,
    config_type: type[BaseSettings],
    *,
    default_settings: bool = True,
    help_text: str | None = None,
    default_config_file: Path = CONFIG_DIR / "default.yaml",
):
    """This class implements a basic CLI for SWE-agent. It is based on pydantic-settings, i.e., takes
    a `BaseSettings` object. In principle you could just initialize these via `pydantic-settings`'s `CliApp.run`,
    however, we also want to add a `--config` option to load additional config files and some other things.
    We also try to improve a bit on the pydantic error messages in here.

    Args:
        config_type: The type of the configuration object to instantiate.
        default_settings: Whether to load the default settings.
        help_text: If given, this will override the default help text that would usually be shown
            by argparse.
        default_config_file: Path stored as `self.default_config_file`. Presumably the config file
            that is loaded when no `--config` is given -- its use is not visible in this
            section, confirm against `get_config`.
    """
    # Note: the config type is kept under the attribute name `arg_type`.
    self.arg_type = config_type
    self.default_settings = default_settings
    self.logger = get_logger("swea-cli", emoji="🔧")
    self.help_text = help_text
    self.default_config_file = default_config_file
|
|
212
|
+
|
|
213
|
+
def maybe_show_auto_correct(self, args: list[str]):
    """Print a panel with every auto-correct suggestion that applies to ``args``.

    Suggestions are taken from ``self.arg_type._get_auto_correct()`` when the
    config type provides that hook; nothing is printed if the hook is missing or
    no suggestion matches.

    Args:
        args: The command-line arguments to check.
    """
    if hasattr(self.arg_type, "_get_auto_correct"):
        candidates = self.arg_type._get_auto_correct()  # type: ignore
    else:
        candidates = []
    matching = [suggestion for suggestion in candidates if suggestion.show(args)]
    if not matching:
        return
    body = "\n".join(suggestion.format() for suggestion in matching)
    panel = Panel.fit("[red][bold]Auto-correct suggestions[/bold][/red]\n\n" + body)
    rich_print(panel)
|
|
226
|
+
|
|
227
|
+
def get_config(self, args: list[str] | None = None) -> BaseSettings:
|
|
228
|
+
"""Get the configuration object from defaults and command arguments."""
|
|
229
|
+
|
|
230
|
+
# >>> Step 1: Use argparse to add a --config option to load whole config files
|
|
231
|
+
|
|
232
|
+
# The defaults if no config file is provided
|
|
233
|
+
# Otherwise, the configs from the respective classes will be used
|
|
234
|
+
parser = ArgumentParser(description=__doc__, add_help=False)
|
|
235
|
+
parser.add_argument(
|
|
236
|
+
"--config",
|
|
237
|
+
type=Path,
|
|
238
|
+
action="append",
|
|
239
|
+
default=[],
|
|
240
|
+
help=(
|
|
241
|
+
"Load additional config files. Use this option multiple times to load "
|
|
242
|
+
"multiple files, e.g., --config config1.yaml --config config2.yaml"
|
|
243
|
+
),
|
|
244
|
+
)
|
|
245
|
+
parser.add_argument(
|
|
246
|
+
"-h",
|
|
247
|
+
"--help",
|
|
248
|
+
help="Show help text and exit",
|
|
249
|
+
action="store_true",
|
|
250
|
+
)
|
|
251
|
+
parser.add_argument(
|
|
252
|
+
"--help_option",
|
|
253
|
+
help="Show help text for a specific option",
|
|
254
|
+
)
|
|
255
|
+
if self.default_settings:
|
|
256
|
+
parser.add_argument(
|
|
257
|
+
"--no_config_file",
|
|
258
|
+
action="store_true",
|
|
259
|
+
help="Do not load default config file when no config file is provided",
|
|
260
|
+
)
|
|
261
|
+
parser.add_argument(
|
|
262
|
+
"--print_config",
|
|
263
|
+
action="store_true",
|
|
264
|
+
help="Print the final config and exit",
|
|
265
|
+
)
|
|
266
|
+
|
|
267
|
+
# >>> Step 2: Parse argparse arguments but keep all the remaining arguments.
|
|
268
|
+
# Explicitly handle --help and --print-options
|
|
269
|
+
|
|
270
|
+
cli_args, remaining_args = parser.parse_known_args(args)
|
|
271
|
+
|
|
272
|
+
if cli_args.help:
|
|
273
|
+
if self.help_text:
|
|
274
|
+
rich_print(self.help_text)
|
|
275
|
+
else:
|
|
276
|
+
parser.print_help()
|
|
277
|
+
exit(0)
|
|
278
|
+
if cli_args.help_option:
|
|
279
|
+
module, _, name = cli_args.help_option.rpartition(".")
|
|
280
|
+
if module not in sys.modules:
|
|
281
|
+
__import__(module)
|
|
282
|
+
type_ = getattr(sys.modules[module], name)
|
|
283
|
+
rich_print(ConfigHelper().get_help(type_))
|
|
284
|
+
exit(0)
|
|
285
|
+
|
|
286
|
+
# >>> Step 3: Load config files and merge them in a big nested data structure
|
|
287
|
+
|
|
288
|
+
config_merged = {}
|
|
289
|
+
config_files = []
|
|
290
|
+
if cli_args.config:
|
|
291
|
+
config_files.extend(cli_args.config)
|
|
292
|
+
for _f in cli_args.config:
|
|
293
|
+
txt = Path(_f).read_text()
|
|
294
|
+
if not txt.strip():
|
|
295
|
+
self.logger.warning(f"Config file {_f} is empty")
|
|
296
|
+
continue
|
|
297
|
+
_loaded = yaml.safe_load(txt)
|
|
298
|
+
merge_nested_dicts(config_merged, _loaded)
|
|
299
|
+
elif self.default_settings and not cli_args.no_config_file:
|
|
300
|
+
config_file = self.default_config_file
|
|
301
|
+
config_files.append(config_file)
|
|
302
|
+
msg = (
|
|
303
|
+
f"Loading default config from {config_file}, because no other "
|
|
304
|
+
"config file is specified. Specify --no_config_file to disable this."
|
|
305
|
+
)
|
|
306
|
+
self.logger.info(msg)
|
|
307
|
+
txt = config_file.read_text()
|
|
308
|
+
if not txt.strip():
|
|
309
|
+
self.logger.warning(f"Default config file {config_file} is empty")
|
|
310
|
+
config_merged = {}
|
|
311
|
+
else:
|
|
312
|
+
config_merged = yaml.safe_load(txt)
|
|
313
|
+
else:
|
|
314
|
+
config_merged = {}
|
|
315
|
+
|
|
316
|
+
# For informational purposes, we also merge in the command line options
|
|
317
|
+
cl_options_dict = _parse_args_to_nested_dict(remaining_args)
|
|
318
|
+
|
|
319
|
+
# >>> Step 4: Bring together remaining arguments and the merged config to initialize the config object
|
|
320
|
+
# This is done by CliApp.run from pydantic-settings
|
|
321
|
+
|
|
322
|
+
try:
|
|
323
|
+
config: BaseSettings = CliApp.run(self.arg_type, remaining_args, **config_merged, cli_exit_on_error=False) # type: ignore
|
|
324
|
+
except ValidationError as e:
|
|
325
|
+
rich_print(
|
|
326
|
+
Panel.fit(
|
|
327
|
+
"[red][bold]Configuration from config files\n[/bold]"
|
|
328
|
+
"This is all the configuration that was provided from defaults, --config, and CLI arguments[/red]\n\n"
|
|
329
|
+
+ yaml.dump(_shorten_strings(config_merged))
|
|
330
|
+
)
|
|
331
|
+
)
|
|
332
|
+
rich_print(
|
|
333
|
+
Panel.fit(
|
|
334
|
+
"[red][bold]Configuration from CLI arguments\n[/bold]"
|
|
335
|
+
"This is all the configuration that was provided from the command line arguments[/red]\n\n"
|
|
336
|
+
+ yaml.dump(_shorten_strings(cl_options_dict))
|
|
337
|
+
)
|
|
338
|
+
)
|
|
339
|
+
rich_print(
|
|
340
|
+
Panel.fit(
|
|
341
|
+
"[red][bold]Merged configuration\n[/bold]"
|
|
342
|
+
"This is the merged configuration that was used to instantiate the config object[/red]\n\n"
|
|
343
|
+
+ yaml.dump(_shorten_strings(merge_nested_dicts(config_merged, cl_options_dict)))
|
|
344
|
+
)
|
|
345
|
+
)
|
|
346
|
+
rich_print(
|
|
347
|
+
Panel.fit(
|
|
348
|
+
"[red][bold]Validation error[/bold]\n" + _VALIDATION_ERROR_HELP_TEXT + "[/red]\n" + str(e),
|
|
349
|
+
)
|
|
350
|
+
)
|
|
351
|
+
self.maybe_show_auto_correct(remaining_args)
|
|
352
|
+
msg = "Invalid configuration. Please check the above output."
|
|
353
|
+
raise RuntimeError(msg) from None
|
|
354
|
+
except SettingsError as e:
|
|
355
|
+
rich_print(Panel.fit("[red][bold]SettingsError[/bold][/red]\n\n" + str(e) + "\n\n" + _SETTING_ERROR_HINTS))
|
|
356
|
+
self.maybe_show_auto_correct(remaining_args)
|
|
357
|
+
msg = "Invalid command line arguments. Please check the above output in the box."
|
|
358
|
+
raise RuntimeError(msg) from None
|
|
359
|
+
|
|
360
|
+
if cli_args.print_config: # type: ignore
|
|
361
|
+
print(yaml.dump(config.model_dump()))
|
|
362
|
+
exit(0)
|
|
363
|
+
|
|
364
|
+
# Attach config files to the arg object, because we need them for file naming purposes
|
|
365
|
+
# (the output traj directory is named after the last config file)
|
|
366
|
+
config._config_files = config_files # type: ignore
|
|
367
|
+
return config
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def save_predictions(traj_dir: Path, instance_id: str, result: AgentRunResult):
    """Save predictions in a file readable by SWE-bench.

    The prediction is written as JSON to `<traj_dir>/<instance_id>/<instance_id>.pred`;
    the instance directory is created if it does not exist yet.

    Args:
        traj_dir: Output directory. Its name is stored as `model_name_or_path`.
        instance_id: ID of the instance; used for the directory and file name.
        result: Result of the run. `result.info.get("submission")` is stored as
            `model_patch`, so the patch is `null` in the file if there is no submission.
    """
    instance_dir = traj_dir / instance_id
    instance_dir.mkdir(parents=True, exist_ok=True)
    prediction = dict(
        model_name_or_path=traj_dir.name,
        instance_id=instance_id,
        model_patch=result.info.get("submission"),
    )
    (instance_dir / f"{instance_id}.pred").write_text(json.dumps(prediction))
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _is_promising_patch(info: AgentInfo) -> bool:
|
|
383
|
+
"""Do we actually believe that the patch will solve the issue?
|
|
384
|
+
Or are we just submitting the last patch we generated before hitting an error?
|
|
385
|
+
"""
|
|
386
|
+
# The exit status can also be `submitted (exit_cost)` etc.
|
|
387
|
+
return info.get("exit_status") == "submitted" and info.get("submission") is not None
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from tabulate import tabulate
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_resolved(path: Path) -> set[str]:
    """Return the IDs listed as resolved in the JSON results file at `path`.

    The IDs are read from the `resolved` key if the file has one, otherwise from
    `resolved_ids` (a `KeyError` is raised if that key is missing as well).
    """
    results = json.loads(path.read_text())
    key = "resolved" if "resolved" in results else "resolved_ids"
    return set(results[key])
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_submitted(path: Path) -> set[str]:
    """Return the IDs stored under `submitted_ids` in the JSON results file at `path`."""
    results = json.loads(path.read_text())
    return set(results["submitted_ids"])
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def stats_single(path: Path) -> None:
    """Print how many instances of the results file at `path` were evaluated and resolved."""
    n_evaluated = len(get_submitted(path))
    n_resolved = len(get_resolved(path))
    print(f"Total evaluated: {n_evaluated}")
    print(f"Total resolved: {n_resolved}")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def compare_many(paths: list[Path]) -> None:
    """Print two tables comparing the results files in `paths`.

    The first table has one row per instance ID that was submitted in the FIRST results
    file and one column per results file (❓ not submitted there, ✅ resolved, ❌ not
    resolved), followed by the success rate of that instance over the files that
    submitted it. It ends with the number of successes and the success rate per file.
    The second table repeats these per-file numbers next to the name of the directory
    that contains each file.

    A success rate is printed as `n/a` when nothing was evaluated for it (e.g. a results
    file that shares no submitted instance with the first one) instead of failing with a
    division by zero.

    Args:
        paths: Results files to compare; must not be empty.
    """
    # Keep the IDs as sets: they are only used for membership tests
    evaluated_ids: dict[Path, set[str]] = {}
    resolved_ids: dict[Path, set[str]] = {}
    for path in paths:
        evaluated_ids[path] = get_submitted(path)
        resolved_ids[path] = get_resolved(path)

    def get_emoji(instance_id: str, path: Path) -> str:
        if instance_id not in evaluated_ids[path]:
            return "❓"
        if instance_id in resolved_ids[path]:
            return "✅"
        return "❌"

    def format_rate(n_success: int, n_evaluated: int) -> str:
        if n_evaluated == 0:
            return "n/a"
        return f"{n_success / n_evaluated:.2f}"

    header: list[str] = ["ID"] + [str(i) for i in range(len(paths))] + ["Success rate"]
    table: list[list[str | float | int]] = []

    # Only the instances of the first results file are compared
    ids_to_compare = evaluated_ids[paths[0]]
    for instance_id in sorted(ids_to_compare):
        row: list[str | float | int] = [instance_id]
        row.extend(get_emoji(instance_id, path) for path in paths)
        n_success = sum(instance_id in resolved_ids[path] for path in paths)
        n_evaluated = sum(instance_id in evaluated_ids[path] for path in paths)
        row.append(format_rate(n_success, n_evaluated))
        table.append(row)

    # Both lists start with their row label, so the numbers of paths[i] sit at index i + 1
    successes: list[str | float] = ["Successes"]
    success_rates: list[str | float] = ["Success rates"]
    for path in paths:
        n_success = len(ids_to_compare & resolved_ids[path])
        n_evaluated = len(ids_to_compare & evaluated_ids[path])
        successes.append(n_success)
        success_rates.append(format_rate(n_success, n_evaluated))
    table.append(successes)
    table.append(success_rates)
    print(tabulate(table, headers=header))
    print()

    summary_header: list[str] = ["#", "ID", "Successes", "Success rate"]
    summary_table: list[list[str | float | int]] = [
        [i, path.parent.name, successes[i + 1], success_rates[i + 1]] for i, path in enumerate(paths)
    ]
    print(tabulate(summary_table, headers=summary_header))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def compare_pair(new_path: Path, old_path: Path, *, show_same=False) -> None:
    """Print how the instances of a new results file fare compared to an old one.

    After the totals and an emoji legend, one line is printed per instance that was
    submitted in the new results file, in sorted order.

    Args:
        new_path: Results file of the new version.
        old_path: Results file of the old version.
        show_same: Also list instances with the same outcome in both versions
            (resolved in both or unresolved in both). They are skipped by default.
    """
    # Sets make the membership tests in the loop below O(1)
    new_evaluated = get_submitted(new_path)
    new_resolved = get_resolved(new_path)
    old_evaluated = get_submitted(old_path)
    old_resolved = get_resolved(old_path)
    print(f"Total evaluated: new {len(new_evaluated)}, old {len(old_evaluated)}")
    print(f"Total resolved: new {len(new_resolved)}, old {len(old_resolved)}")
    print("-" * 80)
    print("Emoji legend:")
    print("❓: Not evaluated in old version, so guessing it's either 😀 or 👾")
    print("😀: Newly resolved in new version")
    print("✅: Resolved in both")
    print("❌: Resolved in old, not in new")
    print("👾: Unresolved in both")
    print("-" * 80)

    for instance_id in sorted(new_evaluated):
        resolved_now = instance_id in new_resolved
        resolved_before = instance_id in old_resolved
        if instance_id not in old_evaluated:
            # No old result to compare against; these are always shown
            emoji = "😀❓" if resolved_now else "👾❓"
        elif resolved_now and not resolved_before:
            emoji = "😀"
        elif resolved_before and not resolved_now:
            emoji = "❌"
        else:
            # Same outcome in both versions
            if not show_same:
                continue
            emoji = "✅" if resolved_now else "👾"
        print(f"{emoji} {instance_id}")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def run_from_cli(_args: list[str] | None = None) -> None:
    """Parse the command line and print statistics for one or more results files.

    Each positional path may point to a results file or to a directory, in which case
    `results.json` inside that directory is used. One path prints the statistics of that
    file, two paths are compared as new vs. old (honoring `--show-same`), and more than
    two paths are compared in a table.

    Args:
        _args: Command line arguments, passed on to argparse's `parse_args`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("paths", type=Path, nargs="+")
    parser.add_argument("--show-same", action="store_true")
    args = parser.parse_args(_args)

    result_files = [p / "results.json" if p.is_dir() else p for p in args.paths]
    n_files = len(result_files)
    if n_files > 2:
        compare_many(result_files)
    elif n_files == 2:
        new_file, old_file = result_files
        compare_pair(new_file, old_file, show_same=args.show_same)
    else:
        # argparse's nargs="+" guarantees at least one path
        stats_single(result_files[0])
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""If for some reason the .pred file isn't saved, we can extract it from the .traj file."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def run_from_cli(_args: list[str] | None = None):
|
|
9
|
+
parser = argparse.ArgumentParser()
|
|
10
|
+
parser.add_argument("traj_path", type=Path)
|
|
11
|
+
args = parser.parse_args(_args)
|
|
12
|
+
data = json.loads(args.traj_path.read_text())
|
|
13
|
+
pred_path = args.traj_path.with_suffix(".pred")
|
|
14
|
+
pred_data = {
|
|
15
|
+
"model_name_or_path": args.traj_path.resolve().parent.parent.name,
|
|
16
|
+
"model_patch": data["info"]["submission"],
|
|
17
|
+
"instance_id": args.traj_path.resolve().parent.name,
|
|
18
|
+
}
|
|
19
|
+
pred_path.write_text(json.dumps(pred_data))
|
|
File without changes
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from sweagent.agent.problem_statement import ProblemStatement, ProblemStatementConfig
|
|
2
|
+
from sweagent.environment.swe_env import SWEEnv
|
|
3
|
+
from sweagent.types import AgentRunResult
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class RunHook:
|
|
7
|
+
"""Hook structure for the web server or other addons to interface with"""
|
|
8
|
+
|
|
9
|
+
def on_init(self, *, run):
|
|
10
|
+
"""Called when hook is initialized"""
|
|
11
|
+
|
|
12
|
+
def on_start(self):
|
|
13
|
+
"""Called at the beginning of `Main.main`"""
|
|
14
|
+
|
|
15
|
+
def on_end(self):
|
|
16
|
+
"""Called at the end of `Main.main`"""
|
|
17
|
+
|
|
18
|
+
def on_instance_start(
|
|
19
|
+
self, *, index: int, env: SWEEnv, problem_statement: ProblemStatement | ProblemStatementConfig
|
|
20
|
+
):
|
|
21
|
+
"""Called at the beginning of each instance loop in `Main.run`"""
|
|
22
|
+
|
|
23
|
+
def on_instance_skipped(
|
|
24
|
+
self,
|
|
25
|
+
):
|
|
26
|
+
"""Called when an instance is skipped in `Main.run`"""
|
|
27
|
+
|
|
28
|
+
def on_instance_completed(self, *, result: AgentRunResult):
|
|
29
|
+
"""Called when an instance is completed in `Main.run`"""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CombinedRunHooks(RunHook):
|
|
33
|
+
def __init__(self):
|
|
34
|
+
self._hooks = []
|
|
35
|
+
|
|
36
|
+
def add_hook(self, hook: RunHook) -> None:
|
|
37
|
+
self._hooks.append(hook)
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def hooks(self) -> list[RunHook]:
|
|
41
|
+
return self._hooks
|
|
42
|
+
|
|
43
|
+
def on_init(self, *, run):
|
|
44
|
+
for hook in self._hooks:
|
|
45
|
+
hook.on_init(run=run)
|
|
46
|
+
|
|
47
|
+
def on_start(self):
|
|
48
|
+
for hook in self._hooks:
|
|
49
|
+
hook.on_start()
|
|
50
|
+
|
|
51
|
+
def on_end(self):
|
|
52
|
+
for hook in self._hooks:
|
|
53
|
+
hook.on_end()
|
|
54
|
+
|
|
55
|
+
def on_instance_start(
|
|
56
|
+
self, *, index: int, env: SWEEnv, problem_statement: ProblemStatement | ProblemStatementConfig
|
|
57
|
+
):
|
|
58
|
+
for hook in self._hooks:
|
|
59
|
+
hook.on_instance_start(index=index, env=env, problem_statement=problem_statement)
|
|
60
|
+
|
|
61
|
+
def on_instance_skipped(self):
|
|
62
|
+
for hook in self._hooks:
|
|
63
|
+
hook.on_instance_skipped()
|
|
64
|
+
|
|
65
|
+
def on_instance_completed(self, *, result: AgentRunResult):
|
|
66
|
+
for hook in self._hooks:
|
|
67
|
+
hook.on_instance_completed(result=result)
|