@elizaos/sweagent-root 2.0.0-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +270 -0
- package/package.json +71 -0
- package/python/LICENSE +21 -0
- package/python/config/README.md +15 -0
- package/python/config/bash_only.yaml +222 -0
- package/python/config/benchmarks/250212_sweagent_heavy_sbl.yaml +188 -0
- package/python/config/benchmarks/250225_anthropic_filemap_simple_review.yaml +75 -0
- package/python/config/benchmarks/250522_anthropic_filemap_simple_review.yaml +92 -0
- package/python/config/benchmarks/250526_anthropic_filemap_simple_review_sbl.yaml +93 -0
- package/python/config/benchmarks/anthropic_filemap_multilingual.yaml +66 -0
- package/python/config/coding_challenge.yaml +104 -0
- package/python/config/default.yaml +69 -0
- package/python/config/default_backticks.yaml +69 -0
- package/python/config/default_mm_no_images.yaml +82 -0
- package/python/config/default_mm_with_images.yaml +83 -0
- package/python/config/demo/default.yaml +80 -0
- package/python/config/demo/no_instructions.yaml +69 -0
- package/python/config/demo/only_bash.yaml +60 -0
- package/python/config/exotic/default_shell.yaml +52 -0
- package/python/config/exotic/windowed_replace.yaml +125 -0
- package/python/config/exotic/windowed_replace_late_repro.yaml +127 -0
- package/python/config/human/human.yaml +24 -0
- package/python/config/human/human_demo.yaml +52 -0
- package/python/config/sweagent_0_7/07.yaml +101 -0
- package/python/config/sweagent_0_7/07_fcalling.yaml +100 -0
- package/python/config/sweagent_0_7/07_from_url.yaml +114 -0
- package/python/config/sweagent_0_7/07_thought_action.yaml +102 -0
- package/python/config/sweagent_0_7/07_thought_action_xml.yaml +96 -0
- package/python/mlc_config.json +44 -0
- package/python/pyproject.toml +262 -0
- package/python/sweagent/__init__.py +114 -0
- package/python/sweagent/__main__.py +4 -0
- package/python/sweagent/agent/__init__.py +0 -0
- package/python/sweagent/agent/action_sampler.py +317 -0
- package/python/sweagent/agent/agents.py +1294 -0
- package/python/sweagent/agent/extra/shell_agent.py +106 -0
- package/python/sweagent/agent/history_processors.py +399 -0
- package/python/sweagent/agent/hooks/__init__.py +0 -0
- package/python/sweagent/agent/hooks/abstract.py +139 -0
- package/python/sweagent/agent/hooks/status.py +34 -0
- package/python/sweagent/agent/models.py +896 -0
- package/python/sweagent/agent/problem_statement.py +312 -0
- package/python/sweagent/agent/reviewer.py +664 -0
- package/python/sweagent/environment/__init__.py +0 -0
- package/python/sweagent/environment/hooks/__init__.py +0 -0
- package/python/sweagent/environment/hooks/abstract.py +60 -0
- package/python/sweagent/environment/hooks/status.py +28 -0
- package/python/sweagent/environment/repo.py +219 -0
- package/python/sweagent/environment/swe_env.py +276 -0
- package/python/sweagent/exceptions.py +54 -0
- package/python/sweagent/inspector/README.md +6 -0
- package/python/sweagent/inspector/__init__.py +0 -0
- package/python/sweagent/inspector/favicon.ico +0 -0
- package/python/sweagent/inspector/fileViewer.js +354 -0
- package/python/sweagent/inspector/icons/computer.png +0 -0
- package/python/sweagent/inspector/icons/edit_icon.svg +11 -0
- package/python/sweagent/inspector/icons/swe-agent-logo-50.png +0 -0
- package/python/sweagent/inspector/icons/swellama_blue.png +0 -0
- package/python/sweagent/inspector/icons/swellama_brown.png +0 -0
- package/python/sweagent/inspector/icons/swellama_grey.png +0 -0
- package/python/sweagent/inspector/icons/swellama_tan.png +0 -0
- package/python/sweagent/inspector/index.html +25 -0
- package/python/sweagent/inspector/server.py +354 -0
- package/python/sweagent/inspector/static.py +169 -0
- package/python/sweagent/inspector/style.css +454 -0
- package/python/sweagent/run/__init__.py +0 -0
- package/python/sweagent/run/_progress.py +158 -0
- package/python/sweagent/run/batch_instances.py +419 -0
- package/python/sweagent/run/common.py +387 -0
- package/python/sweagent/run/compare_runs.py +123 -0
- package/python/sweagent/run/extract_pred.py +19 -0
- package/python/sweagent/run/hooks/__init__.py +0 -0
- package/python/sweagent/run/hooks/abstract.py +67 -0
- package/python/sweagent/run/hooks/apply_patch.py +106 -0
- package/python/sweagent/run/hooks/open_pr.py +244 -0
- package/python/sweagent/run/hooks/swe_bench_evaluate.py +113 -0
- package/python/sweagent/run/inspector_cli.py +493 -0
- package/python/sweagent/run/merge_predictions.py +64 -0
- package/python/sweagent/run/quick_stats.py +96 -0
- package/python/sweagent/run/remove_unfinished.py +63 -0
- package/python/sweagent/run/rich_test.py +91 -0
- package/python/sweagent/run/run.py +147 -0
- package/python/sweagent/run/run_batch.py +442 -0
- package/python/sweagent/run/run_replay.py +219 -0
- package/python/sweagent/run/run_shell.py +155 -0
- package/python/sweagent/run/run_single.py +225 -0
- package/python/sweagent/run/run_traj_to_demo.py +85 -0
- package/python/sweagent/tools/__init__.py +0 -0
- package/python/sweagent/tools/bundle.py +57 -0
- package/python/sweagent/tools/commands.py +220 -0
- package/python/sweagent/tools/parsing.py +619 -0
- package/python/sweagent/tools/tools.py +430 -0
- package/python/sweagent/tools/utils.py +108 -0
- package/python/sweagent/types.py +102 -0
- package/python/sweagent/utils/__init__.py +0 -0
- package/python/sweagent/utils/config.py +80 -0
- package/python/sweagent/utils/files.py +27 -0
- package/python/sweagent/utils/github.py +118 -0
- package/python/sweagent/utils/jinja_warnings.py +14 -0
- package/python/sweagent/utils/log.py +175 -0
- package/python/sweagent/utils/patch_formatter.py +152 -0
- package/python/sweagent/utils/serialization.py +45 -0
- package/python/tests/__init__.py +0 -0
- package/python/tests/conftest.py +191 -0
- package/python/tests/test_agent.py +258 -0
- package/python/tests/test_batch_instance.py +43 -0
- package/python/tests/test_commands/_interactive_dummy.py +35 -0
- package/python/tests/test_commands/interactive_dummy_wrapper.sh +29 -0
- package/python/tests/test_data/config_files/dummy_interactive.yaml +62 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/README.md +13 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/customrandom.c +50 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/release +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/server +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/solver.py +12 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/README.md +16 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/challenge.json +9 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/flash_c8429a430278283c0e571baebca3d139.zip +0 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/README.md +15 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/challenge.json +10 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/networking.pcap +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/Dockerfile +28 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/challenge.json +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/flag.txt +1 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.c +26 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.py +9 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/challenge.json +8 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock +0 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock.cpp +167 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/solution.cpp +24 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/solution.py +6 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/test.sh +10 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/000-default.conf +18 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/file.pl +38 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/forms.pl +40 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/hello.pl +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/flag +1 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/index.html +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/solution.txt +1 -0
- package/python/tests/test_data/data_sources/debug_20240322.json +1 -0
- package/python/tests/test_data/data_sources/expert_instances.yaml +16 -0
- package/python/tests/test_data/data_sources/human_eval.json +1 -0
- package/python/tests/test_data/data_sources/simple_instances.yaml +3 -0
- package/python/tests/test_data/data_sources/simple_instances_long.yaml +30 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy_first_only.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-lite-test.json +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/6e44b9__sweagenttestrepo-1c2844.traj +342 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/solution_missing_colon.py +15 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/args.yaml +518 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/swe-agent__test-repo-i1.traj +124 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/all_preds.jsonl +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/args.yaml +520 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/patches/pydicom__pydicom-1458.patch +18 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/pydicom__pydicom-1458.traj +257 -0
- package/python/tests/test_env.py +66 -0
- package/python/tests/test_env_utils.py +129 -0
- package/python/tests/test_history_processors.py +40 -0
- package/python/tests/test_models.py +23 -0
- package/python/tests/test_openai_live.py +164 -0
- package/python/tests/test_packaging.py +7 -0
- package/python/tests/test_parsing.py +131 -0
- package/python/tests/test_problem_statement_multimodal.py +111 -0
- package/python/tests/test_quick_stats.py +42 -0
- package/python/tests/test_run.py +37 -0
- package/python/tests/test_run_batch.py +110 -0
- package/python/tests/test_run_hooks.py +114 -0
- package/python/tests/test_run_replay.py +33 -0
- package/python/tests/test_run_single.py +125 -0
- package/python/tests/test_tools_command_parsing.py +193 -0
- package/python/tests/test_utils.py +15 -0
- package/python/tests/tools/__init__.py +0 -0
- package/python/tests/tools/conftest.py +12 -0
- package/python/tests/tools/test_default_utils.py +153 -0
- package/python/tests/tools/test_edit_replace.py +0 -0
- package/python/tests/tools/test_split_string.py +82 -0
- package/python/tests/utils.py +29 -0
- package/python/tools/diff_state/bin/_state_diff_state +52 -0
- package/python/tools/diff_state/config.yaml +2 -0
- package/python/tools/edit_anthropic/bin/_state_anthropic +21 -0
- package/python/tools/edit_anthropic/bin/str_replace_editor +710 -0
- package/python/tools/edit_anthropic/config.yaml +56 -0
- package/python/tools/edit_anthropic/install.sh +3 -0
- package/python/tools/filemap/bin/filemap +45 -0
- package/python/tools/filemap/config.yaml +9 -0
- package/python/tools/filemap/install.sh +2 -0
- package/python/tools/forfeit/bin/exit_forfeit +5 -0
- package/python/tools/forfeit/config.yaml +5 -0
- package/python/tools/image_tools/bin/view_image +36 -0
- package/python/tools/image_tools/config.yaml +9 -0
- package/python/tools/multilingual_setup/bin/do_nothing +2 -0
- package/python/tools/multilingual_setup/config.yaml +1 -0
- package/python/tools/multilingual_setup/install.sh +45 -0
- package/python/tools/registry/bin/_read_env +10 -0
- package/python/tools/registry/bin/_write_env +10 -0
- package/python/tools/registry/config.yaml +1 -0
- package/python/tools/registry/install.sh +6 -0
- package/python/tools/registry/lib/__init__.py +0 -0
- package/python/tools/registry/lib/registry.py +56 -0
- package/python/tools/review_on_submit_m/README.md +6 -0
- package/python/tools/review_on_submit_m/bin/submit +54 -0
- package/python/tools/review_on_submit_m/config.yaml +6 -0
- package/python/tools/review_on_submit_m/install.sh +0 -0
- package/python/tools/search/bin/find_file +31 -0
- package/python/tools/search/bin/search_dir +39 -0
- package/python/tools/search/bin/search_file +55 -0
- package/python/tools/search/config.yaml +37 -0
- package/python/tools/search/install.sh +3 -0
- package/python/tools/submit/bin/submit +17 -0
- package/python/tools/submit/config.yaml +5 -0
- package/python/tools/web_browser/bin/click_mouse +41 -0
- package/python/tools/web_browser/bin/close_site +28 -0
- package/python/tools/web_browser/bin/double_click_mouse +37 -0
- package/python/tools/web_browser/bin/drag_mouse +46 -0
- package/python/tools/web_browser/bin/execute_script_on_page +39 -0
- package/python/tools/web_browser/bin/get_console_output +48 -0
- package/python/tools/web_browser/bin/move_mouse +35 -0
- package/python/tools/web_browser/bin/navigate_back +33 -0
- package/python/tools/web_browser/bin/navigate_forward +33 -0
- package/python/tools/web_browser/bin/open_site +36 -0
- package/python/tools/web_browser/bin/press_keys_on_page +51 -0
- package/python/tools/web_browser/bin/reload_page +33 -0
- package/python/tools/web_browser/bin/run_web_browser_server +394 -0
- package/python/tools/web_browser/bin/screenshot_site +38 -0
- package/python/tools/web_browser/bin/scroll_on_page +40 -0
- package/python/tools/web_browser/bin/set_browser_window_size +40 -0
- package/python/tools/web_browser/bin/type_text +34 -0
- package/python/tools/web_browser/bin/wait_time +39 -0
- package/python/tools/web_browser/config.yaml +155 -0
- package/python/tools/web_browser/install.sh +22 -0
- package/python/tools/web_browser/lib/browser_manager.py +404 -0
- package/python/tools/web_browser/lib/web_browser_config.py +33 -0
- package/python/tools/web_browser/lib/web_browser_utils.py +126 -0
- package/python/tools/web_browser/test_console.html +1 -0
- package/python/tools/windowed/bin/_state +25 -0
- package/python/tools/windowed/bin/create +29 -0
- package/python/tools/windowed/bin/goto +37 -0
- package/python/tools/windowed/bin/open +49 -0
- package/python/tools/windowed/bin/scroll_down +12 -0
- package/python/tools/windowed/bin/scroll_up +13 -0
- package/python/tools/windowed/config.yaml +38 -0
- package/python/tools/windowed/install.sh +15 -0
- package/python/tools/windowed/lib/__init__.py +0 -0
- package/python/tools/windowed/lib/flake8_utils.py +147 -0
- package/python/tools/windowed/lib/windowed_file.py +312 -0
- package/python/tools/windowed_edit_linting/bin/edit +128 -0
- package/python/tools/windowed_edit_linting/config.yaml +31 -0
- package/python/tools/windowed_edit_linting/install.sh +5 -0
- package/python/tools/windowed_edit_replace/bin/edit +172 -0
- package/python/tools/windowed_edit_replace/bin/insert +77 -0
- package/python/tools/windowed_edit_replace/config.yaml +60 -0
- package/python/tools/windowed_edit_replace/install.sh +5 -0
- package/python/tools/windowed_edit_rewrite/bin/edit +78 -0
- package/python/tools/windowed_edit_rewrite/config.yaml +11 -0
- package/python/tools/windowed_edit_rewrite/install.sh +5 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyEncryption.traj +318 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyTimeCapsule.traj +197 -0
- package/python/trajectories/demonstrations/ctf/crypto/eps.traj +289 -0
- package/python/trajectories/demonstrations/ctf/crypto/katy.traj +368 -0
- package/python/trajectories/demonstrations/ctf/forensics/flash.traj +102 -0
- package/python/trajectories/demonstrations/ctf/misc/networking_1.traj +102 -0
- package/python/trajectories/demonstrations/ctf/pwn/warmup.traj +159 -0
- package/python/trajectories/demonstrations/ctf/rev/rock.traj +251 -0
- package/python/trajectories/demonstrations/ctf/web/i_got_id_demo.traj +422 -0
- package/python/trajectories/demonstrations/function_calling_simple.traj +151 -0
- package/python/trajectories/demonstrations/human_thought__swe-bench-HumanEvalFix-python__lcb__t-0.00__p-0.95__c-4.00__install-0/humanevalfix-python-0.traj +129 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default__t-0.20__p-0.95__c-2.00__install-1___install_from_source/marshmallow-code__marshmallow-1867.traj +318 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling__install-1/marshmallow-code__marshmallow-1867.traj +594 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace__install-1/marshmallow-code__marshmallow-1867.traj +592 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace_from_source/marshmallow-code__marshmallow-1867.traj +3316 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/str_replace_anthropic_demo.yaml +432 -0
- package/rust/Cargo.toml +100 -0
- package/rust/README.md +49 -0
- package/rust/src/agent/action_sampler.rs +130 -0
- package/rust/src/agent/agents.rs +1029 -0
- package/rust/src/agent/history_processors.rs +277 -0
- package/rust/src/agent/hooks/mod.rs +208 -0
- package/rust/src/agent/mod.rs +24 -0
- package/rust/src/agent/models.rs +837 -0
- package/rust/src/agent/problem_statement.rs +355 -0
- package/rust/src/agent/reviewer.rs +505 -0
- package/rust/src/bin/sweagent.rs +784 -0
- package/rust/src/environment/deployment.rs +631 -0
- package/rust/src/environment/hooks/mod.rs +114 -0
- package/rust/src/environment/mod.rs +16 -0
- package/rust/src/environment/repo.rs +265 -0
- package/rust/src/environment/runtime.rs +237 -0
- package/rust/src/environment/swe_env.rs +248 -0
- package/rust/src/exceptions.rs +228 -0
- package/rust/src/lib.rs +68 -0
- package/rust/src/monitoring.rs +482 -0
- package/rust/src/run/hooks/mod.rs +134 -0
- package/rust/src/run/mod.rs +12 -0
- package/rust/src/run/run_batch.rs +563 -0
- package/rust/src/run/run_single.rs +196 -0
- package/rust/src/tools/bundle.rs +224 -0
- package/rust/src/tools/commands.rs +173 -0
- package/rust/src/tools/mod.rs +295 -0
- package/rust/src/tools/parsing.rs +354 -0
- package/rust/src/tools/registry.rs +143 -0
- package/rust/src/types.rs +554 -0
- package/rust/src/utils/config.rs +105 -0
- package/rust/src/utils/files.rs +137 -0
- package/rust/src/utils/github.rs +171 -0
- package/rust/src/utils/log.rs +65 -0
- package/rust/src/utils/mod.rs +17 -0
- package/rust/src/utils/serialization.rs +181 -0
- package/rust/src/utils/template.rs +173 -0
- package/typescript/README.md +335 -0
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
//! Reviewer implementations for evaluating agent submissions
|
|
2
|
+
//!
|
|
3
|
+
//! Reviewers are used in retry loops to evaluate submissions and decide
|
|
4
|
+
//! whether to retry with different approaches.
|
|
5
|
+
|
|
6
|
+
use crate::exceptions::{Result, SWEAgentError};
|
|
7
|
+
use crate::types::{AgentInfo, Trajectory};
|
|
8
|
+
use async_trait::async_trait;
|
|
9
|
+
use serde::{Deserialize, Serialize};
|
|
10
|
+
use std::collections::HashMap;
|
|
11
|
+
|
|
12
|
+
/// Result from a review
|
|
13
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
14
|
+
pub struct ReviewerResult {
|
|
15
|
+
pub score: f64,
|
|
16
|
+
pub feedback: String,
|
|
17
|
+
pub should_retry: bool,
|
|
18
|
+
pub extra: HashMap<String, serde_json::Value>,
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
/// Data submitted for review
|
|
22
|
+
#[derive(Debug, Clone)]
|
|
23
|
+
pub struct ReviewSubmission {
|
|
24
|
+
pub trajectory: Trajectory,
|
|
25
|
+
pub info: AgentInfo,
|
|
26
|
+
pub submission: Option<String>,
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/// Trait for submission reviewers
|
|
30
|
+
#[async_trait]
|
|
31
|
+
pub trait Reviewer: Send + Sync {
|
|
32
|
+
/// Review a submission
|
|
33
|
+
async fn review(&self, submission: &ReviewSubmission) -> Result<ReviewerResult>;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// Simple reviewer that always passes - baseline implementation
|
|
37
|
+
pub struct PassThroughReviewer;
|
|
38
|
+
|
|
39
|
+
#[async_trait]
|
|
40
|
+
impl Reviewer for PassThroughReviewer {
|
|
41
|
+
async fn review(&self, _submission: &ReviewSubmission) -> Result<ReviewerResult> {
|
|
42
|
+
Ok(ReviewerResult {
|
|
43
|
+
score: 1.0,
|
|
44
|
+
feedback: "Submission accepted (pass-through reviewer)".to_string(),
|
|
45
|
+
should_retry: false,
|
|
46
|
+
extra: HashMap::new(),
|
|
47
|
+
})
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Reviewer that only checks whether a non-blank submission exists.
pub struct SubmissionPresenceReviewer {
    /// Scores strictly below this value cause the review to request a retry.
    threshold: f64,
}

impl SubmissionPresenceReviewer {
    /// Creates a reviewer that requests a retry when the score is below `threshold`.
    pub fn new(threshold: f64) -> Self {
        SubmissionPresenceReviewer { threshold }
    }
}

impl Default for SubmissionPresenceReviewer {
    /// Uses a threshold of `0.5`.
    fn default() -> Self {
        SubmissionPresenceReviewer::new(0.5)
    }
}
|
|
67
|
+
|
|
68
|
+
#[async_trait]
|
|
69
|
+
impl Reviewer for SubmissionPresenceReviewer {
|
|
70
|
+
async fn review(&self, submission: &ReviewSubmission) -> Result<ReviewerResult> {
|
|
71
|
+
let has_submission = submission
|
|
72
|
+
.submission
|
|
73
|
+
.as_ref()
|
|
74
|
+
.map(|s| !s.trim().is_empty())
|
|
75
|
+
.unwrap_or(false);
|
|
76
|
+
|
|
77
|
+
let score = if has_submission { 1.0 } else { 0.0 };
|
|
78
|
+
|
|
79
|
+
Ok(ReviewerResult {
|
|
80
|
+
score,
|
|
81
|
+
feedback: if has_submission {
|
|
82
|
+
"Submission provided".to_string()
|
|
83
|
+
} else {
|
|
84
|
+
"No submission provided".to_string()
|
|
85
|
+
},
|
|
86
|
+
should_retry: score < self.threshold,
|
|
87
|
+
extra: HashMap::new(),
|
|
88
|
+
})
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/// Reviewer that checks the submission looks like a non-empty patch.
pub struct PatchPresenceReviewer {
    /// Minimum number of lines the submission must contain to pass.
    min_lines: usize,
}

impl PatchPresenceReviewer {
    /// Creates a reviewer that requires at least `min_lines` lines in the patch.
    pub fn new(min_lines: usize) -> Self {
        PatchPresenceReviewer { min_lines }
    }
}

impl Default for PatchPresenceReviewer {
    /// Requires at least one line.
    fn default() -> Self {
        PatchPresenceReviewer::new(1)
    }
}
|
|
108
|
+
|
|
109
|
+
#[async_trait]
|
|
110
|
+
impl Reviewer for PatchPresenceReviewer {
|
|
111
|
+
async fn review(&self, submission: &ReviewSubmission) -> Result<ReviewerResult> {
|
|
112
|
+
let patch = submission.submission.as_deref().unwrap_or("");
|
|
113
|
+
let has_diff_content = patch
|
|
114
|
+
.lines()
|
|
115
|
+
.any(|line| line.starts_with('+') || line.starts_with('-'));
|
|
116
|
+
|
|
117
|
+
let line_count = patch.lines().count();
|
|
118
|
+
let passes = has_diff_content && line_count >= self.min_lines;
|
|
119
|
+
|
|
120
|
+
let score = if passes { 1.0 } else { 0.0 };
|
|
121
|
+
|
|
122
|
+
Ok(ReviewerResult {
|
|
123
|
+
score,
|
|
124
|
+
feedback: if passes {
|
|
125
|
+
format!("Valid patch with {} lines", line_count)
|
|
126
|
+
} else if !has_diff_content {
|
|
127
|
+
"Patch contains no diff content (+/- lines)".to_string()
|
|
128
|
+
} else {
|
|
129
|
+
format!(
|
|
130
|
+
"Patch too short ({} lines, need {})",
|
|
131
|
+
line_count, self.min_lines
|
|
132
|
+
)
|
|
133
|
+
},
|
|
134
|
+
should_retry: !passes,
|
|
135
|
+
extra: {
|
|
136
|
+
let mut m = HashMap::new();
|
|
137
|
+
m.insert("line_count".to_string(), serde_json::json!(line_count));
|
|
138
|
+
m.insert(
|
|
139
|
+
"has_diff_content".to_string(),
|
|
140
|
+
serde_json::json!(has_diff_content),
|
|
141
|
+
);
|
|
142
|
+
m
|
|
143
|
+
},
|
|
144
|
+
})
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Result of a [`Chooser`] picking one submission out of several.
#[derive(Debug, Clone)]
pub struct ChooserOutput {
    /// Position of the selected submission within the input slice.
    pub best_index: usize,
    /// Score computed for each submission, in input order.
    pub scores: Vec<f64>,
    /// Human-readable explanation of the selection.
    pub reasoning: String,
}
|
|
155
|
+
|
|
156
|
+
/// Trait for choosers
|
|
157
|
+
#[async_trait]
|
|
158
|
+
pub trait Chooser: Send + Sync {
|
|
159
|
+
/// Choose the best submission from a list
|
|
160
|
+
async fn choose(&self, submissions: &[ReviewSubmission]) -> Result<ChooserOutput>;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
/// Simple chooser that selects based on submission presence and length
|
|
164
|
+
pub struct SimpleChooser;
|
|
165
|
+
|
|
166
|
+
#[async_trait]
|
|
167
|
+
impl Chooser for SimpleChooser {
|
|
168
|
+
async fn choose(&self, submissions: &[ReviewSubmission]) -> Result<ChooserOutput> {
|
|
169
|
+
if submissions.is_empty() {
|
|
170
|
+
return Err(SWEAgentError::ConfigurationError(
|
|
171
|
+
"No submissions to choose from".to_string(),
|
|
172
|
+
));
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
// Score based on: has submission (0.5) + patch length normalized (0.5)
|
|
176
|
+
let scores: Vec<f64> = submissions
|
|
177
|
+
.iter()
|
|
178
|
+
.map(|s| {
|
|
179
|
+
let has_submission = s.submission.is_some();
|
|
180
|
+
let patch_len = s.submission.as_ref().map(|p| p.len()).unwrap_or(0);
|
|
181
|
+
let base_score = if has_submission { 0.5 } else { 0.0 };
|
|
182
|
+
let len_score = (patch_len as f64 / 10000.0).min(0.5);
|
|
183
|
+
base_score + len_score
|
|
184
|
+
})
|
|
185
|
+
.collect();
|
|
186
|
+
|
|
187
|
+
let best_index = scores
|
|
188
|
+
.iter()
|
|
189
|
+
.enumerate()
|
|
190
|
+
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
|
|
191
|
+
.map(|(i, _)| i)
|
|
192
|
+
.unwrap_or(0);
|
|
193
|
+
|
|
194
|
+
Ok(ChooserOutput {
|
|
195
|
+
best_index,
|
|
196
|
+
scores: scores.clone(),
|
|
197
|
+
reasoning: format!(
|
|
198
|
+
"Selected submission {} with score {:.2} (scores: {:?})",
|
|
199
|
+
best_index, scores[best_index], scores
|
|
200
|
+
),
|
|
201
|
+
})
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
/// Abstract retry loop trait
|
|
206
|
+
#[async_trait]
|
|
207
|
+
pub trait RetryLoop: Send + Sync {
|
|
208
|
+
/// Called when a submission is made
|
|
209
|
+
fn on_submit(&mut self, submission: ReviewSubmission);
|
|
210
|
+
|
|
211
|
+
/// Check if should retry
|
|
212
|
+
fn should_retry(&self) -> bool;
|
|
213
|
+
|
|
214
|
+
/// Get the best submission index
|
|
215
|
+
fn get_best(&self) -> Option<usize>;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
/// Simple retry loop with max attempts
|
|
219
|
+
pub struct MaxAttemptsRetryLoop {
|
|
220
|
+
submissions: Vec<ReviewSubmission>,
|
|
221
|
+
max_attempts: usize,
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
impl MaxAttemptsRetryLoop {
|
|
225
|
+
pub fn new(max_attempts: usize) -> Self {
|
|
226
|
+
Self {
|
|
227
|
+
submissions: Vec::new(),
|
|
228
|
+
max_attempts,
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
#[async_trait]
|
|
234
|
+
impl RetryLoop for MaxAttemptsRetryLoop {
|
|
235
|
+
fn on_submit(&mut self, submission: ReviewSubmission) {
|
|
236
|
+
self.submissions.push(submission);
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
fn should_retry(&self) -> bool {
|
|
240
|
+
self.submissions.len() < self.max_attempts
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
fn get_best(&self) -> Option<usize> {
|
|
244
|
+
if self.submissions.is_empty() {
|
|
245
|
+
return None;
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
// Return the last submission with a patch, or the last one
|
|
249
|
+
self.submissions
|
|
250
|
+
.iter()
|
|
251
|
+
.enumerate()
|
|
252
|
+
.rev()
|
|
253
|
+
.find(|(_, s)| s.submission.is_some())
|
|
254
|
+
.map(|(i, _)| i)
|
|
255
|
+
.or(Some(self.submissions.len() - 1))
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
/// Retry loop with reviewer-based decisions
|
|
260
|
+
pub struct ReviewerRetryLoop {
|
|
261
|
+
submissions: Vec<(ReviewSubmission, ReviewerResult)>,
|
|
262
|
+
max_attempts: usize,
|
|
263
|
+
reviewer: Box<dyn Reviewer>,
|
|
264
|
+
score_threshold: f64,
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
impl ReviewerRetryLoop {
|
|
268
|
+
pub fn new(max_attempts: usize, reviewer: Box<dyn Reviewer>, score_threshold: f64) -> Self {
|
|
269
|
+
Self {
|
|
270
|
+
submissions: Vec::new(),
|
|
271
|
+
max_attempts,
|
|
272
|
+
reviewer,
|
|
273
|
+
score_threshold,
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
/// Async review and store - must be called separately from on_submit
|
|
278
|
+
pub async fn review_submission(
|
|
279
|
+
&mut self,
|
|
280
|
+
submission: ReviewSubmission,
|
|
281
|
+
) -> Result<ReviewerResult> {
|
|
282
|
+
let result = self.reviewer.review(&submission).await?;
|
|
283
|
+
self.submissions.push((submission, result.clone()));
|
|
284
|
+
Ok(result)
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
#[async_trait]
|
|
289
|
+
impl RetryLoop for ReviewerRetryLoop {
|
|
290
|
+
fn on_submit(&mut self, submission: ReviewSubmission) {
|
|
291
|
+
// Store with a placeholder review - actual review should use review_submission
|
|
292
|
+
self.submissions.push((
|
|
293
|
+
submission,
|
|
294
|
+
ReviewerResult {
|
|
295
|
+
score: 0.0,
|
|
296
|
+
feedback: "Not reviewed".to_string(),
|
|
297
|
+
should_retry: true,
|
|
298
|
+
extra: HashMap::new(),
|
|
299
|
+
},
|
|
300
|
+
));
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
fn should_retry(&self) -> bool {
|
|
304
|
+
if self.submissions.is_empty() {
|
|
305
|
+
return true;
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// Stop if we've hit max attempts
|
|
309
|
+
if self.submissions.len() >= self.max_attempts {
|
|
310
|
+
return false;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
// Stop if last submission passed threshold
|
|
314
|
+
if let Some((_, result)) = self.submissions.last() {
|
|
315
|
+
if result.score >= self.score_threshold {
|
|
316
|
+
return false;
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
true
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
fn get_best(&self) -> Option<usize> {
|
|
324
|
+
self.submissions
|
|
325
|
+
.iter()
|
|
326
|
+
.enumerate()
|
|
327
|
+
.max_by(|(_, (_, a)), (_, (_, b))| a.score.partial_cmp(&b.score).unwrap())
|
|
328
|
+
.map(|(i, _)| i)
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
/// Serde default for the `threshold` field of
/// `RetryLoopConfig::SubmissionPresence`.
fn default_threshold() -> f64 {
    const DEFAULT_SCORE_THRESHOLD: f64 = 0.5;
    DEFAULT_SCORE_THRESHOLD
}
|
|
335
|
+
|
|
336
|
+
/// Serde default for the `min_lines` field of
/// `RetryLoopConfig::PatchPresence`.
fn default_min_lines() -> usize {
    const DEFAULT_MIN_LINES: usize = 1;
    DEFAULT_MIN_LINES
}
|
|
339
|
+
|
|
340
|
+
/// Configuration for retry loops
|
|
341
|
+
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
|
342
|
+
#[serde(tag = "type", rename_all = "snake_case")]
|
|
343
|
+
pub enum RetryLoopConfig {
|
|
344
|
+
/// No retry - run once
|
|
345
|
+
#[default]
|
|
346
|
+
None,
|
|
347
|
+
/// Retry up to max_attempts times
|
|
348
|
+
MaxAttempts { max_attempts: usize },
|
|
349
|
+
/// Retry based on submission presence
|
|
350
|
+
SubmissionPresence {
|
|
351
|
+
max_attempts: usize,
|
|
352
|
+
#[serde(default = "default_threshold")]
|
|
353
|
+
threshold: f64,
|
|
354
|
+
},
|
|
355
|
+
/// Retry based on patch presence
|
|
356
|
+
PatchPresence {
|
|
357
|
+
max_attempts: usize,
|
|
358
|
+
#[serde(default = "default_min_lines")]
|
|
359
|
+
min_lines: usize,
|
|
360
|
+
},
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
/// Create a retry loop from configuration
|
|
364
|
+
pub fn get_retry_loop_from_config(config: &RetryLoopConfig) -> Box<dyn RetryLoop> {
|
|
365
|
+
match config {
|
|
366
|
+
RetryLoopConfig::None => Box::new(MaxAttemptsRetryLoop::new(1)),
|
|
367
|
+
RetryLoopConfig::MaxAttempts { max_attempts } => {
|
|
368
|
+
Box::new(MaxAttemptsRetryLoop::new(*max_attempts))
|
|
369
|
+
}
|
|
370
|
+
RetryLoopConfig::SubmissionPresence {
|
|
371
|
+
max_attempts,
|
|
372
|
+
threshold,
|
|
373
|
+
} => Box::new(ReviewerRetryLoop::new(
|
|
374
|
+
*max_attempts,
|
|
375
|
+
Box::new(SubmissionPresenceReviewer::new(*threshold)),
|
|
376
|
+
*threshold,
|
|
377
|
+
)),
|
|
378
|
+
RetryLoopConfig::PatchPresence {
|
|
379
|
+
max_attempts,
|
|
380
|
+
min_lines,
|
|
381
|
+
} => Box::new(ReviewerRetryLoop::new(
|
|
382
|
+
*max_attempts,
|
|
383
|
+
Box::new(PatchPresenceReviewer::new(*min_lines)),
|
|
384
|
+
0.5,
|
|
385
|
+
)),
|
|
386
|
+
}
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a submission with an empty trajectory, default agent info, and
    /// the given optional patch text.
    fn submission_with(patch: Option<&str>) -> ReviewSubmission {
        ReviewSubmission {
            trajectory: vec![],
            info: AgentInfo::default(),
            submission: patch.map(String::from),
        }
    }

    /// The pass-through reviewer gives a full score and does not ask for a retry.
    #[tokio::test]
    async fn test_pass_through_reviewer() {
        let reviewer = PassThroughReviewer;
        let candidate = submission_with(Some("test patch"));

        let verdict = reviewer.review(&candidate).await.unwrap();
        assert_eq!(verdict.score, 1.0);
        assert!(!verdict.should_retry);
    }

    /// A present submission scores 1.0; a missing one scores 0.0 and requests a retry.
    #[tokio::test]
    async fn test_submission_presence_reviewer() {
        let reviewer = SubmissionPresenceReviewer::new(0.5);

        // With submission
        let present = submission_with(Some("patch content"));
        let verdict = reviewer.review(&present).await.unwrap();
        assert_eq!(verdict.score, 1.0);
        assert!(!verdict.should_retry);

        // Without submission
        let absent = submission_with(None);
        let verdict = reviewer.review(&absent).await.unwrap();
        assert_eq!(verdict.score, 0.0);
        assert!(verdict.should_retry);
    }

    /// Diff-shaped text scores 1.0; plain text without diff content scores 0.0.
    #[tokio::test]
    async fn test_patch_presence_reviewer() {
        let reviewer = PatchPresenceReviewer::new(2);

        // Valid patch
        let diff_like = submission_with(Some("--- a/file.py\n+++ b/file.py\n+new line\n-old line"));
        let verdict = reviewer.review(&diff_like).await.unwrap();
        assert_eq!(verdict.score, 1.0);

        // Invalid patch (no diff content)
        let plain_text = submission_with(Some("just some text\nno diff here"));
        let verdict = reviewer.review(&plain_text).await.unwrap();
        assert_eq!(verdict.score, 0.0);
    }

    /// The simple chooser picks the entry that carries a submission.
    #[tokio::test]
    async fn test_simple_chooser() {
        let chooser = SimpleChooser;
        let candidates = vec![
            submission_with(None),
            submission_with(Some("a longer patch content here")),
        ];

        let choice = chooser.choose(&candidates).await.unwrap();
        assert_eq!(choice.best_index, 1);
    }

    /// The max-attempts loop retries until three attempts are recorded and
    /// reports the attempt that carried a submission as best.
    #[test]
    fn test_max_attempts_retry_loop() {
        let mut retry_loop = MaxAttemptsRetryLoop::new(3);

        assert!(retry_loop.should_retry());

        retry_loop.on_submit(submission_with(None));
        assert!(retry_loop.should_retry());

        retry_loop.on_submit(submission_with(Some("patch")));
        assert!(retry_loop.should_retry());

        retry_loop.on_submit(submission_with(None));
        assert!(!retry_loop.should_retry());

        // Best should be index 1 (the one with submission)
        assert_eq!(retry_loop.get_best(), Some(1));
    }
}
|