@elizaos/sweagent-root 2.0.0-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +270 -0
- package/package.json +71 -0
- package/python/LICENSE +21 -0
- package/python/config/README.md +15 -0
- package/python/config/bash_only.yaml +222 -0
- package/python/config/benchmarks/250212_sweagent_heavy_sbl.yaml +188 -0
- package/python/config/benchmarks/250225_anthropic_filemap_simple_review.yaml +75 -0
- package/python/config/benchmarks/250522_anthropic_filemap_simple_review.yaml +92 -0
- package/python/config/benchmarks/250526_anthropic_filemap_simple_review_sbl.yaml +93 -0
- package/python/config/benchmarks/anthropic_filemap_multilingual.yaml +66 -0
- package/python/config/coding_challenge.yaml +104 -0
- package/python/config/default.yaml +69 -0
- package/python/config/default_backticks.yaml +69 -0
- package/python/config/default_mm_no_images.yaml +82 -0
- package/python/config/default_mm_with_images.yaml +83 -0
- package/python/config/demo/default.yaml +80 -0
- package/python/config/demo/no_instructions.yaml +69 -0
- package/python/config/demo/only_bash.yaml +60 -0
- package/python/config/exotic/default_shell.yaml +52 -0
- package/python/config/exotic/windowed_replace.yaml +125 -0
- package/python/config/exotic/windowed_replace_late_repro.yaml +127 -0
- package/python/config/human/human.yaml +24 -0
- package/python/config/human/human_demo.yaml +52 -0
- package/python/config/sweagent_0_7/07.yaml +101 -0
- package/python/config/sweagent_0_7/07_fcalling.yaml +100 -0
- package/python/config/sweagent_0_7/07_from_url.yaml +114 -0
- package/python/config/sweagent_0_7/07_thought_action.yaml +102 -0
- package/python/config/sweagent_0_7/07_thought_action_xml.yaml +96 -0
- package/python/mlc_config.json +44 -0
- package/python/pyproject.toml +262 -0
- package/python/sweagent/__init__.py +114 -0
- package/python/sweagent/__main__.py +4 -0
- package/python/sweagent/agent/__init__.py +0 -0
- package/python/sweagent/agent/action_sampler.py +317 -0
- package/python/sweagent/agent/agents.py +1294 -0
- package/python/sweagent/agent/extra/shell_agent.py +106 -0
- package/python/sweagent/agent/history_processors.py +399 -0
- package/python/sweagent/agent/hooks/__init__.py +0 -0
- package/python/sweagent/agent/hooks/abstract.py +139 -0
- package/python/sweagent/agent/hooks/status.py +34 -0
- package/python/sweagent/agent/models.py +896 -0
- package/python/sweagent/agent/problem_statement.py +312 -0
- package/python/sweagent/agent/reviewer.py +664 -0
- package/python/sweagent/environment/__init__.py +0 -0
- package/python/sweagent/environment/hooks/__init__.py +0 -0
- package/python/sweagent/environment/hooks/abstract.py +60 -0
- package/python/sweagent/environment/hooks/status.py +28 -0
- package/python/sweagent/environment/repo.py +219 -0
- package/python/sweagent/environment/swe_env.py +276 -0
- package/python/sweagent/exceptions.py +54 -0
- package/python/sweagent/inspector/README.md +6 -0
- package/python/sweagent/inspector/__init__.py +0 -0
- package/python/sweagent/inspector/favicon.ico +0 -0
- package/python/sweagent/inspector/fileViewer.js +354 -0
- package/python/sweagent/inspector/icons/computer.png +0 -0
- package/python/sweagent/inspector/icons/edit_icon.svg +11 -0
- package/python/sweagent/inspector/icons/swe-agent-logo-50.png +0 -0
- package/python/sweagent/inspector/icons/swellama_blue.png +0 -0
- package/python/sweagent/inspector/icons/swellama_brown.png +0 -0
- package/python/sweagent/inspector/icons/swellama_grey.png +0 -0
- package/python/sweagent/inspector/icons/swellama_tan.png +0 -0
- package/python/sweagent/inspector/index.html +25 -0
- package/python/sweagent/inspector/server.py +354 -0
- package/python/sweagent/inspector/static.py +169 -0
- package/python/sweagent/inspector/style.css +454 -0
- package/python/sweagent/run/__init__.py +0 -0
- package/python/sweagent/run/_progress.py +158 -0
- package/python/sweagent/run/batch_instances.py +419 -0
- package/python/sweagent/run/common.py +387 -0
- package/python/sweagent/run/compare_runs.py +123 -0
- package/python/sweagent/run/extract_pred.py +19 -0
- package/python/sweagent/run/hooks/__init__.py +0 -0
- package/python/sweagent/run/hooks/abstract.py +67 -0
- package/python/sweagent/run/hooks/apply_patch.py +106 -0
- package/python/sweagent/run/hooks/open_pr.py +244 -0
- package/python/sweagent/run/hooks/swe_bench_evaluate.py +113 -0
- package/python/sweagent/run/inspector_cli.py +493 -0
- package/python/sweagent/run/merge_predictions.py +64 -0
- package/python/sweagent/run/quick_stats.py +96 -0
- package/python/sweagent/run/remove_unfinished.py +63 -0
- package/python/sweagent/run/rich_test.py +91 -0
- package/python/sweagent/run/run.py +147 -0
- package/python/sweagent/run/run_batch.py +442 -0
- package/python/sweagent/run/run_replay.py +219 -0
- package/python/sweagent/run/run_shell.py +155 -0
- package/python/sweagent/run/run_single.py +225 -0
- package/python/sweagent/run/run_traj_to_demo.py +85 -0
- package/python/sweagent/tools/__init__.py +0 -0
- package/python/sweagent/tools/bundle.py +57 -0
- package/python/sweagent/tools/commands.py +220 -0
- package/python/sweagent/tools/parsing.py +619 -0
- package/python/sweagent/tools/tools.py +430 -0
- package/python/sweagent/tools/utils.py +108 -0
- package/python/sweagent/types.py +102 -0
- package/python/sweagent/utils/__init__.py +0 -0
- package/python/sweagent/utils/config.py +80 -0
- package/python/sweagent/utils/files.py +27 -0
- package/python/sweagent/utils/github.py +118 -0
- package/python/sweagent/utils/jinja_warnings.py +14 -0
- package/python/sweagent/utils/log.py +175 -0
- package/python/sweagent/utils/patch_formatter.py +152 -0
- package/python/sweagent/utils/serialization.py +45 -0
- package/python/tests/__init__.py +0 -0
- package/python/tests/conftest.py +191 -0
- package/python/tests/test_agent.py +258 -0
- package/python/tests/test_batch_instance.py +43 -0
- package/python/tests/test_commands/_interactive_dummy.py +35 -0
- package/python/tests/test_commands/interactive_dummy_wrapper.sh +29 -0
- package/python/tests/test_data/config_files/dummy_interactive.yaml +62 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/README.md +13 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/customrandom.c +50 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/release +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/server +0 -0
- package/python/tests/test_data/data_sources/ctf/crypto/Katy/solver.py +12 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/README.md +16 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/challenge.json +9 -0
- package/python/tests/test_data/data_sources/ctf/forensics/flash/flash_c8429a430278283c0e571baebca3d139.zip +0 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/README.md +15 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/challenge.json +10 -0
- package/python/tests/test_data/data_sources/ctf/misc/networking_1/networking.pcap +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/Dockerfile +28 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/challenge.json +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/flag.txt +1 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup +0 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.c +26 -0
- package/python/tests/test_data/data_sources/ctf/pwn/warmup/warmup.py +9 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/README.md +14 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/challenge.json +8 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock +0 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/rock.cpp +167 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/solution.cpp +24 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/solution.py +6 -0
- package/python/tests/test_data/data_sources/ctf/rev/rock/test_solver/test.sh +10 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/000-default.conf +18 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/Dockerfile +20 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/file.pl +38 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/forms.pl +40 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/cgi/hello.pl +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/challenge.json +12 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/docker-compose.yml +14 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/flag +1 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/index.html +11 -0
- package/python/tests/test_data/data_sources/ctf/web/i_got_id_demo/solution.txt +1 -0
- package/python/tests/test_data/data_sources/debug_20240322.json +1 -0
- package/python/tests/test_data/data_sources/expert_instances.yaml +16 -0
- package/python/tests/test_data/data_sources/human_eval.json +1 -0
- package/python/tests/test_data/data_sources/simple_instances.yaml +3 -0
- package/python/tests/test_data/data_sources/simple_instances_long.yaml +30 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-dev-easy_first_only.json +1 -0
- package/python/tests/test_data/data_sources/swe-bench-lite-test.json +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/6e44b9__sweagenttestrepo-1c2844.traj +342 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/solution_missing_colon.py +15 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/args.yaml +518 -0
- package/python/tests/test_data/trajectories/gpt4__swe-agent__test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/swe-agent__test-repo-i1.traj +124 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/all_preds.jsonl +1 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/args.yaml +520 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/patches/pydicom__pydicom-1458.patch +18 -0
- package/python/tests/test_data/trajectories/gpt4__swe-bench-dev-easy_first_only__default__t-0.00__p-0.95__c-3.00__install-1/pydicom__pydicom-1458.traj +257 -0
- package/python/tests/test_env.py +66 -0
- package/python/tests/test_env_utils.py +129 -0
- package/python/tests/test_history_processors.py +40 -0
- package/python/tests/test_models.py +23 -0
- package/python/tests/test_openai_live.py +164 -0
- package/python/tests/test_packaging.py +7 -0
- package/python/tests/test_parsing.py +131 -0
- package/python/tests/test_problem_statement_multimodal.py +111 -0
- package/python/tests/test_quick_stats.py +42 -0
- package/python/tests/test_run.py +37 -0
- package/python/tests/test_run_batch.py +110 -0
- package/python/tests/test_run_hooks.py +114 -0
- package/python/tests/test_run_replay.py +33 -0
- package/python/tests/test_run_single.py +125 -0
- package/python/tests/test_tools_command_parsing.py +193 -0
- package/python/tests/test_utils.py +15 -0
- package/python/tests/tools/__init__.py +0 -0
- package/python/tests/tools/conftest.py +12 -0
- package/python/tests/tools/test_default_utils.py +153 -0
- package/python/tests/tools/test_edit_replace.py +0 -0
- package/python/tests/tools/test_split_string.py +82 -0
- package/python/tests/utils.py +29 -0
- package/python/tools/diff_state/bin/_state_diff_state +52 -0
- package/python/tools/diff_state/config.yaml +2 -0
- package/python/tools/edit_anthropic/bin/_state_anthropic +21 -0
- package/python/tools/edit_anthropic/bin/str_replace_editor +710 -0
- package/python/tools/edit_anthropic/config.yaml +56 -0
- package/python/tools/edit_anthropic/install.sh +3 -0
- package/python/tools/filemap/bin/filemap +45 -0
- package/python/tools/filemap/config.yaml +9 -0
- package/python/tools/filemap/install.sh +2 -0
- package/python/tools/forfeit/bin/exit_forfeit +5 -0
- package/python/tools/forfeit/config.yaml +5 -0
- package/python/tools/image_tools/bin/view_image +36 -0
- package/python/tools/image_tools/config.yaml +9 -0
- package/python/tools/multilingual_setup/bin/do_nothing +2 -0
- package/python/tools/multilingual_setup/config.yaml +1 -0
- package/python/tools/multilingual_setup/install.sh +45 -0
- package/python/tools/registry/bin/_read_env +10 -0
- package/python/tools/registry/bin/_write_env +10 -0
- package/python/tools/registry/config.yaml +1 -0
- package/python/tools/registry/install.sh +6 -0
- package/python/tools/registry/lib/__init__.py +0 -0
- package/python/tools/registry/lib/registry.py +56 -0
- package/python/tools/review_on_submit_m/README.md +6 -0
- package/python/tools/review_on_submit_m/bin/submit +54 -0
- package/python/tools/review_on_submit_m/config.yaml +6 -0
- package/python/tools/review_on_submit_m/install.sh +0 -0
- package/python/tools/search/bin/find_file +31 -0
- package/python/tools/search/bin/search_dir +39 -0
- package/python/tools/search/bin/search_file +55 -0
- package/python/tools/search/config.yaml +37 -0
- package/python/tools/search/install.sh +3 -0
- package/python/tools/submit/bin/submit +17 -0
- package/python/tools/submit/config.yaml +5 -0
- package/python/tools/web_browser/bin/click_mouse +41 -0
- package/python/tools/web_browser/bin/close_site +28 -0
- package/python/tools/web_browser/bin/double_click_mouse +37 -0
- package/python/tools/web_browser/bin/drag_mouse +46 -0
- package/python/tools/web_browser/bin/execute_script_on_page +39 -0
- package/python/tools/web_browser/bin/get_console_output +48 -0
- package/python/tools/web_browser/bin/move_mouse +35 -0
- package/python/tools/web_browser/bin/navigate_back +33 -0
- package/python/tools/web_browser/bin/navigate_forward +33 -0
- package/python/tools/web_browser/bin/open_site +36 -0
- package/python/tools/web_browser/bin/press_keys_on_page +51 -0
- package/python/tools/web_browser/bin/reload_page +33 -0
- package/python/tools/web_browser/bin/run_web_browser_server +394 -0
- package/python/tools/web_browser/bin/screenshot_site +38 -0
- package/python/tools/web_browser/bin/scroll_on_page +40 -0
- package/python/tools/web_browser/bin/set_browser_window_size +40 -0
- package/python/tools/web_browser/bin/type_text +34 -0
- package/python/tools/web_browser/bin/wait_time +39 -0
- package/python/tools/web_browser/config.yaml +155 -0
- package/python/tools/web_browser/install.sh +22 -0
- package/python/tools/web_browser/lib/browser_manager.py +404 -0
- package/python/tools/web_browser/lib/web_browser_config.py +33 -0
- package/python/tools/web_browser/lib/web_browser_utils.py +126 -0
- package/python/tools/web_browser/test_console.html +1 -0
- package/python/tools/windowed/bin/_state +25 -0
- package/python/tools/windowed/bin/create +29 -0
- package/python/tools/windowed/bin/goto +37 -0
- package/python/tools/windowed/bin/open +49 -0
- package/python/tools/windowed/bin/scroll_down +12 -0
- package/python/tools/windowed/bin/scroll_up +13 -0
- package/python/tools/windowed/config.yaml +38 -0
- package/python/tools/windowed/install.sh +15 -0
- package/python/tools/windowed/lib/__init__.py +0 -0
- package/python/tools/windowed/lib/flake8_utils.py +147 -0
- package/python/tools/windowed/lib/windowed_file.py +312 -0
- package/python/tools/windowed_edit_linting/bin/edit +128 -0
- package/python/tools/windowed_edit_linting/config.yaml +31 -0
- package/python/tools/windowed_edit_linting/install.sh +5 -0
- package/python/tools/windowed_edit_replace/bin/edit +172 -0
- package/python/tools/windowed_edit_replace/bin/insert +77 -0
- package/python/tools/windowed_edit_replace/config.yaml +60 -0
- package/python/tools/windowed_edit_replace/install.sh +5 -0
- package/python/tools/windowed_edit_rewrite/bin/edit +78 -0
- package/python/tools/windowed_edit_rewrite/config.yaml +11 -0
- package/python/tools/windowed_edit_rewrite/install.sh +5 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyEncryption.traj +318 -0
- package/python/trajectories/demonstrations/ctf/crypto/BabyTimeCapsule.traj +197 -0
- package/python/trajectories/demonstrations/ctf/crypto/eps.traj +289 -0
- package/python/trajectories/demonstrations/ctf/crypto/katy.traj +368 -0
- package/python/trajectories/demonstrations/ctf/forensics/flash.traj +102 -0
- package/python/trajectories/demonstrations/ctf/misc/networking_1.traj +102 -0
- package/python/trajectories/demonstrations/ctf/pwn/warmup.traj +159 -0
- package/python/trajectories/demonstrations/ctf/rev/rock.traj +251 -0
- package/python/trajectories/demonstrations/ctf/web/i_got_id_demo.traj +422 -0
- package/python/trajectories/demonstrations/function_calling_simple.traj +151 -0
- package/python/trajectories/demonstrations/human_thought__swe-bench-HumanEvalFix-python__lcb__t-0.00__p-0.95__c-4.00__install-0/humanevalfix-python-0.traj +129 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default__t-0.20__p-0.95__c-2.00__install-1___install_from_source/marshmallow-code__marshmallow-1867.traj +318 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling__install-1/marshmallow-code__marshmallow-1867.traj +594 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace__install-1/marshmallow-code__marshmallow-1867.traj +592 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__function_calling_replace_from_source/marshmallow-code__marshmallow-1867.traj +3316 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +251 -0
- package/python/trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj +399 -0
- package/python/trajectories/demonstrations/str_replace_anthropic_demo.yaml +432 -0
- package/rust/Cargo.toml +100 -0
- package/rust/README.md +49 -0
- package/rust/src/agent/action_sampler.rs +130 -0
- package/rust/src/agent/agents.rs +1029 -0
- package/rust/src/agent/history_processors.rs +277 -0
- package/rust/src/agent/hooks/mod.rs +208 -0
- package/rust/src/agent/mod.rs +24 -0
- package/rust/src/agent/models.rs +837 -0
- package/rust/src/agent/problem_statement.rs +355 -0
- package/rust/src/agent/reviewer.rs +505 -0
- package/rust/src/bin/sweagent.rs +784 -0
- package/rust/src/environment/deployment.rs +631 -0
- package/rust/src/environment/hooks/mod.rs +114 -0
- package/rust/src/environment/mod.rs +16 -0
- package/rust/src/environment/repo.rs +265 -0
- package/rust/src/environment/runtime.rs +237 -0
- package/rust/src/environment/swe_env.rs +248 -0
- package/rust/src/exceptions.rs +228 -0
- package/rust/src/lib.rs +68 -0
- package/rust/src/monitoring.rs +482 -0
- package/rust/src/run/hooks/mod.rs +134 -0
- package/rust/src/run/mod.rs +12 -0
- package/rust/src/run/run_batch.rs +563 -0
- package/rust/src/run/run_single.rs +196 -0
- package/rust/src/tools/bundle.rs +224 -0
- package/rust/src/tools/commands.rs +173 -0
- package/rust/src/tools/mod.rs +295 -0
- package/rust/src/tools/parsing.rs +354 -0
- package/rust/src/tools/registry.rs +143 -0
- package/rust/src/types.rs +554 -0
- package/rust/src/utils/config.rs +105 -0
- package/rust/src/utils/files.rs +137 -0
- package/rust/src/utils/github.rs +171 -0
- package/rust/src/utils/log.rs +65 -0
- package/rust/src/utils/mod.rs +17 -0
- package/rust/src/utils/serialization.rs +181 -0
- package/rust/src/utils/template.rs +173 -0
- package/typescript/README.md +335 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from sweagent.run.hooks.open_pr import _remove_triple_backticks, format_trajectory_markdown
|
|
8
|
+
from sweagent.utils.github import (
|
|
9
|
+
InvalidGithubURL,
|
|
10
|
+
_get_associated_commit_urls,
|
|
11
|
+
_is_github_issue_url,
|
|
12
|
+
_is_github_repo_url,
|
|
13
|
+
_parse_gh_issue_url,
|
|
14
|
+
_parse_gh_repo_url,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_format_trajectory_markdown(test_trajectory):
    """The formatted trajectory starts with ``<details>`` and ends with ``</details>``."""
    markdown = format_trajectory_markdown(test_trajectory["trajectory"])
    opens_details = markdown.startswith("<details>")
    closes_details = markdown.endswith("</details>")
    assert opens_details
    assert closes_details
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_remove_triple_backticks():
    """A string made only of a triple-backtick fence is reduced to the empty string."""
    fence_only = "```"
    stripped = _remove_triple_backticks(fence_only)
    assert stripped == ""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_is_github_repo_url():
    """Check which strings ``_is_github_repo_url`` accepts and which it rejects."""
    accepted = (
        "https://github.com/SWE-agent/SWE-agent",
        "https://github.com/SWE-agent/SWE-agent/anything",
        "github.com/SWE-agent/SWE-agent/anything",
    )
    rejected = (
        "",
        "/path/to/file",
    )
    for candidate in accepted:
        assert _is_github_repo_url(candidate)
    for candidate in rejected:
        assert not _is_github_repo_url(candidate)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_parse_gh_repo_url():
    """Every accepted URL spelling parses to the same ``(owner, repo)`` pair."""
    expected = ("SWE-agent", "SWE-agent")
    spellings = (
        "https://github.com/SWE-agent/SWE-agent",
        "github.com/SWE-agent/SWE-agent",
        "github.com/SWE-agent/SWE-agent/asdfjsdfg",
        "git@github.com/SWE-agent/SWE-agent/asdfjsdfg",
    )
    for spelling in spellings:
        assert _parse_gh_repo_url(spelling) == expected
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_parse_gh_repo_url_fails():
    """Malformed repository URLs raise ``InvalidGithubURL``."""
    malformed = (
        "adfkj;lasdfl;kj",
        "github.com/",
        "github.com//a/",
    )
    for bad_url in malformed:
        with pytest.raises(InvalidGithubURL):
            _parse_gh_repo_url(bad_url)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_parse_gh_issue_url():
    """An issue URL is split into owner, repository, and issue number (as a string)."""
    parsed = _parse_gh_issue_url("https://github.com/SWE-agent/SWE-agent/issues/43")
    # Unpack explicitly so the test still requires exactly three components.
    owner, repo, number = parsed
    assert (owner, repo, number) == ("SWE-agent", "SWE-agent", "43")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_parse_gh_issue_url_fails():
    """URLs that do not point at an issue raise ``InvalidGithubURL``."""
    not_issue_urls = (
        "https://github.com/a/b",
        "https://github.com/a/b////",
    )
    for bad_url in not_issue_urls:
        with pytest.raises(InvalidGithubURL):
            _parse_gh_issue_url(bad_url)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_is_from_github_url():
    """The empty string is not an issue URL; a full GitHub issue link is."""
    empty_result = _is_github_issue_url("")
    assert not empty_result
    issue_result = _is_github_issue_url("https://github.com/SWE-agent/SWE-agent/issues/43")
    assert issue_result
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_get_associated_commit_urls(monkeypatch: pytest.MonkeyPatch):
    """Run ``_get_associated_commit_urls`` against a stubbed ``GhApi``.

    The stub serves three issue events and can return two different commits.
    For issue 41 the test expects exactly one URL back: the one of commit
    ``abc123``.
    """
    # Patch GhApi used inside sweagent.utils.github
    import sweagent.utils.github as gh

    class FakeEvent:
        """Minimal issue event exposing ``event`` and ``commit_id``."""

        def __init__(self, event: str, commit_id: str | None):
            self.event, self.commit_id = event, commit_id

    class FakeCommit:
        """Minimal commit exposing ``commit.message`` and ``html_url``."""

        def __init__(self, message: str, html_url: str):
            # Build a throwaway class so the message is reachable as ``.commit.message``.
            commit_cls = type("CommitObj", (), {"message": message})
            self.commit = commit_cls()
            self.html_url = html_url

    class FakeIssues:
        def list_events(self, _org: str, _repo: str, _issue_number: str):
            timeline = (
                ("referenced", "abc123"),
                ("commented", "zzz999"),
                ("referenced", None),
            )
            return [FakeEvent(kind, sha) for kind, sha in timeline]

    class FakeRepos:
        def get_commit(self, _org: str, _repo: str, commit_id: str):
            if commit_id != "abc123":
                return FakeCommit(
                    message="Unrelated commit",
                    html_url="https://github.com/SWE-agent/SWE-agent/commit/zzz999",
                )
            return FakeCommit(
                message="Fixes #41: handle edge case",
                html_url="https://github.com/SWE-agent/SWE-agent/commit/abc123",
            )

    class FakeGhApi:
        def __init__(self, token: str = ""):
            self.token = token
            self.issues = FakeIssues()
            self.repos = FakeRepos()

    monkeypatch.setattr(gh, "GhApi", FakeGhApi)

    found_urls = _get_associated_commit_urls(
        org="SWE-agent",
        repo="SWE-agent",
        issue_number="41",
        token="",
    )

    expected_urls = ["https://github.com/SWE-agent/SWE-agent/commit/abc123"]
    assert found_urls == expected_urls
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def clone_repo(tmp_path, repo_url):
    """Run ``git clone <repo_url>`` with ``tmp_path`` as the working directory.

    ``check=True`` makes ``subprocess.run`` raise ``CalledProcessError`` when
    git exits with a non-zero status.
    """
    subprocess.run(["git", "clone", repo_url], check=True, cwd=tmp_path)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from sweagent.agent.history_processors import LastNObservations, TagToolCallObservations
|
|
7
|
+
from sweagent.types import History
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_history(traj_path: Path):
    """Load a trajectory file and return the value stored under its ``"history"`` key.

    The file is parsed as JSON. It is decoded explicitly as UTF-8 so the result
    does not depend on the platform's locale encoding.

    Raises:
        KeyError: if the parsed document has no ``"history"`` key.
    """
    document = json.loads(traj_path.read_text(encoding="utf-8"))
    return document["history"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def count_elided_observations(history: History):
    """Count the entries whose ``content`` contains ``"Old environment output"``."""
    marker = "Old environment output"
    return sum(1 for item in history if marker in item["content"])
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.fixture
def test_history(test_trajectories_path: Path):
    """Fixture returning the ``history`` of a stored sample trajectory file."""
    relative_traj = (
        "gpt4__swe-agent-test-repo__default_from_url__t-0.00__p-0.95__c-3.00__install-1/6e44b9__sweagenttestrepo-1c2844.traj"
    )
    traj_file = test_trajectories_path / relative_traj
    return get_history(traj_file)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_last_n_observations(test_history: History):
    """With ``LastNObservations(n=3)``, all but 3 + 1 observations end up elided."""
    keep_last_three = LastNObservations(n=3)
    processed = keep_last_three(test_history)
    # Counted after the processor has run, matching the original statement order.
    observation_count = sum(1 for item in test_history if item["message_type"] == "observation")
    # extra -1 because instance template is kept
    expected_elided = observation_count - 3 - 1
    actual_elided = count_elided_observations(processed)
    assert actual_elided == expected_elided
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_add_tag_to_edits(test_history: History):
    """Entries whose action starts with ``"edit "`` carry exactly the ``test`` tag."""
    tagger = TagToolCallObservations(tags={"test"}, function_names={"edit"})
    tagged_history = tagger(test_history)
    for item in tagged_history:
        action = item.get("action", "")  # type: ignore
        if not action.startswith("edit "):
            continue
        assert item.get("tags") == ["test"], item
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic import SecretStr
|
|
4
|
+
|
|
5
|
+
from sweagent.agent.models import GenericAPIModelConfig, get_model
|
|
6
|
+
from sweagent.tools.parsing import Identity
|
|
7
|
+
from sweagent.tools.tools import ToolConfig
|
|
8
|
+
from sweagent.types import History
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_litellm_mock():
    """Query a model configured with a ``mock_response`` and check the returned message.

    NOTE(review): presumably ``mock_response`` makes the backend return the
    canned text without contacting a provider - confirm against the model layer.
    """
    model_config = GenericAPIModelConfig(
        name="gpt-4o",
        completion_kwargs={"mock_response": "Hello, world!"},
        api_key=SecretStr("dummy_key"),
        top_p=None,
    )
    tool_config = ToolConfig(
        parse_function=Identity(),
    )
    model = get_model(model_config, tool_config)

    prompt = History([{"role": "user", "content": "Hello, world!"}])  # type: ignore
    reply = model.query(prompt)
    assert reply == {"message": "Hello, world!"}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Offline OpenAI integration-style tests (despite the "live" file name).
|
|
3
|
+
|
|
4
|
+
These tests require
|
|
5
|
+
no network access. This file provides deterministic offline tests that validate
|
|
6
|
+
the expected response shape and basic caller behavior.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import TypedDict
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Message(TypedDict):
    """One chat message: who is speaking and what was said."""

    # Speaker of the message; this file uses "system", "user", and "assistant".
    role: str
    # Text of the message.
    content: str
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Choice(TypedDict):
    """One entry of a response's ``choices`` list."""

    # Position of this choice within ``choices``.
    index: int
    # The message carried by this choice.
    message: Message
    # Why generation ended, e.g. "stop".
    finish_reason: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Usage(TypedDict):
    """Token accounting reported alongside a response."""

    # Tokens attributed to the input messages.
    prompt_tokens: int
    # Tokens attributed to the generated reply.
    completion_tokens: int
    # Combined token count for the request.
    total_tokens: int
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OpenAIResponse(TypedDict):
    """Top-level shape of a chat-completion style response."""

    # Identifier of the response.
    id: str
    # Kind of object, e.g. "chat.completion".
    object: str
    # Creation timestamp as an integer.
    created: int
    # Name of the model the response is attributed to.
    model: str
    # Generated alternatives.
    choices: list[Choice]
    # Token accounting for the request.
    usage: Usage
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def call_openai(messages: list[Message], model: str = "gpt-4o-mini", max_tokens: int = 100) -> OpenAIResponse:
    """Return a deterministic OpenAI-like response (offline).

    The reply text is chosen from a small set of canned answers based on the
    most recent ``"user"`` message. Token counts are rough estimates of one
    token per four characters, with a minimum of one.

    Args:
        messages: Conversation so far; only the last user message selects the reply.
        model: Echoed back unchanged in the ``"model"`` field.
        max_tokens: Upper bound on the reported completion tokens. The reply is
            cut to ``max_tokens * 4`` characters only when its estimate exceeds
            this bound.

    Returns:
        A dict with ``id``, ``object``, ``created``, ``model``, one entry in
        ``choices``, and ``usage``. ``finish_reason`` is always ``"stop"``.
    """
    last_user = next((m["content"] for m in reversed(messages) if m["role"] == "user"), "")
    lowered = last_user.lower()

    if "exactly one word" in lowered:
        content = "hello"
    elif "multiply" in lowered and "3" in last_user:
        content = "12"
    elif "Write a Python function" in last_user:
        content = "def add(a: int, b: int) -> int:\n return a + b\n"
    elif "very long essay" in lowered:
        content = "Lorem ipsum " * 200
    else:
        content = "ok"

    # crude token estimate to satisfy invariants without depending on a tokenizer
    prompt_tokens = max(1, sum(max(1, len(m["content"]) // 4) for m in messages))
    estimated_tokens = max(1, len(content) // 4)
    completion_tokens = min(max_tokens, estimated_tokens)

    # Truncate only when max_tokens actually caps the reply. The estimate rounds
    # down, so cutting unconditionally would clip short replies ("hello" -> "hell").
    if estimated_tokens > max_tokens:
        content = content[: completion_tokens * 4]

    return {
        "id": "chatcmpl_test_1",
        "object": "chat.completion",
        "created": 0,
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": content},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class TestOpenAILive:
    """Response-shape checks run against the offline ``call_openai`` stub."""

    def test_connect_and_get_response(self) -> None:
        """A single-turn prompt yields a structurally complete chat completion."""
        conversation: list[Message] = [
            {"role": "system", "content": "You are a helpful assistant. Reply briefly."},
            {"role": "user", "content": "Say hello in exactly one word."},
        ]

        reply = call_openai(conversation)

        # Envelope fields
        assert reply is not None
        assert "id" in reply
        assert reply["object"] == "chat.completion"
        assert "gpt-4o-mini" in reply["model"]

        # Exactly one non-empty assistant message
        choices = reply["choices"]
        assert len(choices) == 1
        assistant_message = choices[0]["message"]
        assert assistant_message["role"] == "assistant"
        assert len(assistant_message["content"]) > 0

        # Every usage counter is positive
        usage = reply["usage"]
        for counter in ("prompt_tokens", "completion_tokens", "total_tokens"):
            assert usage[counter] > 0

    def test_multi_turn_conversation(self) -> None:
        """The reply is driven by the latest user turn of a longer history."""
        history: list[Message] = [
            {"role": "system", "content": "You are a helpful math tutor. Be brief."},
            {"role": "user", "content": "What is 2+2?"},
            {"role": "assistant", "content": "4"},
            {"role": "user", "content": "And if you multiply that by 3?"},
        ]

        reply_text = call_openai(history)["choices"][0]["message"]["content"]

        assert reply_text is not None
        # 4 * 3 == 12, so the answer has to mention it
        assert "12" in reply_text.lower()

    def test_max_tokens_respected(self) -> None:
        """The reported completion tokens never exceed ``max_tokens``."""
        prompt: list[Message] = [
            {"role": "user", "content": "Write a very long essay about programming."},
        ]

        reply = call_openai(prompt, max_tokens=100)

        # The essay is far longer than the cap, so the count must be bounded
        assert reply["usage"]["completion_tokens"] <= 100

    def test_code_related_queries(self) -> None:
        """A request for a Python function is answered with function source."""
        prompt: list[Message] = [
            {"role": "system", "content": "You are a coding assistant. Reply with code only."},
            {
                "role": "user",
                "content": "Write a Python function that adds two numbers. Only the function, no explanation.",
            },
        ]

        snippet = call_openai(prompt)["choices"][0]["message"]["content"]

        assert snippet is not None
        # A function definition has to start with the ``def`` keyword
        assert "def " in snippet

    def test_valid_token_counts(self) -> None:
        """Usage counters are positive and the total is the sum of its parts."""
        prompt: list[Message] = [{"role": "user", "content": "Hi"}]

        counts = call_openai(prompt)["usage"]

        assert counts["prompt_tokens"] > 0
        assert counts["completion_tokens"] > 0
        assert counts["total_tokens"] == counts["prompt_tokens"] + counts["completion_tokens"]
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class TestOpenAISkipped:
    """Placeholder suite retained from when the live tests could be skipped."""

    def test_skip_message(self) -> None:
        """Trivially passing test, kept only for backwards-compatibility."""
        assert True
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from jinja2 import Template
|
|
5
|
+
|
|
6
|
+
from sweagent.exceptions import FormatError, FunctionCallingFormatError
|
|
7
|
+
from sweagent.tools.commands import Command
|
|
8
|
+
from sweagent.tools.parsing import (
|
|
9
|
+
ActionParser,
|
|
10
|
+
EditFormat,
|
|
11
|
+
FunctionCallingParser,
|
|
12
|
+
Identity,
|
|
13
|
+
JsonParser,
|
|
14
|
+
ThoughtActionParser,
|
|
15
|
+
XMLThoughtActionParser,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_action_parser():
    """ActionParser echoes a known command as thought and action, and rejects others."""
    ls_command = Command(name="ls", docstring="")
    parse = ActionParser()

    parsed_thought, parsed_action = parse({"message": "ls -l"}, [ls_command])
    assert parsed_thought == "ls -l"
    assert parsed_action == "ls -l"

    # A message that does not start with a registered command is a format error
    with pytest.raises(FormatError):
        parse({"message": "invalid command"}, [ls_command])
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_thought_action_parser():
    """ThoughtActionParser splits free text from the fenced code block that follows it."""
    parse = ThoughtActionParser()
    raw_reply = "Let's look at the files in the current directory.\n```\nls -l\n```"

    parsed_thought, parsed_action = parse({"message": raw_reply}, [])
    assert parsed_thought == "Let's look at the files in the current directory.\n"
    assert parsed_action == "ls -l\n"

    # Without a fenced block there is no action to extract
    with pytest.raises(FormatError):
        parse({"message": "No code block"}, [])
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_xml_thought_action_parser():
    """XMLThoughtActionParser extracts the action from ``<command>`` tags."""
    parse = XMLThoughtActionParser()
    raw_reply = "Let's look at the files in the current directory.\n<command>\nls -l\n</command>"

    parsed_thought, parsed_action = parse({"message": raw_reply}, [])
    assert parsed_thought == "Let's look at the files in the current directory."
    assert parsed_action == "ls -l"

    # A reply with no command tags cannot be parsed
    with pytest.raises(FormatError):
        parse({"message": "No command tags"}, [])
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_edit_format_parser():
    """EditFormat returns the fenced block contents as the action."""
    parse = EditFormat()
    raw_reply = "Let's replace the contents.\n```\nimport os\nos.listdir()\n```"

    parsed_thought, parsed_action = parse({"message": raw_reply}, [])
    assert parsed_thought == "Let's replace the contents.\n"
    assert parsed_action == "import os\nos.listdir()\n"

    # Without a fenced block there is nothing to use as replacement text
    with pytest.raises(FormatError):
        parse({"message": "No code block"}, [])
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_identity_parser():
    """Identity returns the raw message as both thought and action."""
    raw_reply = "Return as is"

    parsed_thought, parsed_action = Identity()({"message": raw_reply}, [])

    assert parsed_thought == raw_reply
    assert parsed_action == raw_reply
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_json_parser():
    """JsonParser builds the action from a JSON reply and rejects malformed input."""
    parse = JsonParser()

    well_formed = '{"thought": "List files", "command": {"name": "ls", "arguments": {"path": "."}}}'
    parsed_thought, parsed_action = parse({"message": well_formed}, [])
    assert parsed_thought == "List files"
    assert parsed_action == "ls ."

    # Input that is not JSON at all
    not_json = "Not a JSON"
    with pytest.raises(FormatError):
        parse({"message": not_json}, [])

    # Valid JSON that lacks the "command" key
    without_command = '{"thought": "Missing command key"}'
    with pytest.raises(FormatError):
        parse({"message": without_command}, [])
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def test_function_calling_parser():
    """FunctionCallingParser accepts one valid tool call and rejects malformed replies."""
    parse = FunctionCallingParser()
    known_commands = [Command(name="ls", docstring="", arguments=[])]

    # Happy path: a single call to a registered command with empty arguments
    good_reply = {
        "message": "Let's list the files",
        "tool_calls": [{"function": {"name": "ls", "arguments": "{}"}}],
    }
    parsed_thought, parsed_action = parse(good_reply, known_commands)
    assert parsed_thought == "Let's list the files"
    assert parsed_action == "ls"

    # Each of the following replies must be rejected, checked in this order:
    # no tool_calls key, more than one call, unknown command, unparsable arguments.
    rejected_replies = [
        {"message": "No tool calls"},
        {
            "message": "Multiple calls",
            "tool_calls": [
                {"function": {"name": "ls", "arguments": "{}"}},
                {"function": {"name": "cd", "arguments": "{}"}},
            ],
        },
        {
            "message": "Invalid command",
            "tool_calls": [{"function": {"name": "invalid", "arguments": "{}"}}],
        },
        {
            "message": "Invalid JSON",
            "tool_calls": [{"function": {"name": "ls", "arguments": "invalid json"}}],
        },
    ]
    for bad_reply in rejected_replies:
        with pytest.raises(FormatError):
            parse(bad_reply, known_commands)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def test_function_calling_parser_error_message():
    """The error template renders the "missing tool call" hint for a ``missing`` error."""
    missing_call_error = FunctionCallingFormatError("test", "missing")
    error_template = Template(FunctionCallingParser().error_message)

    rendered = error_template.render(
        **missing_call_error.extra_info,
        exception_message=missing_call_error.message,
    )

    assert "did not use any tool calls" in rendered
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from unittest.mock import Mock, patch
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
from sweagent.agent.problem_statement import SWEBenchMultimodalProblemStatement
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestSWEBenchMultimodalProblemStatement:
    """Tests for ``SWEBenchMultimodalProblemStatement`` with ``requests.get`` mocked out."""

    example_image_url = (
        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Candide1759.jpg/330px-Candide1759.jpg"
    )

    def test_initialization(self):
        """Test basic initialization of multimodal problem statement."""
        problem_statement = SWEBenchMultimodalProblemStatement(
            text="Test problem statement", issue_images=[self.example_image_url], id="test_id"
        )
        assert problem_statement.text == "Test problem statement"
        assert problem_statement.issue_images == [self.example_image_url]
        assert problem_statement.id == "test_id"
        assert problem_statement.type == "swe_bench_multimodal"

    def test_get_problem_statement_no_images(self):
        """Test get_problem_statement when no images are present."""
        problem_statement = SWEBenchMultimodalProblemStatement(text="Test problem statement", issue_images=[])
        result = problem_statement.get_problem_statement()
        assert result == "Test problem statement"

    @patch("requests.get")
    def test_get_problem_statement_with_valid_image(self, mock_get):
        """Test get_problem_statement with a valid image that gets processed."""
        # mock successful HTTP response
        mock_response = Mock()
        mock_response.raise_for_status.return_value = None
        mock_response.headers = {"content-type": "image/png"}
        mock_response.iter_content.return_value = [b"fake_image_data"]
        mock_get.return_value = mock_response
        problem_statement = SWEBenchMultimodalProblemStatement(
            text="Test problem statement", issue_images=[self.example_image_url]
        )
        result = problem_statement.get_problem_statement()
        # should contain original text plus the base64 image
        assert "Test problem statement" in result
        # NOTE(review): this assertion was garbled in extraction and has been
        # reconstructed as a markdown image with a base64 data URI - confirm
        # the exact prefix against the upstream test file.
        assert f"![{self.example_image_url}](data:image/png;base64," in result

    # NOTE(review): the decorator below was lost in extraction and restored
    # from the ``mock_get`` parameter of the method - confirm.
    @patch("requests.get")
    def test_get_problem_statement_with_network_error(self, mock_get):
        """Test that network errors are handled gracefully with warnings."""
        # mock network error
        mock_get.side_effect = requests.exceptions.RequestException("Network error")
        problem_statement = SWEBenchMultimodalProblemStatement(
            text="Test problem statement", issue_images=[self.example_image_url]
        )
        result = problem_statement.get_problem_statement()
        assert result == "Test problem statement"

    @patch("requests.get")
    def test_get_problem_statement_with_invalid_mime_type(self, mock_get):
        """Test that invalid MIME types are handled gracefully."""
        # mock response with invalid MIME type
        mock_response = Mock()
        mock_response.raise_for_status.return_value = None
        mock_response.headers = {"content-type": "text/html"}
        mock_get.return_value = mock_response
        problem_statement = SWEBenchMultimodalProblemStatement(
            text="Test problem statement", issue_images=["http://example.com/document.html"]
        )
        result = problem_statement.get_problem_statement()
        assert result == "Test problem statement"

    @patch("requests.get")
    def test_caching_behavior(self, mock_get):
        """Test that get_problem_statement caches results and doesn't re-download images."""
        mock_response = Mock()
        mock_response.raise_for_status.return_value = None
        mock_response.headers = {"content-type": "image/png"}
        mock_response.iter_content.return_value = [b"fake_image_data"]
        mock_get.return_value = mock_response
        problem_statement = SWEBenchMultimodalProblemStatement(
            text="Test problem statement", issue_images=[self.example_image_url]
        )
        result1 = problem_statement.get_problem_statement()
        assert mock_get.call_count == 1
        result2 = problem_statement.get_problem_statement()
        assert mock_get.call_count == 1  # should still be 1, not 2, because of caching
        assert result1 == result2
        assert "Test problem statement" in result1
        # NOTE(review): reconstructed after extraction garbling, same as in
        # test_get_problem_statement_with_valid_image - confirm.
        assert f"![{self.example_image_url}](data:image/png;base64," in result1

    # NOTE(review): the ``def`` line below was lost in extraction; the method
    # name is inferred from its docstring - confirm against the upstream file.
    def test_invalid_url_handling(self):
        """Test that invalid URLs are handled gracefully."""
        problem_statement = SWEBenchMultimodalProblemStatement(
            text="Test problem statement", issue_images=["not_a_url", "ftp://invalid_scheme.com/image.png"]
        )
        result = problem_statement.get_problem_statement()
        assert result == "Test problem statement"

    @patch("requests.get")
    def test_large_image_handling(self, mock_get):
        """Test that large images are rejected."""
        mock_response = Mock()
        mock_response.raise_for_status.return_value = None
        mock_response.headers = {"content-type": "image/png", "content-length": "20971520"}  # 20MB
        mock_get.return_value = mock_response

        problem_statement = SWEBenchMultimodalProblemStatement(
            text="Test problem statement", issue_images=["http://example.com/huge_image.png"]
        )

        result = problem_statement.get_problem_statement()
        assert result == "Test problem statement"
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from sweagent.run.quick_stats import quick_stats
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_quick_stats_empty_directory():
    """quick_stats reports that nothing was found for a directory without .traj files."""
    with tempfile.TemporaryDirectory() as empty_dir:
        summary = quick_stats(empty_dir)
        assert summary == "No .traj files found."
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_quick_stats_test_data(test_trajectories_path: Path):
    """quick_stats summarises a hand-made .traj file and the shared test trajectories."""
    # Minimal trajectory payload: one exit status plus model stats
    minimal_traj = {"info": {"model_stats": {"api_calls": 42}, "exit_status": "success"}}

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        (scratch_dir / "test.traj").write_text(json.dumps(minimal_traj))

        # The summary must contain a section for our exit status
        handmade_summary = quick_stats(scratch_dir)
        assert "## `success`" in handmade_summary

    # Now summarise the trajectories supplied by the fixture
    fixture_summary = quick_stats(test_trajectories_path)

    # The fixture directory must contain trajectories ...
    assert fixture_summary != "No .traj files found."

    # ... and the summary must have at least one exit-status section
    assert "## `" in fixture_summary
|