mini-swe-agent 1.17.5__py3-none-any.whl → 2.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/METADATA +36 -52
  2. mini_swe_agent-2.0.0a1.dist-info/RECORD +70 -0
  3. mini_swe_agent-2.0.0a1.dist-info/entry_points.txt +5 -0
  4. minisweagent/__init__.py +19 -26
  5. minisweagent/agents/default.py +128 -113
  6. minisweagent/agents/interactive.py +119 -58
  7. minisweagent/config/README.md +3 -4
  8. minisweagent/config/__init__.py +36 -1
  9. minisweagent/config/benchmarks/swebench.yaml +156 -0
  10. minisweagent/config/{extra/swebench.yaml → benchmarks/swebench_backticks.yaml} +69 -64
  11. minisweagent/config/benchmarks/swebench_modal.yaml +47 -0
  12. minisweagent/config/{extra → benchmarks}/swebench_xml.yaml +73 -70
  13. minisweagent/config/default.yaml +24 -21
  14. minisweagent/config/inspector.tcss +42 -0
  15. minisweagent/config/mini.yaml +53 -71
  16. minisweagent/config/{github_issue.yaml → mini_textbased.yaml} +43 -29
  17. minisweagent/environments/__init__.py +1 -0
  18. minisweagent/environments/docker.py +67 -20
  19. minisweagent/environments/extra/bubblewrap.py +86 -47
  20. minisweagent/environments/extra/swerex_docker.py +53 -20
  21. minisweagent/environments/extra/swerex_modal.py +90 -0
  22. minisweagent/environments/local.py +62 -21
  23. minisweagent/environments/singularity.py +59 -18
  24. minisweagent/exceptions.py +22 -0
  25. minisweagent/models/__init__.py +6 -7
  26. minisweagent/models/extra/roulette.py +20 -17
  27. minisweagent/models/litellm_model.py +90 -44
  28. minisweagent/models/litellm_response_model.py +80 -0
  29. minisweagent/models/litellm_textbased_model.py +45 -0
  30. minisweagent/models/openrouter_model.py +87 -45
  31. minisweagent/models/openrouter_response_model.py +123 -0
  32. minisweagent/models/openrouter_textbased_model.py +76 -0
  33. minisweagent/models/portkey_model.py +84 -42
  34. minisweagent/models/portkey_response_model.py +163 -0
  35. minisweagent/models/requesty_model.py +91 -41
  36. minisweagent/models/test_models.py +246 -19
  37. minisweagent/models/utils/actions_text.py +60 -0
  38. minisweagent/models/utils/actions_toolcall.py +102 -0
  39. minisweagent/models/utils/actions_toolcall_response.py +110 -0
  40. minisweagent/models/utils/anthropic_utils.py +28 -0
  41. minisweagent/models/utils/cache_control.py +15 -2
  42. minisweagent/models/utils/content_string.py +74 -0
  43. minisweagent/models/utils/openai_multimodal.py +50 -0
  44. minisweagent/models/utils/retry.py +25 -0
  45. minisweagent/run/benchmarks/__init__.py +1 -0
  46. minisweagent/run/{extra → benchmarks}/swebench.py +56 -35
  47. minisweagent/run/{extra → benchmarks}/swebench_single.py +36 -26
  48. minisweagent/run/{extra → benchmarks}/utils/batch_progress.py +1 -1
  49. minisweagent/run/hello_world.py +6 -0
  50. minisweagent/run/mini.py +54 -63
  51. minisweagent/run/utilities/__init__.py +1 -0
  52. minisweagent/run/{extra → utilities}/config.py +2 -0
  53. minisweagent/run/{inspector.py → utilities/inspector.py} +90 -11
  54. minisweagent/run/{mini_extra.py → utilities/mini_extra.py} +9 -5
  55. minisweagent/utils/serialize.py +26 -0
  56. mini_swe_agent-1.17.5.dist-info/RECORD +0 -61
  57. mini_swe_agent-1.17.5.dist-info/entry_points.txt +0 -5
  58. minisweagent/agents/interactive_textual.py +0 -450
  59. minisweagent/config/extra/swebench_roulette.yaml +0 -233
  60. minisweagent/config/mini.tcss +0 -86
  61. minisweagent/models/anthropic.py +0 -35
  62. minisweagent/models/litellm_response_api_model.py +0 -82
  63. minisweagent/models/portkey_response_api_model.py +0 -75
  64. minisweagent/models/utils/key_per_thread.py +0 -20
  65. minisweagent/models/utils/openai_utils.py +0 -41
  66. minisweagent/run/github_issue.py +0 -87
  67. minisweagent/run/utils/__init__.py +0 -0
  68. minisweagent/run/utils/save.py +0 -78
  69. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/WHEEL +0 -0
  70. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/licenses/LICENSE.md +0 -0
  71. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/top_level.txt +0 -0
  72. /minisweagent/config/{extra → benchmarks}/__init__.py +0 -0
  73. /minisweagent/run/{extra → benchmarks}/utils/__init__.py +0 -0
@@ -0,0 +1,60 @@
1
+ """Parse actions & format observations without toolcalls.
2
+ This was the method used for mini-swe-agent v1.0 and the original SWE-agent.
3
+ As of mini-swe-agent v2.0, we strongly recommend to use toolcalls instead.
4
+ """
5
+
6
+ import re
7
+ import time
8
+
9
+ from jinja2 import StrictUndefined, Template
10
+
11
+ from minisweagent.exceptions import FormatError
12
+ from minisweagent.models.utils.openai_multimodal import expand_multimodal_content
13
+
14
+
15
def parse_regex_actions(content: str, *, action_regex: str, format_error_template: str) -> list[dict]:
    """Extract shell actions from raw model text via *action_regex*.

    Exactly one action must be present; otherwise a ``FormatError`` carrying a
    rendered user-facing message is raised.
    """
    matches = re.findall(action_regex, content, re.DOTALL)
    actions = [match.strip() for match in matches]
    if len(actions) == 1:
        return [{"command": action} for action in actions]
    # Zero or multiple actions: render the error template and interrupt the agent.
    rendered = Template(format_error_template, undefined=StrictUndefined).render(actions=actions)
    raise FormatError(
        {
            "role": "user",
            "content": rendered,
            "extra": {
                "interrupt_type": "FormatError",
                "n_actions": len(actions),
                "model_response": content,
            },
        }
    )
31
+
32
+
33
def format_observation_messages(
    outputs: list[dict],
    *,
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Turn command execution *outputs* into user-role observation messages.

    Each output dict is rendered through *observation_template*; execution
    metadata is carried along under the "extra" key.
    """
    # Compile the template once; rendering is per-output.
    template = Template(observation_template, undefined=StrictUndefined)
    extra_vars = template_vars or {}
    messages: list[dict] = []
    for output in outputs:
        message: dict = {
            "role": "user",
            "content": template.render(output=output, **extra_vars),
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if multimodal_regex:
            # Split multimodal tags (e.g. images) into structured content parts.
            message = expand_multimodal_content(message, pattern=multimodal_regex)
        messages.append(message)
    return messages
@@ -0,0 +1,102 @@
1
+ """Parse actions & format observations with toolcalls"""
2
+
3
+ import json
4
+ import time
5
+
6
+ from jinja2 import StrictUndefined, Template
7
+
8
+ from minisweagent.exceptions import FormatError
9
+ from minisweagent.models.utils.openai_multimodal import expand_multimodal_content
10
+
11
# OpenAI chat-completions style tool schema: the function definition is nested
# under a "function" key (contrast with the flat Responses API schema).
BASH_TOOL = {
    "type": "function",
    "function": {
        "name": "bash",
        "description": "Execute a bash command",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {"type": "string", "description": "The bash command to execute"},
            },
            "required": ["command"],
        },
    },
}
28
+
29
+
30
def parse_toolcall_actions(tool_calls: list, *, format_error_template: str) -> list[dict]:
    """Convert chat-completions tool calls into bash actions.

    Raises ``FormatError`` when no tool calls are present, an unknown tool is
    requested, or the arguments cannot be parsed / lack a 'command' key.
    """

    def _fail(error: str) -> None:
        # All format problems are reported the same way: a rendered user message.
        raise FormatError(
            {
                "role": "user",
                "content": Template(format_error_template, undefined=StrictUndefined).render(error=error),
                "extra": {"interrupt_type": "FormatError"},
            }
        )

    if not tool_calls:
        _fail("No tool calls found in the response. Every response MUST include at least one tool call.")
    actions: list[dict] = []
    for tool_call in tool_calls:
        problems = ""
        args: dict = {}
        try:
            args = json.loads(tool_call.function.arguments)
        except Exception as e:
            problems = f"Error parsing tool call arguments: {e}. "
        if tool_call.function.name != "bash":
            problems += f"Unknown tool '{tool_call.function.name}'."
        if "command" not in args:
            problems += "Missing 'command' argument in bash tool call."
        if problems:
            _fail(problems.strip())
        actions.append({"command": args["command"], "tool_call_id": tool_call.id})
    return actions
66
+
67
+
68
def format_toolcall_observation_messages(
    *,
    actions: list[dict],
    outputs: list[dict],
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Render execution *outputs* as tool-result messages matched to *actions*.

    Actions without a corresponding output (e.g. an aborted batch) receive a
    placeholder result. Actions carrying a ``tool_call_id`` become role="tool"
    messages; others (human-issued commands) become role="user" messages.
    """
    template = Template(observation_template, undefined=StrictUndefined)
    extra_vars = template_vars or {}
    placeholder = {"output": "", "returncode": -1, "exception_info": "action was not executed"}
    padded = outputs + [placeholder] * (len(actions) - len(outputs))
    messages = []
    for action, output in zip(actions, padded):
        message: dict = {
            "content": template.render(output=output, **extra_vars),
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if "tool_call_id" in action:
            message["tool_call_id"] = action["tool_call_id"]
            message["role"] = "tool"
        else:
            message["role"] = "user"  # human issued commands
        if multimodal_regex:
            message = expand_multimodal_content(message, pattern=multimodal_regex)
        messages.append(message)
    return messages
@@ -0,0 +1,110 @@
1
+ """Parse actions & format observations for OpenAI Responses API toolcalls"""
2
+
3
+ import json
4
+ import time
5
+
6
+ from jinja2 import StrictUndefined, Template
7
+
8
+ from minisweagent.exceptions import FormatError
9
+
10
# OpenRouter/OpenAI Responses API uses a flat structure (no nested "function" key)
BASH_TOOL_RESPONSE_API = {
    "type": "function",
    "name": "bash",
    "description": "Execute a bash command",
    "parameters": {
        "type": "object",
        "properties": {
            "command": {"type": "string", "description": "The bash command to execute"},
        },
        "required": ["command"],
    },
}
26
+
27
+
28
+ def _format_error_message(error_text: str) -> dict:
29
+ """Create a FormatError message in Responses API format."""
30
+ return {
31
+ "type": "message",
32
+ "role": "user",
33
+ "content": [{"type": "input_text", "text": error_text}],
34
+ "extra": {"interrupt_type": "FormatError"},
35
+ }
36
+
37
+
38
def parse_toolcall_actions_response(output: list, *, format_error_template: str) -> list[dict]:
    """Extract bash actions from a Responses API ``output`` list.

    Only items of type "function_call" are considered. The Responses API puts
    name/arguments at the top level together with a call_id:
    {"type": "function_call", "call_id": "...", "name": "bash", "arguments": "..."}
    """

    def _raise(error: str) -> None:
        text = Template(format_error_template, undefined=StrictUndefined).render(error=error)
        raise FormatError(_format_error_message(text))

    def _as_dict(item) -> dict:
        # Accept pydantic-style objects as well as raw dicts.
        if hasattr(item, "model_dump"):
            return item.model_dump()
        return item if isinstance(item, dict) else dict(item)

    tool_calls = []
    for item in output:
        item_type = item.get("type") if isinstance(item, dict) else getattr(item, "type", None)
        if item_type == "function_call":
            tool_calls.append(_as_dict(item))
    if not tool_calls:
        _raise("No tool calls found in the response. Every response MUST include at least one tool call.")
    actions = []
    for tool_call in tool_calls:
        error_msg = ""
        args: dict = {}
        try:
            args = json.loads(tool_call.get("arguments", "{}"))
        except Exception as e:
            error_msg = f"Error parsing tool call arguments: {e}. "
        if tool_call.get("name") != "bash":
            error_msg += f"Unknown tool '{tool_call.get('name')}'."
        if "command" not in args:
            error_msg += "Missing 'command' argument in bash tool call."
        if error_msg:
            _raise(error_msg.strip())
        actions.append({"command": args["command"], "tool_call_id": tool_call.get("call_id") or tool_call.get("id")})
    return actions
74
+
75
+
76
def format_toolcall_observation_messages(
    *,
    actions: list[dict],
    outputs: list[dict],
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Render execution *outputs* as function_call_output items for the Responses API.

    NOTE(review): ``multimodal_regex`` is accepted for signature parity with the
    chat-completions variant but is currently ignored here — confirm whether
    multimodal expansion is intended for Responses API observations.
    """
    template = Template(observation_template, undefined=StrictUndefined)
    extra_vars = template_vars or {}
    # Actions that never ran still need a placeholder result.
    placeholder = {"output": "", "returncode": -1, "exception_info": "action was not executed"}
    padded = outputs + [placeholder] * (len(actions) - len(outputs))
    messages = []
    for action, output in zip(actions, padded):
        rendered = template.render(output=output, **extra_vars)
        message: dict = {
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if "tool_call_id" in action:
            # Tool-initiated action: report via a function_call_output item.
            message["type"] = "function_call_output"
            message["call_id"] = action["tool_call_id"]
            message["output"] = rendered
        else:  # human issued commands
            message["type"] = "message"
            message["role"] = "user"
            message["content"] = [{"type": "input_text", "text": rendered}]
        messages.append(message)
    return messages
@@ -0,0 +1,28 @@
1
+ """Utilities for Anthropic API compatibility."""
2
+
3
+
4
+ def _is_anthropic_thinking_block(block) -> bool:
5
+ """Check if a content block is a thinking-type block."""
6
+ if not isinstance(block, dict):
7
+ return False
8
+ return block.get("type") in ("thinking", "redacted_thinking")
9
+
10
+
11
+ def _reorder_anthropic_thinking_blocks(messages: list[dict]) -> list[dict]:
12
+ """Reorder thinking blocks so they are not the final block in assistant messages.
13
+
14
+ This is an Anthropic API requirement: thinking blocks must come before other blocks.
15
+ """
16
+ result = []
17
+ for msg in messages:
18
+ if msg.get("role") == "assistant" and isinstance(msg.get("content"), list):
19
+ content = msg["content"]
20
+ thinking_blocks = [b for b in content if _is_anthropic_thinking_block(b)]
21
+ if thinking_blocks:
22
+ other_blocks = [b for b in content if not _is_anthropic_thinking_block(b)]
23
+ if other_blocks:
24
+ msg = {**msg, "content": thinking_blocks + other_blocks}
25
+ else:
26
+ msg = {**msg, "content": thinking_blocks + [{"type": "text", "text": ""}]}
27
+ result.append(msg)
28
+ return result
@@ -1,9 +1,15 @@
1
+ """Cache control utilities are mostly for Anthropic models.
2
+ They are used to explicitly set cache control points.
3
+ """
4
+
1
5
  import copy
2
6
  import warnings
3
7
  from typing import Literal
4
8
 
5
9
 
6
- def _get_content_text(entry: dict) -> str:
10
+ def _get_content_text(entry: dict) -> str | None:
11
+ if entry["content"] is None:
12
+ return None
7
13
  if isinstance(entry["content"], str):
8
14
  return entry["content"]
9
15
  assert len(entry["content"]) == 1, "Expected single message in content"
@@ -14,10 +20,16 @@ def _clear_cache_control(entry: dict) -> None:
14
20
  if isinstance(entry["content"], list):
15
21
  assert len(entry["content"]) == 1, "Expected single message in content"
16
22
  entry["content"][0].pop("cache_control", None)
23
+ # Note: entry["content"] can be None for assistant messages with only tool_use
17
24
  entry.pop("cache_control", None)
18
25
 
19
26
 
20
27
  def _set_cache_control(entry: dict) -> None:
28
+ # Handle None content (e.g., assistant messages with only tool_use)
29
+ if entry["content"] is None:
30
+ entry["cache_control"] = {"type": "ephemeral"}
31
+ return
32
+
21
33
  if not isinstance(entry["content"], list):
22
34
  entry["content"] = [ # type: ignore
23
35
  {
@@ -38,7 +50,8 @@ def set_cache_control(
38
50
  messages: list[dict], *, mode: Literal["default_end"] | None = "default_end", last_n_messages_offset: int = 0
39
51
  ) -> list[dict]:
40
52
  """This messages processor adds manual cache control marks to the messages."""
41
- # ONLY ADD TO THE LAST MESSAGE
53
+ if mode is None:
54
+ return messages
42
55
  if mode != "default_end":
43
56
  raise ValueError(f"Invalid mode: {mode}")
44
57
  if last_n_messages_offset:
@@ -0,0 +1,74 @@
1
+ """Helper function for pretty-printing content strings."""
2
+
3
+ import json
4
+
5
+
6
+ def _format_tool_call(args_str: str) -> str:
7
+ """Format tool call arguments, extracting command if it's a bash call."""
8
+ try:
9
+ args = json.loads(args_str) if isinstance(args_str, str) else args_str
10
+ if isinstance(args, dict) and "command" in args:
11
+ return f"```\n{args['command']}\n```"
12
+ except Exception:
13
+ pass
14
+ return f"```\n{args_str}\n```"
15
+
16
+
17
+ def _format_observation(content: str) -> str | None:
18
+ """Try to format an observation JSON as key-value pairs."""
19
+ try:
20
+ data = json.loads(content)
21
+ if isinstance(data, dict) and "returncode" in data:
22
+ lines = []
23
+ for key, value in data.items():
24
+ lines.append(f"<{key}>")
25
+ lines.append(str(value))
26
+ return "\n".join(lines)
27
+ return content
28
+ except Exception:
29
+ return content
30
+
31
+
32
+ def get_content_string(message: dict) -> str:
33
+ """Extract text content from any message format for display.
34
+
35
+ Handles:
36
+ - Traditional chat: {"content": "text"}
37
+ - Multimodal chat: {"content": [{"type": "text", "text": "..."}]}
38
+ - Observation messages: {"content": "{\"returncode\": 0, \"output\": \"...\"}"}
39
+ - Traditional tool calls: {"tool_calls": [{"function": {"name": "...", "arguments": "..."}}]}
40
+ - Responses API: {"output": [{"type": "message", "content": [...]}]}
41
+ """
42
+ texts = []
43
+
44
+ # Extract content (string or multimodal list)
45
+ content = message.get("content")
46
+ if isinstance(content, str):
47
+ texts.append(_format_observation(content))
48
+ elif isinstance(content, list):
49
+ texts.append("\n".join(item.get("text", "") for item in content if isinstance(item, dict)))
50
+
51
+ # Handle traditional tool_calls format (OpenAI/LiteLLM style)
52
+ if tool_calls := message.get("tool_calls"):
53
+ for tc in tool_calls:
54
+ func = tc.get("function", {}) if isinstance(tc, dict) else getattr(tc, "function", None)
55
+ if func:
56
+ args = func.get("arguments", "{}") if isinstance(func, dict) else getattr(func, "arguments", "{}")
57
+ texts.append(_format_tool_call(args))
58
+
59
+ # Handle Responses API format (output array)
60
+ if output := message.get("output"):
61
+ if isinstance(output, str):
62
+ texts.append(_format_observation(output))
63
+ elif isinstance(output, list):
64
+ for item in output:
65
+ if not isinstance(item, dict):
66
+ continue
67
+ if item.get("type") == "message":
68
+ for c in item.get("content", []):
69
+ if isinstance(c, dict) and (text := c.get("text")):
70
+ texts.append(text)
71
+ elif item.get("type") == "function_call":
72
+ texts.append(_format_tool_call(item.get("arguments", "{}")))
73
+
74
+ return "\n\n".join(t for t in texts if t)
@@ -0,0 +1,50 @@
1
+ """Utilities for handling multimodal content in OpenAI-style messages."""
2
+
3
+ import copy
4
+ import re
5
+ from typing import Any
6
+
7
+ DEFAULT_MULTIMODAL_REGEX = (
8
+ r"(?s)<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>(.+?)</CONTENT_TYPE>(.+?)</MSWEA_MULTIMODAL_CONTENT>"
9
+ )
10
+
11
+
12
+ def _expand_content_string(*, content: str, pattern: str) -> list[dict]:
13
+ """Expand a content string, replacing multimodal tags with structured content."""
14
+ matches = list(re.finditer(pattern, content))
15
+ if not matches:
16
+ return [{"type": "text", "text": content}]
17
+ result = []
18
+ last_end = 0
19
+ for match in matches:
20
+ text_before = content[last_end : match.start()]
21
+ if text_before:
22
+ result.append({"type": "text", "text": text_before})
23
+ content_type = match.group(1).strip()
24
+ extracted = match.group(2).strip()
25
+ if content_type == "image_url":
26
+ result.append({"type": "image_url", "image_url": {"url": extracted}})
27
+ last_end = match.end()
28
+ text_after = content[last_end:]
29
+ if text_after:
30
+ result.append({"type": "text", "text": text_after})
31
+ return result
32
+
33
+
34
def expand_multimodal_content(content: Any, *, pattern: str) -> Any:
    """Recursively expand multimodal tags in *content*.

    Strings are split into structured parts, lists are processed element-wise,
    and dicts are processed through their "content" key. Anything else is
    stringified. With an empty pattern, the input is returned untouched.
    Note: the input is never mutated; a deep copy is returned.
    """
    if not pattern:
        return content
    content = copy.deepcopy(content)
    if isinstance(content, str):
        return _expand_content_string(content=content, pattern=pattern)
    if isinstance(content, list):
        return [expand_multimodal_content(element, pattern=pattern) for element in content]
    if isinstance(content, dict):
        if "content" in content:
            content["content"] = expand_multimodal_content(content["content"], pattern=pattern)
        return content
    return str(content)
@@ -0,0 +1,25 @@
1
+ """Retry utility for model queries."""
2
+
3
+ import logging
4
+ import os
5
+
6
+ from tenacity import Retrying, before_sleep_log, retry_if_not_exception_type, stop_after_attempt, wait_exponential
7
+
8
+
9
def retry(*, logger: logging.Logger, abort_exceptions: list[type[Exception]]) -> Retrying:
    """Build a tenacity ``Retrying`` controller with project-wide defaults.

    Args:
        logger: Logger used to report retries (at WARNING level before each sleep).
        abort_exceptions: Exception types that abort immediately instead of retrying.

    Returns:
        A tenacity.Retrying object.
    """
    # Max attempts is overridable via the MSWEA_MODEL_RETRY_STOP_AFTER_ATTEMPT env var.
    max_attempts = int(os.getenv("MSWEA_MODEL_RETRY_STOP_AFTER_ATTEMPT", "10"))
    return Retrying(
        reraise=True,
        stop=stop_after_attempt(max_attempts),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        retry=retry_if_not_exception_type(tuple(abort_exceptions)),
    )
@@ -0,0 +1 @@
1
+ """Benchmark run scripts for mini-SWE-agent (e.g., SWE-bench)."""
@@ -13,19 +13,17 @@ import traceback
13
13
  from pathlib import Path
14
14
 
15
15
  import typer
16
- import yaml
17
- from datasets import load_dataset
18
16
  from jinja2 import StrictUndefined, Template
19
17
  from rich.live import Live
20
18
 
21
19
  from minisweagent import Environment
22
20
  from minisweagent.agents.default import DefaultAgent
23
- from minisweagent.config import builtin_config_dir, get_config_path
21
+ from minisweagent.config import builtin_config_dir, get_config_from_spec
24
22
  from minisweagent.environments import get_environment
25
23
  from minisweagent.models import get_model
26
- from minisweagent.run.extra.utils.batch_progress import RunBatchProgressManager
27
- from minisweagent.run.utils.save import save_traj
24
+ from minisweagent.run.benchmarks.utils.batch_progress import RunBatchProgressManager
28
25
  from minisweagent.utils.log import add_file_handler, logger
26
+ from minisweagent.utils.serialize import UNSET, recursive_merge
29
27
 
30
28
  _HELP_TEXT = """Run mini-SWE-agent on SWEBench instances.
31
29
 
@@ -34,7 +32,23 @@ More information about the usage: [bold green]https://mini-swe-agent.com/latest/
34
32
  [/not dim]
35
33
  """
36
34
 
37
- app = typer.Typer(rich_markup_mode="rich", add_completion=False)
35
+ _CONFIG_SPEC_HELP_TEXT = """Path to config files, filenames, or key-value pairs.
36
+
37
+ [bold red]IMPORTANT:[/bold red] [red]If you set this option, the default config file will not be used.[/red]
38
+ So you need to explicitly set it e.g., with [bold green]-c swebench.yaml <other options>[/bold green]
39
+
40
+ Multiple configs will be recursively merged.
41
+
42
+ Examples:
43
+
44
+ [bold red]-c model.model_kwargs.temperature=0[/bold red] [red]You forgot to add the default config file! See above.[/red]
45
+
46
+ [bold green]-c swebench.yaml -c model.model_kwargs.temperature=0.5[/bold green]
47
+
48
+ [bold green]-c swebench.yaml -c agent.max_iterations=50[/bold green]
49
+ """
50
+
51
+ DEFAULT_CONFIG_FILE = builtin_config_dir / "benchmarks" / "swebench.yaml"
38
52
 
39
53
  DATASET_MAPPING = {
40
54
  "full": "princeton-nlp/SWE-Bench",
@@ -46,7 +60,7 @@ DATASET_MAPPING = {
46
60
  "_test": "klieret/swe-bench-dummy-test-dataset",
47
61
  }
48
62
 
49
-
63
+ app = typer.Typer(rich_markup_mode="rich", add_completion=False)
50
64
  _OUTPUT_FILE_LOCK = threading.Lock()
51
65
 
52
66
 
@@ -60,9 +74,7 @@ class ProgressTrackingAgent(DefaultAgent):
60
74
 
61
75
  def step(self) -> dict:
62
76
  """Override step to provide progress updates."""
63
- self.progress_manager.update_instance_status(
64
- self.instance_id, f"Step {self.model.n_calls + 1:3d} (${self.model.cost:.2f})"
65
- )
77
+ self.progress_manager.update_instance_status(self.instance_id, f"Step {self.n_calls + 1:3d} (${self.cost:.2f})")
66
78
  return super().step()
67
79
 
68
80
 
@@ -81,7 +93,7 @@ def get_sb_environment(config: dict, instance: dict) -> Environment:
81
93
  env_config = config.setdefault("environment", {})
82
94
  env_config["environment_class"] = env_config.get("environment_class", "docker")
83
95
  image_name = get_swebench_docker_image_name(instance)
84
- if env_config["environment_class"] == "docker":
96
+ if env_config["environment_class"] in ["docker", "swerex_modal"]:
85
97
  env_config["image"] = image_name
86
98
  elif env_config["environment_class"] == "singularity":
87
99
  env_config["image"] = "docker://" + image_name
@@ -138,7 +150,9 @@ def process_instance(
138
150
  progress_manager.update_instance_status(instance_id, "Pulling/starting docker")
139
151
 
140
152
  agent = None
141
- extra_info = None
153
+ exit_status = None
154
+ result = None
155
+ extra_info = {}
142
156
 
143
157
  try:
144
158
  env = get_sb_environment(config, instance)
@@ -149,21 +163,28 @@ def process_instance(
149
163
  instance_id=instance_id,
150
164
  **config.get("agent", {}),
151
165
  )
152
- exit_status, result = agent.run(task)
166
+ info = agent.run(task)
167
+ exit_status = info.get("exit_status")
168
+ result = info.get("submission")
153
169
  except Exception as e:
154
170
  logger.error(f"Error processing instance {instance_id}: {e}", exc_info=True)
155
- exit_status, result = type(e).__name__, str(e)
156
- extra_info = {"traceback": traceback.format_exc()}
171
+ exit_status, result = type(e).__name__, ""
172
+ extra_info = {"traceback": traceback.format_exc(), "exception_str": str(e)}
157
173
  finally:
158
- save_traj(
159
- agent,
160
- instance_dir / f"{instance_id}.traj.json",
161
- exit_status=exit_status,
162
- result=result,
163
- extra_info=extra_info,
164
- instance_id=instance_id,
165
- print_fct=logger.info,
166
- )
174
+ if agent is not None:
175
+ traj_path = instance_dir / f"{instance_id}.traj.json"
176
+ agent.save(
177
+ traj_path,
178
+ {
179
+ "info": {
180
+ "exit_status": exit_status,
181
+ "submission": result,
182
+ **extra_info,
183
+ },
184
+ "instance_id": instance_id,
185
+ },
186
+ )
187
+ logger.info(f"Saved trajectory to '{traj_path}'")
167
188
  update_preds_file(output_dir / "preds.json", instance_id, model.config.model_name, result)
168
189
  progress_manager.on_instance_end(instance_id, exit_status)
169
190
 
@@ -201,8 +222,8 @@ def main(
201
222
  model: str | None = typer.Option(None, "-m", "--model", help="Model to use", rich_help_panel="Basic"),
202
223
  model_class: str | None = typer.Option(None, "--model-class", help="Model class to use (e.g., 'anthropic' or 'minisweagent.models.anthropic.AnthropicModel')", rich_help_panel="Advanced"),
203
224
  redo_existing: bool = typer.Option(False, "--redo-existing", help="Redo existing instances", rich_help_panel="Data selection"),
204
- config_spec: Path = typer.Option( builtin_config_dir / "extra" / "swebench.yaml", "-c", "--config", help="Path to a config file", rich_help_panel="Basic"),
205
- environment_class: str | None = typer.Option( None, "--environment-class", help="Environment type to use. Recommended are docker or singularity", rich_help_panel="Advanced"),
225
+ config_spec: list[str] = typer.Option([str(DEFAULT_CONFIG_FILE)], "-c", "--config", help=_CONFIG_SPEC_HELP_TEXT, rich_help_panel="Basic"),
226
+ environment_class: str | None = typer.Option(None, "--environment-class", help="Environment type to use. Recommended are docker or singularity", rich_help_panel="Advanced"),
206
227
  ) -> None:
207
228
  # fmt: on
208
229
  output_path = Path(output)
@@ -210,6 +231,8 @@ def main(
210
231
  logger.info(f"Results will be saved to {output_path}")
211
232
  add_file_handler(output_path / "minisweagent.log")
212
233
 
234
+ from datasets import load_dataset
235
+
213
236
  dataset_path = DATASET_MAPPING.get(subset, subset)
214
237
  logger.info(f"Loading dataset {dataset_path}, split {split}...")
215
238
  instances = list(load_dataset(dataset_path, split=split))
@@ -221,15 +244,13 @@ def main(
221
244
  instances = [instance for instance in instances if instance["instance_id"] not in existing_instances]
222
245
  logger.info(f"Running on {len(instances)} instances...")
223
246
 
224
- config_path = get_config_path(config_spec)
225
- logger.info(f"Loading agent config from '{config_path}'")
226
- config = yaml.safe_load(config_path.read_text())
227
- if environment_class is not None:
228
- config.setdefault("environment", {})["environment_class"] = environment_class
229
- if model is not None:
230
- config.setdefault("model", {})["model_name"] = model
231
- if model_class is not None:
232
- config.setdefault("model", {})["model_class"] = model_class
247
+ logger.info(f"Building agent config from specs: {config_spec}")
248
+ configs = [get_config_from_spec(spec) for spec in config_spec]
249
+ configs.append({
250
+ "environment": {"environment_class": environment_class or UNSET},
251
+ "model": {"model_name": model or UNSET, "model_class": model_class or UNSET},
252
+ })
253
+ config = recursive_merge(*configs)
233
254
 
234
255
  progress_manager = RunBatchProgressManager(len(instances), output_path / f"exit_statuses_{time.time()}.yaml")
235
256