mini-swe-agent 1.17.5__py3-none-any.whl → 2.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/METADATA +36 -52
- mini_swe_agent-2.0.0a1.dist-info/RECORD +70 -0
- mini_swe_agent-2.0.0a1.dist-info/entry_points.txt +5 -0
- minisweagent/__init__.py +19 -26
- minisweagent/agents/default.py +128 -113
- minisweagent/agents/interactive.py +119 -58
- minisweagent/config/README.md +3 -4
- minisweagent/config/__init__.py +36 -1
- minisweagent/config/benchmarks/swebench.yaml +156 -0
- minisweagent/config/{extra/swebench.yaml → benchmarks/swebench_backticks.yaml} +69 -64
- minisweagent/config/benchmarks/swebench_modal.yaml +47 -0
- minisweagent/config/{extra → benchmarks}/swebench_xml.yaml +73 -70
- minisweagent/config/default.yaml +24 -21
- minisweagent/config/inspector.tcss +42 -0
- minisweagent/config/mini.yaml +53 -71
- minisweagent/config/{github_issue.yaml → mini_textbased.yaml} +43 -29
- minisweagent/environments/__init__.py +1 -0
- minisweagent/environments/docker.py +67 -20
- minisweagent/environments/extra/bubblewrap.py +86 -47
- minisweagent/environments/extra/swerex_docker.py +53 -20
- minisweagent/environments/extra/swerex_modal.py +90 -0
- minisweagent/environments/local.py +62 -21
- minisweagent/environments/singularity.py +59 -18
- minisweagent/exceptions.py +22 -0
- minisweagent/models/__init__.py +6 -7
- minisweagent/models/extra/roulette.py +20 -17
- minisweagent/models/litellm_model.py +90 -44
- minisweagent/models/litellm_response_model.py +80 -0
- minisweagent/models/litellm_textbased_model.py +45 -0
- minisweagent/models/openrouter_model.py +87 -45
- minisweagent/models/openrouter_response_model.py +123 -0
- minisweagent/models/openrouter_textbased_model.py +76 -0
- minisweagent/models/portkey_model.py +84 -42
- minisweagent/models/portkey_response_model.py +163 -0
- minisweagent/models/requesty_model.py +91 -41
- minisweagent/models/test_models.py +246 -19
- minisweagent/models/utils/actions_text.py +60 -0
- minisweagent/models/utils/actions_toolcall.py +102 -0
- minisweagent/models/utils/actions_toolcall_response.py +110 -0
- minisweagent/models/utils/anthropic_utils.py +28 -0
- minisweagent/models/utils/cache_control.py +15 -2
- minisweagent/models/utils/content_string.py +74 -0
- minisweagent/models/utils/openai_multimodal.py +50 -0
- minisweagent/models/utils/retry.py +25 -0
- minisweagent/run/benchmarks/__init__.py +1 -0
- minisweagent/run/{extra → benchmarks}/swebench.py +56 -35
- minisweagent/run/{extra → benchmarks}/swebench_single.py +36 -26
- minisweagent/run/{extra → benchmarks}/utils/batch_progress.py +1 -1
- minisweagent/run/hello_world.py +6 -0
- minisweagent/run/mini.py +54 -63
- minisweagent/run/utilities/__init__.py +1 -0
- minisweagent/run/{extra → utilities}/config.py +2 -0
- minisweagent/run/{inspector.py → utilities/inspector.py} +90 -11
- minisweagent/run/{mini_extra.py → utilities/mini_extra.py} +9 -5
- minisweagent/utils/serialize.py +26 -0
- mini_swe_agent-1.17.5.dist-info/RECORD +0 -61
- mini_swe_agent-1.17.5.dist-info/entry_points.txt +0 -5
- minisweagent/agents/interactive_textual.py +0 -450
- minisweagent/config/extra/swebench_roulette.yaml +0 -233
- minisweagent/config/mini.tcss +0 -86
- minisweagent/models/anthropic.py +0 -35
- minisweagent/models/litellm_response_api_model.py +0 -82
- minisweagent/models/portkey_response_api_model.py +0 -75
- minisweagent/models/utils/key_per_thread.py +0 -20
- minisweagent/models/utils/openai_utils.py +0 -41
- minisweagent/run/github_issue.py +0 -87
- minisweagent/run/utils/__init__.py +0 -0
- minisweagent/run/utils/save.py +0 -78
- {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/WHEEL +0 -0
- {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/licenses/LICENSE.md +0 -0
- {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/top_level.txt +0 -0
- /minisweagent/config/{extra → benchmarks}/__init__.py +0 -0
- /minisweagent/run/{extra → benchmarks}/utils/__init__.py +0 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Parse actions & format observations without toolcalls.
|
|
2
|
+
This was the method used for mini-swe-agent v1.0 and the original SWE-agent.
|
|
3
|
+
As of mini-swe-agent v2.0, we strongly recommend to use toolcalls instead.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
from jinja2 import StrictUndefined, Template
|
|
10
|
+
|
|
11
|
+
from minisweagent.exceptions import FormatError
|
|
12
|
+
from minisweagent.models.utils.openai_multimodal import expand_multimodal_content
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse_regex_actions(content: str, *, action_regex: str, format_error_template: str) -> list[dict]:
    """Extract bash actions from raw model text via `action_regex`.

    Exactly one regex match is required. Otherwise a FormatError is raised,
    carrying a user-role message rendered from `format_error_template` plus
    debugging metadata (number of actions found, the raw model response).
    """
    found = [match.strip() for match in re.findall(action_regex, content, re.DOTALL)]
    if len(found) == 1:
        return [{"command": command} for command in found]
    rendered = Template(format_error_template, undefined=StrictUndefined).render(actions=found)
    raise FormatError(
        {
            "role": "user",
            "content": rendered,
            "extra": {
                "interrupt_type": "FormatError",
                "n_actions": len(found),
                "model_response": content,
            },
        }
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def format_observation_messages(
    outputs: list[dict],
    *,
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Render each execution output into a user-role observation message.

    Every output dict is passed through `observation_template` (Jinja2, strict
    undefined) together with `template_vars`. Raw execution details are kept in
    the message's "extra" field for logging/inspection. When `multimodal_regex`
    is non-empty, embedded multimodal tags are expanded into structured content.
    """
    extra_vars = template_vars or {}
    template = Template(observation_template, undefined=StrictUndefined)

    def build(output: dict) -> dict:
        message: dict = {
            "role": "user",
            "content": template.render(output=output, **extra_vars),
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if multimodal_regex:
            return expand_multimodal_content(message, pattern=multimodal_regex)
        return message

    return [build(output) for output in outputs]
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Parse actions & format observations with toolcalls"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
from jinja2 import StrictUndefined, Template
|
|
7
|
+
|
|
8
|
+
from minisweagent.exceptions import FormatError
|
|
9
|
+
from minisweagent.models.utils.openai_multimodal import expand_multimodal_content
|
|
10
|
+
|
|
11
|
+
# Chat-completions style tool definition (schema nested under a "function" key).
# The agent exposes exactly one tool: run a bash command string.
BASH_TOOL = {
    "type": "function",
    "function": {
        "name": "bash",
        "description": "Execute a bash command",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {"type": "string", "description": "The bash command to execute"},
            },
            "required": ["command"],
        },
    },
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_toolcall_actions(tool_calls: list, *, format_error_template: str) -> list[dict]:
    """Convert chat-completions tool calls into action dicts.

    Each call must target the "bash" tool with JSON arguments that contain a
    "command" key. Raises FormatError (with a user message rendered from
    `format_error_template`) when no tool calls are present or a call is
    malformed.
    """

    def fail(error: str):
        raise FormatError(
            {
                "role": "user",
                "content": Template(format_error_template, undefined=StrictUndefined).render(error=error),
                "extra": {"interrupt_type": "FormatError"},
            }
        )

    if not tool_calls:
        fail("No tool calls found in the response. Every response MUST include at least one tool call.")

    actions = []
    for tool_call in tool_calls:
        problems = ""
        args: dict = {}
        try:
            args = json.loads(tool_call.function.arguments)
        except Exception as e:
            problems = f"Error parsing tool call arguments: {e}. "
        if tool_call.function.name != "bash":
            problems += f"Unknown tool '{tool_call.function.name}'."
        if "command" not in args:
            problems += "Missing 'command' argument in bash tool call."
        if problems:
            fail(problems.strip())
        actions.append({"command": args["command"], "tool_call_id": tool_call.id})
    return actions
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def format_toolcall_observation_messages(
    *,
    actions: list[dict],
    outputs: list[dict],
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Render execution outputs as tool-result messages (chat-completions style).

    When fewer outputs than actions exist (e.g. execution stopped early), the
    remaining actions receive a placeholder "not executed" output. Actions with
    a "tool_call_id" produce role "tool" messages; actions without one (human
    issued commands) produce role "user" messages. Raw execution details are
    preserved under "extra".
    """
    template = Template(observation_template, undefined=StrictUndefined)
    placeholder = {"output": "", "returncode": -1, "exception_info": "action was not executed"}
    all_outputs = outputs + [placeholder] * (len(actions) - len(outputs))
    messages = []
    for action, output in zip(actions, all_outputs):
        message = {
            "content": template.render(output=output, **(template_vars or {})),
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if "tool_call_id" in action:
            message["tool_call_id"] = action["tool_call_id"]
            message["role"] = "tool"
        else:
            message["role"] = "user"  # human issued commands
        if multimodal_regex:
            message = expand_multimodal_content(message, pattern=multimodal_regex)
        messages.append(message)
    return messages
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Parse actions & format observations for OpenAI Responses API toolcalls"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
from jinja2 import StrictUndefined, Template
|
|
7
|
+
|
|
8
|
+
from minisweagent.exceptions import FormatError
|
|
9
|
+
|
|
10
|
+
# OpenRouter/OpenAI Responses API uses a flat structure (no nested "function" key)
|
|
11
|
+
# Responses API tool definition: unlike the chat-completions schema, the
# OpenRouter/OpenAI Responses API keeps name/description/parameters at the
# top level rather than nesting them under a "function" key.
BASH_TOOL_RESPONSE_API = {
    "type": "function",
    "name": "bash",
    "description": "Execute a bash command",
    "parameters": {
        "type": "object",
        "properties": {
            "command": {"type": "string", "description": "The bash command to execute"},
        },
        "required": ["command"],
    },
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _format_error_message(error_text: str) -> dict:
|
|
29
|
+
"""Create a FormatError message in Responses API format."""
|
|
30
|
+
return {
|
|
31
|
+
"type": "message",
|
|
32
|
+
"role": "user",
|
|
33
|
+
"content": [{"type": "input_text", "text": error_text}],
|
|
34
|
+
"extra": {"interrupt_type": "FormatError"},
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def parse_toolcall_actions_response(output: list, *, format_error_template: str) -> list[dict]:
    """Parse bash actions from a Responses API output list.

    Only items of type "function_call" are considered. In the Responses API
    the name/arguments sit at the top level next to a call_id:
    {"type": "function_call", "call_id": "...", "name": "bash", "arguments": "..."}

    Raises FormatError when no function calls are present, a call targets an
    unknown tool, or its arguments are malformed.
    """

    def bail(error: str):
        text = Template(format_error_template, undefined=StrictUndefined).render(error=error)
        raise FormatError(_format_error_message(text))

    def normalized(item) -> dict:
        # Accept plain dicts, pydantic-style objects (model_dump), and other mappings.
        if isinstance(item, dict):
            return item
        if hasattr(item, "model_dump"):
            return item.model_dump()
        return dict(item)

    calls = []
    for item in output:
        kind = item.get("type") if isinstance(item, dict) else getattr(item, "type", None)
        if kind == "function_call":
            calls.append(normalized(item))
    if not calls:
        bail("No tool calls found in the response. Every response MUST include at least one tool call.")

    actions = []
    for call in calls:
        problems = ""
        args: dict = {}
        try:
            args = json.loads(call.get("arguments", "{}"))
        except Exception as e:
            problems = f"Error parsing tool call arguments: {e}. "
        if call.get("name") != "bash":
            problems += f"Unknown tool '{call.get('name')}'."
        if "command" not in args:
            problems += "Missing 'command' argument in bash tool call."
        if problems:
            bail(problems.strip())
        actions.append({"command": args["command"], "tool_call_id": call.get("call_id") or call.get("id")})
    return actions
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def format_toolcall_observation_messages(
    *,
    actions: list[dict],
    outputs: list[dict],
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Render execution outputs as Responses API "function_call_output" items.

    When fewer outputs than actions exist, remaining actions get a placeholder
    "not executed" output. Actions carrying a "tool_call_id" become
    function_call_output items keyed by call_id; actions without one (human
    issued commands) become plain user messages with input_text content.
    Raw execution details are kept under "extra".
    """
    template = Template(observation_template, undefined=StrictUndefined)
    placeholder = {"output": "", "returncode": -1, "exception_info": "action was not executed"}
    all_outputs = outputs + [placeholder] * (len(actions) - len(outputs))
    messages = []
    for action, output in zip(actions, all_outputs):
        rendered = template.render(output=output, **(template_vars or {}))
        message: dict = {
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if "tool_call_id" in action:
            message["type"] = "function_call_output"
            message["call_id"] = action["tool_call_id"]
            message["output"] = rendered
        else:  # human issued commands
            message["type"] = "message"
            message["role"] = "user"
            message["content"] = [{"type": "input_text", "text": rendered}]
        messages.append(message)
    return messages
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Utilities for Anthropic API compatibility."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _is_anthropic_thinking_block(block) -> bool:
|
|
5
|
+
"""Check if a content block is a thinking-type block."""
|
|
6
|
+
if not isinstance(block, dict):
|
|
7
|
+
return False
|
|
8
|
+
return block.get("type") in ("thinking", "redacted_thinking")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _reorder_anthropic_thinking_blocks(messages: list[dict]) -> list[dict]:
    """Move thinking blocks to the front of assistant message content lists.

    The Anthropic API requires thinking/redacted_thinking blocks to precede all
    other content blocks. If an assistant message consists only of thinking
    blocks, an empty text block is appended so the message remains valid.
    Adjusted messages are shallow copies; the input list is never mutated.
    """
    reordered = []
    for message in messages:
        content = message.get("content")
        if message.get("role") == "assistant" and isinstance(content, list):
            thinking = [block for block in content if _is_anthropic_thinking_block(block)]
            if thinking:
                rest = [block for block in content if not _is_anthropic_thinking_block(block)]
                if not rest:
                    rest = [{"type": "text", "text": ""}]
                message = {**message, "content": thinking + rest}
        reordered.append(message)
    return reordered
|
|
@@ -1,9 +1,15 @@
|
|
|
1
|
+
"""Cache control utilities are mostly for Anthropic models.
|
|
2
|
+
They are used to explicitly set cache control points.
|
|
3
|
+
"""
|
|
4
|
+
|
|
1
5
|
import copy
|
|
2
6
|
import warnings
|
|
3
7
|
from typing import Literal
|
|
4
8
|
|
|
5
9
|
|
|
6
|
-
def _get_content_text(entry: dict) -> str:
|
|
10
|
+
def _get_content_text(entry: dict) -> str | None:
|
|
11
|
+
if entry["content"] is None:
|
|
12
|
+
return None
|
|
7
13
|
if isinstance(entry["content"], str):
|
|
8
14
|
return entry["content"]
|
|
9
15
|
assert len(entry["content"]) == 1, "Expected single message in content"
|
|
@@ -14,10 +20,16 @@ def _clear_cache_control(entry: dict) -> None:
|
|
|
14
20
|
if isinstance(entry["content"], list):
|
|
15
21
|
assert len(entry["content"]) == 1, "Expected single message in content"
|
|
16
22
|
entry["content"][0].pop("cache_control", None)
|
|
23
|
+
# Note: entry["content"] can be None for assistant messages with only tool_use
|
|
17
24
|
entry.pop("cache_control", None)
|
|
18
25
|
|
|
19
26
|
|
|
20
27
|
def _set_cache_control(entry: dict) -> None:
|
|
28
|
+
# Handle None content (e.g., assistant messages with only tool_use)
|
|
29
|
+
if entry["content"] is None:
|
|
30
|
+
entry["cache_control"] = {"type": "ephemeral"}
|
|
31
|
+
return
|
|
32
|
+
|
|
21
33
|
if not isinstance(entry["content"], list):
|
|
22
34
|
entry["content"] = [ # type: ignore
|
|
23
35
|
{
|
|
@@ -38,7 +50,8 @@ def set_cache_control(
|
|
|
38
50
|
messages: list[dict], *, mode: Literal["default_end"] | None = "default_end", last_n_messages_offset: int = 0
|
|
39
51
|
) -> list[dict]:
|
|
40
52
|
"""This messages processor adds manual cache control marks to the messages."""
|
|
41
|
-
|
|
53
|
+
if mode is None:
|
|
54
|
+
return messages
|
|
42
55
|
if mode != "default_end":
|
|
43
56
|
raise ValueError(f"Invalid mode: {mode}")
|
|
44
57
|
if last_n_messages_offset:
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Helper function for pretty-printing content strings."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _format_tool_call(args_str: str) -> str:
|
|
7
|
+
"""Format tool call arguments, extracting command if it's a bash call."""
|
|
8
|
+
try:
|
|
9
|
+
args = json.loads(args_str) if isinstance(args_str, str) else args_str
|
|
10
|
+
if isinstance(args, dict) and "command" in args:
|
|
11
|
+
return f"```\n{args['command']}\n```"
|
|
12
|
+
except Exception:
|
|
13
|
+
pass
|
|
14
|
+
return f"```\n{args_str}\n```"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _format_observation(content: str) -> str | None:
|
|
18
|
+
"""Try to format an observation JSON as key-value pairs."""
|
|
19
|
+
try:
|
|
20
|
+
data = json.loads(content)
|
|
21
|
+
if isinstance(data, dict) and "returncode" in data:
|
|
22
|
+
lines = []
|
|
23
|
+
for key, value in data.items():
|
|
24
|
+
lines.append(f"<{key}>")
|
|
25
|
+
lines.append(str(value))
|
|
26
|
+
return "\n".join(lines)
|
|
27
|
+
return content
|
|
28
|
+
except Exception:
|
|
29
|
+
return content
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_content_string(message: dict) -> str:
    """Extract human-readable text from a message in any supported format.

    Supported shapes:
    - plain chat: {"content": "text"}
    - multimodal chat: {"content": [{"type": "text", "text": "..."}]}
    - observation JSON strings (pretty-printed via _format_observation)
    - chat tool calls: {"tool_calls": [{"function": {"name": ..., "arguments": ...}}]}
    - Responses API: {"output": [{"type": "message"|"function_call", ...}]}

    Non-empty fragments are joined with blank lines.
    """
    parts: list[str] = []

    # Plain or multimodal "content" field.
    content = message.get("content")
    if isinstance(content, str):
        parts.append(_format_observation(content))
    elif isinstance(content, list):
        parts.append("\n".join(item.get("text", "") for item in content if isinstance(item, dict)))

    # Chat-completions tool calls; entries may be dicts or objects with a .function attr.
    for tc in message.get("tool_calls") or []:
        func = tc.get("function", {}) if isinstance(tc, dict) else getattr(tc, "function", None)
        if not func:
            continue
        args = func.get("arguments", "{}") if isinstance(func, dict) else getattr(func, "arguments", "{}")
        parts.append(_format_tool_call(args))

    # Responses API "output": either a raw string or a list of typed items.
    output = message.get("output")
    if output:
        if isinstance(output, str):
            parts.append(_format_observation(output))
        elif isinstance(output, list):
            for item in output:
                if not isinstance(item, dict):
                    continue
                kind = item.get("type")
                if kind == "message":
                    for chunk in item.get("content", []):
                        if isinstance(chunk, dict) and chunk.get("text"):
                            parts.append(chunk["text"])
                elif kind == "function_call":
                    parts.append(_format_tool_call(item.get("arguments", "{}")))

    return "\n\n".join(part for part in parts if part)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Utilities for handling multimodal content in OpenAI-style messages."""
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# Default pattern for embedded multimodal payloads of the form
#   <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>URL</MSWEA_MULTIMODAL_CONTENT>
# Group 1 captures the content type, group 2 the payload; the inline (?s)
# flag lets "." span newlines so payloads may be multi-line.
DEFAULT_MULTIMODAL_REGEX = (
    r"(?s)<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>(.+?)</CONTENT_TYPE>(.+?)</MSWEA_MULTIMODAL_CONTENT>"
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _expand_content_string(*, content: str, pattern: str) -> list[dict]:
|
|
13
|
+
"""Expand a content string, replacing multimodal tags with structured content."""
|
|
14
|
+
matches = list(re.finditer(pattern, content))
|
|
15
|
+
if not matches:
|
|
16
|
+
return [{"type": "text", "text": content}]
|
|
17
|
+
result = []
|
|
18
|
+
last_end = 0
|
|
19
|
+
for match in matches:
|
|
20
|
+
text_before = content[last_end : match.start()]
|
|
21
|
+
if text_before:
|
|
22
|
+
result.append({"type": "text", "text": text_before})
|
|
23
|
+
content_type = match.group(1).strip()
|
|
24
|
+
extracted = match.group(2).strip()
|
|
25
|
+
if content_type == "image_url":
|
|
26
|
+
result.append({"type": "image_url", "image_url": {"url": extracted}})
|
|
27
|
+
last_end = match.end()
|
|
28
|
+
text_after = content[last_end:]
|
|
29
|
+
if text_after:
|
|
30
|
+
result.append({"type": "text", "text": text_after})
|
|
31
|
+
return result
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def expand_multimodal_content(content: Any, *, pattern: str) -> Any:
    """Recursively replace multimodal tags in message content.

    Strings are expanded into structured part lists, lists are processed
    element-wise, and dicts have only their "content" value expanded. Any
    other value is stringified. The input is deep-copied up front, so callers'
    data is never mutated. An empty *pattern* disables expansion entirely and
    returns the input unchanged.
    """
    if not pattern:
        return content
    content = copy.deepcopy(content)
    if isinstance(content, str):
        return _expand_content_string(content=content, pattern=pattern)
    if isinstance(content, list):
        return [expand_multimodal_content(entry, pattern=pattern) for entry in content]
    if isinstance(content, dict):
        if "content" in content:
            content["content"] = expand_multimodal_content(content["content"], pattern=pattern)
        return content
    return str(content)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Retry utility for model queries."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from tenacity import Retrying, before_sleep_log, retry_if_not_exception_type, stop_after_attempt, wait_exponential
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def retry(*, logger: logging.Logger, abort_exceptions: list[type[Exception]]) -> Retrying:
    """Build a tenacity.Retrying configured from the environment.

    Args:
        logger: Logger used to report each retry at WARNING level.
        abort_exceptions: Exception types that abort immediately (no retry).

    Returns:
        A tenacity.Retrying with exponential backoff (4s to 60s cap) and a
        maximum attempt count taken from MSWEA_MODEL_RETRY_STOP_AFTER_ATTEMPT
        (default 10). The final exception is re-raised unchanged (reraise=True).
    """
    max_attempts = int(os.getenv("MSWEA_MODEL_RETRY_STOP_AFTER_ATTEMPT", "10"))
    return Retrying(
        reraise=True,
        stop=stop_after_attempt(max_attempts),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        retry=retry_if_not_exception_type(tuple(abort_exceptions)),
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Benchmark run scripts for mini-SWE-agent (e.g., SWE-bench)."""
|
|
@@ -13,19 +13,17 @@ import traceback
|
|
|
13
13
|
from pathlib import Path
|
|
14
14
|
|
|
15
15
|
import typer
|
|
16
|
-
import yaml
|
|
17
|
-
from datasets import load_dataset
|
|
18
16
|
from jinja2 import StrictUndefined, Template
|
|
19
17
|
from rich.live import Live
|
|
20
18
|
|
|
21
19
|
from minisweagent import Environment
|
|
22
20
|
from minisweagent.agents.default import DefaultAgent
|
|
23
|
-
from minisweagent.config import builtin_config_dir,
|
|
21
|
+
from minisweagent.config import builtin_config_dir, get_config_from_spec
|
|
24
22
|
from minisweagent.environments import get_environment
|
|
25
23
|
from minisweagent.models import get_model
|
|
26
|
-
from minisweagent.run.
|
|
27
|
-
from minisweagent.run.utils.save import save_traj
|
|
24
|
+
from minisweagent.run.benchmarks.utils.batch_progress import RunBatchProgressManager
|
|
28
25
|
from minisweagent.utils.log import add_file_handler, logger
|
|
26
|
+
from minisweagent.utils.serialize import UNSET, recursive_merge
|
|
29
27
|
|
|
30
28
|
_HELP_TEXT = """Run mini-SWE-agent on SWEBench instances.
|
|
31
29
|
|
|
@@ -34,7 +32,23 @@ More information about the usage: [bold green]https://mini-swe-agent.com/latest/
|
|
|
34
32
|
[/not dim]
|
|
35
33
|
"""
|
|
36
34
|
|
|
37
|
-
|
|
35
|
+
_CONFIG_SPEC_HELP_TEXT = """Path to config files, filenames, or key-value pairs.
|
|
36
|
+
|
|
37
|
+
[bold red]IMPORTANT:[/bold red] [red]If you set this option, the default config file will not be used.[/red]
|
|
38
|
+
So you need to explicitly set it e.g., with [bold green]-c swebench.yaml <other options>[/bold green]
|
|
39
|
+
|
|
40
|
+
Multiple configs will be recursively merged.
|
|
41
|
+
|
|
42
|
+
Examples:
|
|
43
|
+
|
|
44
|
+
[bold red]-c model.model_kwargs.temperature=0[/bold red] [red]You forgot to add the default config file! See above.[/red]
|
|
45
|
+
|
|
46
|
+
[bold green]-c swebench.yaml -c model.model_kwargs.temperature=0.5[/bold green]
|
|
47
|
+
|
|
48
|
+
[bold green]-c swebench.yaml -c agent.max_iterations=50[/bold green]
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
DEFAULT_CONFIG_FILE = builtin_config_dir / "benchmarks" / "swebench.yaml"
|
|
38
52
|
|
|
39
53
|
DATASET_MAPPING = {
|
|
40
54
|
"full": "princeton-nlp/SWE-Bench",
|
|
@@ -46,7 +60,7 @@ DATASET_MAPPING = {
|
|
|
46
60
|
"_test": "klieret/swe-bench-dummy-test-dataset",
|
|
47
61
|
}
|
|
48
62
|
|
|
49
|
-
|
|
63
|
+
app = typer.Typer(rich_markup_mode="rich", add_completion=False)
|
|
50
64
|
_OUTPUT_FILE_LOCK = threading.Lock()
|
|
51
65
|
|
|
52
66
|
|
|
@@ -60,9 +74,7 @@ class ProgressTrackingAgent(DefaultAgent):
|
|
|
60
74
|
|
|
61
75
|
def step(self) -> dict:
|
|
62
76
|
"""Override step to provide progress updates."""
|
|
63
|
-
self.progress_manager.update_instance_status(
|
|
64
|
-
self.instance_id, f"Step {self.model.n_calls + 1:3d} (${self.model.cost:.2f})"
|
|
65
|
-
)
|
|
77
|
+
self.progress_manager.update_instance_status(self.instance_id, f"Step {self.n_calls + 1:3d} (${self.cost:.2f})")
|
|
66
78
|
return super().step()
|
|
67
79
|
|
|
68
80
|
|
|
@@ -81,7 +93,7 @@ def get_sb_environment(config: dict, instance: dict) -> Environment:
|
|
|
81
93
|
env_config = config.setdefault("environment", {})
|
|
82
94
|
env_config["environment_class"] = env_config.get("environment_class", "docker")
|
|
83
95
|
image_name = get_swebench_docker_image_name(instance)
|
|
84
|
-
if env_config["environment_class"]
|
|
96
|
+
if env_config["environment_class"] in ["docker", "swerex_modal"]:
|
|
85
97
|
env_config["image"] = image_name
|
|
86
98
|
elif env_config["environment_class"] == "singularity":
|
|
87
99
|
env_config["image"] = "docker://" + image_name
|
|
@@ -138,7 +150,9 @@ def process_instance(
|
|
|
138
150
|
progress_manager.update_instance_status(instance_id, "Pulling/starting docker")
|
|
139
151
|
|
|
140
152
|
agent = None
|
|
141
|
-
|
|
153
|
+
exit_status = None
|
|
154
|
+
result = None
|
|
155
|
+
extra_info = {}
|
|
142
156
|
|
|
143
157
|
try:
|
|
144
158
|
env = get_sb_environment(config, instance)
|
|
@@ -149,21 +163,28 @@ def process_instance(
|
|
|
149
163
|
instance_id=instance_id,
|
|
150
164
|
**config.get("agent", {}),
|
|
151
165
|
)
|
|
152
|
-
|
|
166
|
+
info = agent.run(task)
|
|
167
|
+
exit_status = info.get("exit_status")
|
|
168
|
+
result = info.get("submission")
|
|
153
169
|
except Exception as e:
|
|
154
170
|
logger.error(f"Error processing instance {instance_id}: {e}", exc_info=True)
|
|
155
|
-
exit_status, result = type(e).__name__,
|
|
156
|
-
extra_info = {"traceback": traceback.format_exc()}
|
|
171
|
+
exit_status, result = type(e).__name__, ""
|
|
172
|
+
extra_info = {"traceback": traceback.format_exc(), "exception_str": str(e)}
|
|
157
173
|
finally:
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
174
|
+
if agent is not None:
|
|
175
|
+
traj_path = instance_dir / f"{instance_id}.traj.json"
|
|
176
|
+
agent.save(
|
|
177
|
+
traj_path,
|
|
178
|
+
{
|
|
179
|
+
"info": {
|
|
180
|
+
"exit_status": exit_status,
|
|
181
|
+
"submission": result,
|
|
182
|
+
**extra_info,
|
|
183
|
+
},
|
|
184
|
+
"instance_id": instance_id,
|
|
185
|
+
},
|
|
186
|
+
)
|
|
187
|
+
logger.info(f"Saved trajectory to '{traj_path}'")
|
|
167
188
|
update_preds_file(output_dir / "preds.json", instance_id, model.config.model_name, result)
|
|
168
189
|
progress_manager.on_instance_end(instance_id, exit_status)
|
|
169
190
|
|
|
@@ -201,8 +222,8 @@ def main(
|
|
|
201
222
|
model: str | None = typer.Option(None, "-m", "--model", help="Model to use", rich_help_panel="Basic"),
|
|
202
223
|
model_class: str | None = typer.Option(None, "--model-class", help="Model class to use (e.g., 'anthropic' or 'minisweagent.models.anthropic.AnthropicModel')", rich_help_panel="Advanced"),
|
|
203
224
|
redo_existing: bool = typer.Option(False, "--redo-existing", help="Redo existing instances", rich_help_panel="Data selection"),
|
|
204
|
-
config_spec:
|
|
205
|
-
environment_class: str | None = typer.Option(
|
|
225
|
+
config_spec: list[str] = typer.Option([str(DEFAULT_CONFIG_FILE)], "-c", "--config", help=_CONFIG_SPEC_HELP_TEXT, rich_help_panel="Basic"),
|
|
226
|
+
environment_class: str | None = typer.Option(None, "--environment-class", help="Environment type to use. Recommended are docker or singularity", rich_help_panel="Advanced"),
|
|
206
227
|
) -> None:
|
|
207
228
|
# fmt: on
|
|
208
229
|
output_path = Path(output)
|
|
@@ -210,6 +231,8 @@ def main(
|
|
|
210
231
|
logger.info(f"Results will be saved to {output_path}")
|
|
211
232
|
add_file_handler(output_path / "minisweagent.log")
|
|
212
233
|
|
|
234
|
+
from datasets import load_dataset
|
|
235
|
+
|
|
213
236
|
dataset_path = DATASET_MAPPING.get(subset, subset)
|
|
214
237
|
logger.info(f"Loading dataset {dataset_path}, split {split}...")
|
|
215
238
|
instances = list(load_dataset(dataset_path, split=split))
|
|
@@ -221,15 +244,13 @@ def main(
|
|
|
221
244
|
instances = [instance for instance in instances if instance["instance_id"] not in existing_instances]
|
|
222
245
|
logger.info(f"Running on {len(instances)} instances...")
|
|
223
246
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
if model_class is not None:
|
|
232
|
-
config.setdefault("model", {})["model_class"] = model_class
|
|
247
|
+
logger.info(f"Building agent config from specs: {config_spec}")
|
|
248
|
+
configs = [get_config_from_spec(spec) for spec in config_spec]
|
|
249
|
+
configs.append({
|
|
250
|
+
"environment": {"environment_class": environment_class or UNSET},
|
|
251
|
+
"model": {"model_name": model or UNSET, "model_class": model_class or UNSET},
|
|
252
|
+
})
|
|
253
|
+
config = recursive_merge(*configs)
|
|
233
254
|
|
|
234
255
|
progress_manager = RunBatchProgressManager(len(instances), output_path / f"exit_statuses_{time.time()}.yaml")
|
|
235
256
|
|