mini-swe-agent 1.17.5__py3-none-any.whl → 2.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/METADATA +36 -52
  2. mini_swe_agent-2.0.0a1.dist-info/RECORD +70 -0
  3. mini_swe_agent-2.0.0a1.dist-info/entry_points.txt +5 -0
  4. minisweagent/__init__.py +19 -26
  5. minisweagent/agents/default.py +128 -113
  6. minisweagent/agents/interactive.py +119 -58
  7. minisweagent/config/README.md +3 -4
  8. minisweagent/config/__init__.py +36 -1
  9. minisweagent/config/benchmarks/swebench.yaml +156 -0
  10. minisweagent/config/{extra/swebench.yaml → benchmarks/swebench_backticks.yaml} +69 -64
  11. minisweagent/config/benchmarks/swebench_modal.yaml +47 -0
  12. minisweagent/config/{extra → benchmarks}/swebench_xml.yaml +73 -70
  13. minisweagent/config/default.yaml +24 -21
  14. minisweagent/config/inspector.tcss +42 -0
  15. minisweagent/config/mini.yaml +53 -71
  16. minisweagent/config/{github_issue.yaml → mini_textbased.yaml} +43 -29
  17. minisweagent/environments/__init__.py +1 -0
  18. minisweagent/environments/docker.py +67 -20
  19. minisweagent/environments/extra/bubblewrap.py +86 -47
  20. minisweagent/environments/extra/swerex_docker.py +53 -20
  21. minisweagent/environments/extra/swerex_modal.py +90 -0
  22. minisweagent/environments/local.py +62 -21
  23. minisweagent/environments/singularity.py +59 -18
  24. minisweagent/exceptions.py +22 -0
  25. minisweagent/models/__init__.py +6 -7
  26. minisweagent/models/extra/roulette.py +20 -17
  27. minisweagent/models/litellm_model.py +90 -44
  28. minisweagent/models/litellm_response_model.py +80 -0
  29. minisweagent/models/litellm_textbased_model.py +45 -0
  30. minisweagent/models/openrouter_model.py +87 -45
  31. minisweagent/models/openrouter_response_model.py +123 -0
  32. minisweagent/models/openrouter_textbased_model.py +76 -0
  33. minisweagent/models/portkey_model.py +84 -42
  34. minisweagent/models/portkey_response_model.py +163 -0
  35. minisweagent/models/requesty_model.py +91 -41
  36. minisweagent/models/test_models.py +246 -19
  37. minisweagent/models/utils/actions_text.py +60 -0
  38. minisweagent/models/utils/actions_toolcall.py +102 -0
  39. minisweagent/models/utils/actions_toolcall_response.py +110 -0
  40. minisweagent/models/utils/anthropic_utils.py +28 -0
  41. minisweagent/models/utils/cache_control.py +15 -2
  42. minisweagent/models/utils/content_string.py +74 -0
  43. minisweagent/models/utils/openai_multimodal.py +50 -0
  44. minisweagent/models/utils/retry.py +25 -0
  45. minisweagent/run/benchmarks/__init__.py +1 -0
  46. minisweagent/run/{extra → benchmarks}/swebench.py +56 -35
  47. minisweagent/run/{extra → benchmarks}/swebench_single.py +36 -26
  48. minisweagent/run/{extra → benchmarks}/utils/batch_progress.py +1 -1
  49. minisweagent/run/hello_world.py +6 -0
  50. minisweagent/run/mini.py +54 -63
  51. minisweagent/run/utilities/__init__.py +1 -0
  52. minisweagent/run/{extra → utilities}/config.py +2 -0
  53. minisweagent/run/{inspector.py → utilities/inspector.py} +90 -11
  54. minisweagent/run/{mini_extra.py → utilities/mini_extra.py} +9 -5
  55. minisweagent/utils/serialize.py +26 -0
  56. mini_swe_agent-1.17.5.dist-info/RECORD +0 -61
  57. mini_swe_agent-1.17.5.dist-info/entry_points.txt +0 -5
  58. minisweagent/agents/interactive_textual.py +0 -450
  59. minisweagent/config/extra/swebench_roulette.yaml +0 -233
  60. minisweagent/config/mini.tcss +0 -86
  61. minisweagent/models/anthropic.py +0 -35
  62. minisweagent/models/litellm_response_api_model.py +0 -82
  63. minisweagent/models/portkey_response_api_model.py +0 -75
  64. minisweagent/models/utils/key_per_thread.py +0 -20
  65. minisweagent/models/utils/openai_utils.py +0 -41
  66. minisweagent/run/github_issue.py +0 -87
  67. minisweagent/run/utils/__init__.py +0 -0
  68. minisweagent/run/utils/save.py +0 -78
  69. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/WHEEL +0 -0
  70. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/licenses/LICENSE.md +0 -0
  71. {mini_swe_agent-1.17.5.dist-info → mini_swe_agent-2.0.0a1.dist-info}/top_level.txt +0 -0
  72. /minisweagent/config/{extra → benchmarks}/__init__.py +0 -0
  73. /minisweagent/run/{extra → benchmarks}/utils/__init__.py +0 -0
@@ -0,0 +1,60 @@
1
+ """Parse actions & format observations without toolcalls.
2
+ This was the method used for mini-swe-agent v1.0 and the original SWE-agent.
3
+ As of mini-swe-agent v2.0, we strongly recommend to use toolcalls instead.
4
+ """
5
+
6
+ import re
7
+ import time
8
+
9
+ from jinja2 import StrictUndefined, Template
10
+
11
+ from minisweagent.exceptions import FormatError
12
+ from minisweagent.models.utils.openai_multimodal import expand_multimodal_content
13
+
14
+
15
def parse_regex_actions(content: str, *, action_regex: str, format_error_template: str) -> list[dict]:
    """Extract shell actions from raw model text via *action_regex*.

    Exactly one action must be present; otherwise a ``FormatError`` carrying a
    rendered user-facing message is raised.
    """
    matches = re.findall(action_regex, content, re.DOTALL)
    actions = [match.strip() for match in matches]
    if len(actions) == 1:
        return [{"command": action} for action in actions]
    # Zero or multiple actions: render the error template and interrupt the agent.
    rendered = Template(format_error_template, undefined=StrictUndefined).render(actions=actions)
    raise FormatError(
        {
            "role": "user",
            "content": rendered,
            "extra": {
                "interrupt_type": "FormatError",
                "n_actions": len(actions),
                "model_response": content,
            },
        }
    )
31
+
32
+
33
def format_observation_messages(
    outputs: list[dict],
    *,
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Turn command execution *outputs* into user-role observation messages.

    Each output dict is rendered through *observation_template*; execution
    metadata is carried along under the "extra" key.
    """
    # Compile the template once; rendering is per-output.
    template = Template(observation_template, undefined=StrictUndefined)
    extra_vars = template_vars or {}
    messages: list[dict] = []
    for output in outputs:
        message: dict = {
            "role": "user",
            "content": template.render(output=output, **extra_vars),
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if multimodal_regex:
            # Split multimodal tags (e.g. images) into structured content parts.
            message = expand_multimodal_content(message, pattern=multimodal_regex)
        messages.append(message)
    return messages
@@ -0,0 +1,102 @@
1
+ """Parse actions & format observations with toolcalls"""
2
+
3
+ import json
4
+ import time
5
+
6
+ from jinja2 import StrictUndefined, Template
7
+
8
+ from minisweagent.exceptions import FormatError
9
+ from minisweagent.models.utils.openai_multimodal import expand_multimodal_content
10
+
11
# OpenAI chat-completions style tool schema: the function definition is nested
# under a "function" key (contrast with the flat Responses API schema).
BASH_TOOL = {
    "type": "function",
    "function": {
        "name": "bash",
        "description": "Execute a bash command",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {"type": "string", "description": "The bash command to execute"},
            },
            "required": ["command"],
        },
    },
}
28
+
29
+
30
def parse_toolcall_actions(tool_calls: list, *, format_error_template: str) -> list[dict]:
    """Convert chat-completions tool calls into bash actions.

    Raises ``FormatError`` when no tool calls are present, an unknown tool is
    requested, or the arguments cannot be parsed / lack a 'command' key.
    """

    def _fail(error: str) -> None:
        # All format problems are reported the same way: a rendered user message.
        raise FormatError(
            {
                "role": "user",
                "content": Template(format_error_template, undefined=StrictUndefined).render(error=error),
                "extra": {"interrupt_type": "FormatError"},
            }
        )

    if not tool_calls:
        _fail("No tool calls found in the response. Every response MUST include at least one tool call.")
    actions: list[dict] = []
    for tool_call in tool_calls:
        problems = ""
        args: dict = {}
        try:
            args = json.loads(tool_call.function.arguments)
        except Exception as e:
            problems = f"Error parsing tool call arguments: {e}. "
        if tool_call.function.name != "bash":
            problems += f"Unknown tool '{tool_call.function.name}'."
        if "command" not in args:
            problems += "Missing 'command' argument in bash tool call."
        if problems:
            _fail(problems.strip())
        actions.append({"command": args["command"], "tool_call_id": tool_call.id})
    return actions
66
+
67
+
68
def format_toolcall_observation_messages(
    *,
    actions: list[dict],
    outputs: list[dict],
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Render execution *outputs* as tool-result messages matched to *actions*.

    Actions without a corresponding output (e.g. an aborted batch) receive a
    placeholder result. Actions carrying a ``tool_call_id`` become role="tool"
    messages; others (human-issued commands) become role="user" messages.
    """
    template = Template(observation_template, undefined=StrictUndefined)
    extra_vars = template_vars or {}
    placeholder = {"output": "", "returncode": -1, "exception_info": "action was not executed"}
    padded = outputs + [placeholder] * (len(actions) - len(outputs))
    messages = []
    for action, output in zip(actions, padded):
        message: dict = {
            "content": template.render(output=output, **extra_vars),
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if "tool_call_id" in action:
            message["tool_call_id"] = action["tool_call_id"]
            message["role"] = "tool"
        else:
            message["role"] = "user"  # human issued commands
        if multimodal_regex:
            message = expand_multimodal_content(message, pattern=multimodal_regex)
        messages.append(message)
    return messages
@@ -0,0 +1,110 @@
1
+ """Parse actions & format observations for OpenAI Responses API toolcalls"""
2
+
3
+ import json
4
+ import time
5
+
6
+ from jinja2 import StrictUndefined, Template
7
+
8
+ from minisweagent.exceptions import FormatError
9
+
10
# OpenRouter/OpenAI Responses API uses a flat structure (no nested "function" key)
BASH_TOOL_RESPONSE_API = {
    "type": "function",
    "name": "bash",
    "description": "Execute a bash command",
    "parameters": {
        "type": "object",
        "properties": {
            "command": {"type": "string", "description": "The bash command to execute"},
        },
        "required": ["command"],
    },
}
26
+
27
+
28
+ def _format_error_message(error_text: str) -> dict:
29
+ """Create a FormatError message in Responses API format."""
30
+ return {
31
+ "type": "message",
32
+ "role": "user",
33
+ "content": [{"type": "input_text", "text": error_text}],
34
+ "extra": {"interrupt_type": "FormatError"},
35
+ }
36
+
37
+
38
def parse_toolcall_actions_response(output: list, *, format_error_template: str) -> list[dict]:
    """Extract bash actions from a Responses API ``output`` list.

    Only items of type "function_call" are considered. The Responses API puts
    name/arguments at the top level together with a call_id:
    {"type": "function_call", "call_id": "...", "name": "bash", "arguments": "..."}
    """

    def _raise(error: str) -> None:
        text = Template(format_error_template, undefined=StrictUndefined).render(error=error)
        raise FormatError(_format_error_message(text))

    def _as_dict(item) -> dict:
        # Accept pydantic-style objects as well as raw dicts.
        if hasattr(item, "model_dump"):
            return item.model_dump()
        return item if isinstance(item, dict) else dict(item)

    tool_calls = []
    for item in output:
        item_type = item.get("type") if isinstance(item, dict) else getattr(item, "type", None)
        if item_type == "function_call":
            tool_calls.append(_as_dict(item))
    if not tool_calls:
        _raise("No tool calls found in the response. Every response MUST include at least one tool call.")
    actions = []
    for tool_call in tool_calls:
        error_msg = ""
        args: dict = {}
        try:
            args = json.loads(tool_call.get("arguments", "{}"))
        except Exception as e:
            error_msg = f"Error parsing tool call arguments: {e}. "
        if tool_call.get("name") != "bash":
            error_msg += f"Unknown tool '{tool_call.get('name')}'."
        if "command" not in args:
            error_msg += "Missing 'command' argument in bash tool call."
        if error_msg:
            _raise(error_msg.strip())
        actions.append({"command": args["command"], "tool_call_id": tool_call.get("call_id") or tool_call.get("id")})
    return actions
74
+
75
+
76
def format_toolcall_observation_messages(
    *,
    actions: list[dict],
    outputs: list[dict],
    observation_template: str,
    template_vars: dict | None = None,
    multimodal_regex: str = "",
) -> list[dict]:
    """Render execution *outputs* as function_call_output items for the Responses API.

    NOTE(review): ``multimodal_regex`` is accepted for signature parity with the
    chat-completions variant but is currently ignored here — confirm whether
    multimodal expansion is intended for Responses API observations.
    """
    template = Template(observation_template, undefined=StrictUndefined)
    extra_vars = template_vars or {}
    # Actions that never ran still need a placeholder result.
    placeholder = {"output": "", "returncode": -1, "exception_info": "action was not executed"}
    padded = outputs + [placeholder] * (len(actions) - len(outputs))
    messages = []
    for action, output in zip(actions, padded):
        rendered = template.render(output=output, **extra_vars)
        message: dict = {
            "extra": {
                "raw_output": output.get("output", ""),
                "returncode": output.get("returncode"),
                "timestamp": time.time(),
                "exception_info": output.get("exception_info"),
                **output.get("extra", {}),
            },
        }
        if "tool_call_id" in action:
            # Tool-initiated action: report via a function_call_output item.
            message["type"] = "function_call_output"
            message["call_id"] = action["tool_call_id"]
            message["output"] = rendered
        else:  # human issued commands
            message["type"] = "message"
            message["role"] = "user"
            message["content"] = [{"type": "input_text", "text": rendered}]
        messages.append(message)
    return messages
@@ -0,0 +1,28 @@
1
+ """Utilities for Anthropic API compatibility."""
2
+
3
+
4
+ def _is_anthropic_thinking_block(block) -> bool:
5
+ """Check if a content block is a thinking-type block."""
6
+ if not isinstance(block, dict):
7
+ return False
8
+ return block.get("type") in ("thinking", "redacted_thinking")
9
+
10
+
11
+ def _reorder_anthropic_thinking_blocks(messages: list[dict]) -> list[dict]:
12
+ """Reorder thinking blocks so they are not the final block in assistant messages.
13
+
14
+ This is an Anthropic API requirement: thinking blocks must come before other blocks.
15
+ """
16
+ result = []
17
+ for msg in messages:
18
+ if msg.get("role") == "assistant" and isinstance(msg.get("content"), list):
19
+ content = msg["content"]
20
+ thinking_blocks = [b for b in content if _is_anthropic_thinking_block(b)]
21
+ if thinking_blocks:
22
+ other_blocks = [b for b in content if not _is_anthropic_thinking_block(b)]
23
+ if other_blocks:
24
+ msg = {**msg, "content": thinking_blocks + other_blocks}
25
+ else:
26
+ msg = {**msg, "content": thinking_blocks + [{"type": "text", "text": ""}]}
27
+ result.append(msg)
28
+ return result
@@ -1,9 +1,15 @@
1
+ """Cache control utilities are mostly for Anthropic models.
2
+ They are used to explicitly set cache control points.
3
+ """
4
+
1
5
  import copy
2
6
  import warnings
3
7
  from typing import Literal
4
8
 
5
9
 
6
- def _get_content_text(entry: dict) -> str:
10
+ def _get_content_text(entry: dict) -> str | None:
11
+ if entry["content"] is None:
12
+ return None
7
13
  if isinstance(entry["content"], str):
8
14
  return entry["content"]
9
15
  assert len(entry["content"]) == 1, "Expected single message in content"
@@ -14,10 +20,16 @@ def _clear_cache_control(entry: dict) -> None:
14
20
  if isinstance(entry["content"], list):
15
21
  assert len(entry["content"]) == 1, "Expected single message in content"
16
22
  entry["content"][0].pop("cache_control", None)
23
+ # Note: entry["content"] can be None for assistant messages with only tool_use
17
24
  entry.pop("cache_control", None)
18
25
 
19
26
 
20
27
  def _set_cache_control(entry: dict) -> None:
28
+ # Handle None content (e.g., assistant messages with only tool_use)
29
+ if entry["content"] is None:
30
+ entry["cache_control"] = {"type": "ephemeral"}
31
+ return
32
+
21
33
  if not isinstance(entry["content"], list):
22
34
  entry["content"] = [ # type: ignore
23
35
  {
@@ -38,7 +50,8 @@ def set_cache_control(
38
50
  messages: list[dict], *, mode: Literal["default_end"] | None = "default_end", last_n_messages_offset: int = 0
39
51
  ) -> list[dict]:
40
52
  """This messages processor adds manual cache control marks to the messages."""
41
- # ONLY ADD TO THE LAST MESSAGE
53
+ if mode is None:
54
+ return messages
42
55
  if mode != "default_end":
43
56
  raise ValueError(f"Invalid mode: {mode}")
44
57
  if last_n_messages_offset:
@@ -0,0 +1,74 @@
1
+ """Helper function for pretty-printing content strings."""
2
+
3
+ import json
4
+
5
+
6
+ def _format_tool_call(args_str: str) -> str:
7
+ """Format tool call arguments, extracting command if it's a bash call."""
8
+ try:
9
+ args = json.loads(args_str) if isinstance(args_str, str) else args_str
10
+ if isinstance(args, dict) and "command" in args:
11
+ return f"```\n{args['command']}\n```"
12
+ except Exception:
13
+ pass
14
+ return f"```\n{args_str}\n```"
15
+
16
+
17
+ def _format_observation(content: str) -> str | None:
18
+ """Try to format an observation JSON as key-value pairs."""
19
+ try:
20
+ data = json.loads(content)
21
+ if isinstance(data, dict) and "returncode" in data:
22
+ lines = []
23
+ for key, value in data.items():
24
+ lines.append(f"<{key}>")
25
+ lines.append(str(value))
26
+ return "\n".join(lines)
27
+ return content
28
+ except Exception:
29
+ return content
30
+
31
+
32
+ def get_content_string(message: dict) -> str:
33
+ """Extract text content from any message format for display.
34
+
35
+ Handles:
36
+ - Traditional chat: {"content": "text"}
37
+ - Multimodal chat: {"content": [{"type": "text", "text": "..."}]}
38
+ - Observation messages: {"content": "{\"returncode\": 0, \"output\": \"...\"}"}
39
+ - Traditional tool calls: {"tool_calls": [{"function": {"name": "...", "arguments": "..."}}]}
40
+ - Responses API: {"output": [{"type": "message", "content": [...]}]}
41
+ """
42
+ texts = []
43
+
44
+ # Extract content (string or multimodal list)
45
+ content = message.get("content")
46
+ if isinstance(content, str):
47
+ texts.append(_format_observation(content))
48
+ elif isinstance(content, list):
49
+ texts.append("\n".join(item.get("text", "") for item in content if isinstance(item, dict)))
50
+
51
+ # Handle traditional tool_calls format (OpenAI/LiteLLM style)
52
+ if tool_calls := message.get("tool_calls"):
53
+ for tc in tool_calls:
54
+ func = tc.get("function", {}) if isinstance(tc, dict) else getattr(tc, "function", None)
55
+ if func:
56
+ args = func.get("arguments", "{}") if isinstance(func, dict) else getattr(func, "arguments", "{}")
57
+ texts.append(_format_tool_call(args))
58
+
59
+ # Handle Responses API format (output array)
60
+ if output := message.get("output"):
61
+ if isinstance(output, str):
62
+ texts.append(_format_observation(output))
63
+ elif isinstance(output, list):
64
+ for item in output:
65
+ if not isinstance(item, dict):
66
+ continue
67
+ if item.get("type") == "message":
68
+ for c in item.get("content", []):
69
+ if isinstance(c, dict) and (text := c.get("text")):
70
+ texts.append(text)
71
+ elif item.get("type") == "function_call":
72
+ texts.append(_format_tool_call(item.get("arguments", "{}")))
73
+
74
+ return "\n\n".join(t for t in texts if t)
@@ -0,0 +1,50 @@
1
+ """Utilities for handling multimodal content in OpenAI-style messages."""
2
+
3
+ import copy
4
+ import re
5
+ from typing import Any
6
+
7
+ DEFAULT_MULTIMODAL_REGEX = (
8
+ r"(?s)<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>(.+?)</CONTENT_TYPE>(.+?)</MSWEA_MULTIMODAL_CONTENT>"
9
+ )
10
+
11
+
12
+ def _expand_content_string(*, content: str, pattern: str) -> list[dict]:
13
+ """Expand a content string, replacing multimodal tags with structured content."""
14
+ matches = list(re.finditer(pattern, content))
15
+ if not matches:
16
+ return [{"type": "text", "text": content}]
17
+ result = []
18
+ last_end = 0
19
+ for match in matches:
20
+ text_before = content[last_end : match.start()]
21
+ if text_before:
22
+ result.append({"type": "text", "text": text_before})
23
+ content_type = match.group(1).strip()
24
+ extracted = match.group(2).strip()
25
+ if content_type == "image_url":
26
+ result.append({"type": "image_url", "image_url": {"url": extracted}})
27
+ last_end = match.end()
28
+ text_after = content[last_end:]
29
+ if text_after:
30
+ result.append({"type": "text", "text": text_after})
31
+ return result
32
+
33
+
34
def expand_multimodal_content(content: Any, *, pattern: str) -> Any:
    """Recursively expand multimodal tags in *content*.

    Strings are split into structured parts, lists are processed element-wise,
    and dicts are processed through their "content" key. Anything else is
    stringified. With an empty pattern, the input is returned untouched.
    Note: the input is never mutated; a deep copy is returned.
    """
    if not pattern:
        return content
    content = copy.deepcopy(content)
    if isinstance(content, str):
        return _expand_content_string(content=content, pattern=pattern)
    if isinstance(content, list):
        return [expand_multimodal_content(element, pattern=pattern) for element in content]
    if isinstance(content, dict):
        if "content" in content:
            content["content"] = expand_multimodal_content(content["content"], pattern=pattern)
        return content
    return str(content)
@@ -0,0 +1,25 @@
1
+ """Retry utility for model queries."""
2
+
3
+ import logging
4
+ import os
5
+
6
+ from tenacity import Retrying, before_sleep_log, retry_if_not_exception_type, stop_after_attempt, wait_exponential
7
+
8
+
9
def retry(*, logger: logging.Logger, abort_exceptions: list[type[Exception]]) -> Retrying:
    """Build a tenacity ``Retrying`` controller with project-wide defaults.

    Args:
        logger: Logger used to report retries (at WARNING level before each sleep).
        abort_exceptions: Exception types that abort immediately instead of retrying.

    Returns:
        A tenacity.Retrying object.
    """
    # Max attempts is overridable via the MSWEA_MODEL_RETRY_STOP_AFTER_ATTEMPT env var.
    max_attempts = int(os.getenv("MSWEA_MODEL_RETRY_STOP_AFTER_ATTEMPT", "10"))
    return Retrying(
        reraise=True,
        stop=stop_after_attempt(max_attempts),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        retry=retry_if_not_exception_type(tuple(abort_exceptions)),
    )
@@ -0,0 +1 @@
1
+ """Benchmark run scripts for mini-SWE-agent (e.g., SWE-bench)."""
@@ -13,19 +13,17 @@ import traceback
13
13
  from pathlib import Path
14
14
 
15
15
  import typer
16
- import yaml
17
- from datasets import load_dataset
18
16
  from jinja2 import StrictUndefined, Template
19
17
  from rich.live import Live
20
18
 
21
19
  from minisweagent import Environment
22
20
  from minisweagent.agents.default import DefaultAgent
23
- from minisweagent.config import builtin_config_dir, get_config_path
21
+ from minisweagent.config import builtin_config_dir, get_config_from_spec
24
22
  from minisweagent.environments import get_environment
25
23
  from minisweagent.models import get_model
26
- from minisweagent.run.extra.utils.batch_progress import RunBatchProgressManager
27
- from minisweagent.run.utils.save import save_traj
24
+ from minisweagent.run.benchmarks.utils.batch_progress import RunBatchProgressManager
28
25
  from minisweagent.utils.log import add_file_handler, logger
26
+ from minisweagent.utils.serialize import UNSET, recursive_merge
29
27
 
30
28
  _HELP_TEXT = """Run mini-SWE-agent on SWEBench instances.
31
29
 
@@ -34,7 +32,23 @@ More information about the usage: [bold green]https://mini-swe-agent.com/latest/
34
32
  [/not dim]
35
33
  """
36
34
 
37
- app = typer.Typer(rich_markup_mode="rich", add_completion=False)
35
+ _CONFIG_SPEC_HELP_TEXT = """Path to config files, filenames, or key-value pairs.
36
+
37
+ [bold red]IMPORTANT:[/bold red] [red]If you set this option, the default config file will not be used.[/red]
38
+ So you need to explicitly set it e.g., with [bold green]-c swebench.yaml <other options>[/bold green]
39
+
40
+ Multiple configs will be recursively merged.
41
+
42
+ Examples:
43
+
44
+ [bold red]-c model.model_kwargs.temperature=0[/bold red] [red]You forgot to add the default config file! See above.[/red]
45
+
46
+ [bold green]-c swebench.yaml -c model.model_kwargs.temperature=0.5[/bold green]
47
+
48
+ [bold green]-c swebench.yaml -c agent.max_iterations=50[/bold green]
49
+ """
50
+
51
+ DEFAULT_CONFIG_FILE = builtin_config_dir / "benchmarks" / "swebench.yaml"
38
52
 
39
53
  DATASET_MAPPING = {
40
54
  "full": "princeton-nlp/SWE-Bench",
@@ -46,7 +60,7 @@ DATASET_MAPPING = {
46
60
  "_test": "klieret/swe-bench-dummy-test-dataset",
47
61
  }
48
62
 
49
-
63
+ app = typer.Typer(rich_markup_mode="rich", add_completion=False)
50
64
  _OUTPUT_FILE_LOCK = threading.Lock()
51
65
 
52
66
 
@@ -60,9 +74,7 @@ class ProgressTrackingAgent(DefaultAgent):
60
74
 
61
75
  def step(self) -> dict:
62
76
  """Override step to provide progress updates."""
63
- self.progress_manager.update_instance_status(
64
- self.instance_id, f"Step {self.model.n_calls + 1:3d} (${self.model.cost:.2f})"
65
- )
77
+ self.progress_manager.update_instance_status(self.instance_id, f"Step {self.n_calls + 1:3d} (${self.cost:.2f})")
66
78
  return super().step()
67
79
 
68
80
 
@@ -81,7 +93,7 @@ def get_sb_environment(config: dict, instance: dict) -> Environment:
81
93
  env_config = config.setdefault("environment", {})
82
94
  env_config["environment_class"] = env_config.get("environment_class", "docker")
83
95
  image_name = get_swebench_docker_image_name(instance)
84
- if env_config["environment_class"] == "docker":
96
+ if env_config["environment_class"] in ["docker", "swerex_modal"]:
85
97
  env_config["image"] = image_name
86
98
  elif env_config["environment_class"] == "singularity":
87
99
  env_config["image"] = "docker://" + image_name
@@ -138,7 +150,9 @@ def process_instance(
138
150
  progress_manager.update_instance_status(instance_id, "Pulling/starting docker")
139
151
 
140
152
  agent = None
141
- extra_info = None
153
+ exit_status = None
154
+ result = None
155
+ extra_info = {}
142
156
 
143
157
  try:
144
158
  env = get_sb_environment(config, instance)
@@ -149,21 +163,28 @@ def process_instance(
149
163
  instance_id=instance_id,
150
164
  **config.get("agent", {}),
151
165
  )
152
- exit_status, result = agent.run(task)
166
+ info = agent.run(task)
167
+ exit_status = info.get("exit_status")
168
+ result = info.get("submission")
153
169
  except Exception as e:
154
170
  logger.error(f"Error processing instance {instance_id}: {e}", exc_info=True)
155
- exit_status, result = type(e).__name__, str(e)
156
- extra_info = {"traceback": traceback.format_exc()}
171
+ exit_status, result = type(e).__name__, ""
172
+ extra_info = {"traceback": traceback.format_exc(), "exception_str": str(e)}
157
173
  finally:
158
- save_traj(
159
- agent,
160
- instance_dir / f"{instance_id}.traj.json",
161
- exit_status=exit_status,
162
- result=result,
163
- extra_info=extra_info,
164
- instance_id=instance_id,
165
- print_fct=logger.info,
166
- )
174
+ if agent is not None:
175
+ traj_path = instance_dir / f"{instance_id}.traj.json"
176
+ agent.save(
177
+ traj_path,
178
+ {
179
+ "info": {
180
+ "exit_status": exit_status,
181
+ "submission": result,
182
+ **extra_info,
183
+ },
184
+ "instance_id": instance_id,
185
+ },
186
+ )
187
+ logger.info(f"Saved trajectory to '{traj_path}'")
167
188
  update_preds_file(output_dir / "preds.json", instance_id, model.config.model_name, result)
168
189
  progress_manager.on_instance_end(instance_id, exit_status)
169
190
 
@@ -201,8 +222,8 @@ def main(
201
222
  model: str | None = typer.Option(None, "-m", "--model", help="Model to use", rich_help_panel="Basic"),
202
223
  model_class: str | None = typer.Option(None, "--model-class", help="Model class to use (e.g., 'anthropic' or 'minisweagent.models.anthropic.AnthropicModel')", rich_help_panel="Advanced"),
203
224
  redo_existing: bool = typer.Option(False, "--redo-existing", help="Redo existing instances", rich_help_panel="Data selection"),
204
- config_spec: Path = typer.Option( builtin_config_dir / "extra" / "swebench.yaml", "-c", "--config", help="Path to a config file", rich_help_panel="Basic"),
205
- environment_class: str | None = typer.Option( None, "--environment-class", help="Environment type to use. Recommended are docker or singularity", rich_help_panel="Advanced"),
225
+ config_spec: list[str] = typer.Option([str(DEFAULT_CONFIG_FILE)], "-c", "--config", help=_CONFIG_SPEC_HELP_TEXT, rich_help_panel="Basic"),
226
+ environment_class: str | None = typer.Option(None, "--environment-class", help="Environment type to use. Recommended are docker or singularity", rich_help_panel="Advanced"),
206
227
  ) -> None:
207
228
  # fmt: on
208
229
  output_path = Path(output)
@@ -210,6 +231,8 @@ def main(
210
231
  logger.info(f"Results will be saved to {output_path}")
211
232
  add_file_handler(output_path / "minisweagent.log")
212
233
 
234
+ from datasets import load_dataset
235
+
213
236
  dataset_path = DATASET_MAPPING.get(subset, subset)
214
237
  logger.info(f"Loading dataset {dataset_path}, split {split}...")
215
238
  instances = list(load_dataset(dataset_path, split=split))
@@ -221,15 +244,13 @@ def main(
221
244
  instances = [instance for instance in instances if instance["instance_id"] not in existing_instances]
222
245
  logger.info(f"Running on {len(instances)} instances...")
223
246
 
224
- config_path = get_config_path(config_spec)
225
- logger.info(f"Loading agent config from '{config_path}'")
226
- config = yaml.safe_load(config_path.read_text())
227
- if environment_class is not None:
228
- config.setdefault("environment", {})["environment_class"] = environment_class
229
- if model is not None:
230
- config.setdefault("model", {})["model_name"] = model
231
- if model_class is not None:
232
- config.setdefault("model", {})["model_class"] = model_class
247
+ logger.info(f"Building agent config from specs: {config_spec}")
248
+ configs = [get_config_from_spec(spec) for spec in config_spec]
249
+ configs.append({
250
+ "environment": {"environment_class": environment_class or UNSET},
251
+ "model": {"model_name": model or UNSET, "model_class": model_class or UNSET},
252
+ })
253
+ config = recursive_merge(*configs)
233
254
 
234
255
  progress_manager = RunBatchProgressManager(len(instances), output_path / f"exit_statuses_{time.time()}.yaml")
235
256