PyPI - mantis-agent-sdk - Versions diffs - 2.3.0__tar.gz → 2.4.0__tar.gz - Mend

mantis-agent-sdk 2.3.0 (tar.gz) → 2.4.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (225) hide show

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/CHANGELOG.md RENAMED Viewed

@@ -74,6 +74,21 @@ The full versioning policy is in [SEMVER.md](SEMVER.md).
   Three new public exports: `ResponseFormatError`,
   `normalize_response_format`, `translate_response_format`.
+## [2.4.0] — 2026-06-30
+### Added
+- **Refusal recovery.** When the model ends a turn with a bare, no-tool-call
+  refusal ("I'm sorry, but I can't complete that request") — the spurious
+  over-refusals small/aligned models emit on perfectly legitimate local work
+  (listing processes/ports, reading your own files, running builds) — the agent
+  now nudges it ONCE with a reminder that it's operating in the user's own
+  authorized environment and re-prompts, instead of dead-ending the task. Capped
+  at one retry per run, so a genuinely harmful request is simply refused again
+  and stops. New `Agent.recover_refusals` flag (default True; set False to opt
+  out). New `_looks_like_refusal` detector (length-capped + precise, so a long
+  answer or an "I can't find that file" isn't misread).
 ## [2.3.0] — 2026-06-30
 ### Added

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mantis-agent-sdk
-Version: 2.3.0
+Version: 2.4.0
 Summary: Drop-in open-source agent SDK. Multi-model, streaming, MCP, sub-agents.
 Project-URL: Homepage, https://github.com/teddyoweh/mantis-agent-sdk
 Project-URL: Repository, https://github.com/teddyoweh/mantis-agent-sdk

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/mantis_agent/__init__.py RENAMED Viewed

@@ -284,7 +284,7 @@ def _detect_version() -> str:
         return version("mantis-agent-sdk")
     except Exception:  # pragma: no cover - extremely defensive
-        return "2.3.0"
+        return "2.4.0"
 __version__ = _detect_version()

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/mantis_agent/agent.py RENAMED Viewed

@@ -117,6 +117,48 @@ def _render_todo_reminder(todos: list[dict[str, Any]]) -> str:
     return wrap_system_reminder("\n".join(lines))
+_REFUSAL_RE = re.compile(
+    r"\b("
+    r"i'?m sorry,?\s+but\s+i\s+(can'?t|cannot)"
+    r"|i\s+(can'?t|cannot)\s+(help|assist|comply|complete|continue|do that|fulf-?ill|proceed)"
+    r"|i'?m\s+(not able|unable)\s+to\s+(help|assist|complete|comply|proceed|do)"
+    r"|i\s+(won'?t|will not)\s+(be able to\s+)?(help|assist|comply|complete)"
+    r"|i\s+(must|have to)\s+decline"
+    r"|i\s+can'?t\s+assist\s+with\s+that"
+    r")\b",
+    re.IGNORECASE,
+)
+def _looks_like_refusal(text: str) -> bool:
+    """A short assistant turn that reads as a canned refusal. Length-capped so a
+    long, substantive answer that merely contains a hedge isn't misread."""
+    t = (text or "").strip()
+    if not t or len(t) > 600:
+        return False
+    return bool(_REFUSAL_RE.search(t))
+def _refusal_nudge() -> "UserMessage":
+    """A one-shot system-reminder that re-frames the authorized context, so a
+    spurious over-refusal doesn't dead-end a legitimate local task."""
+    from .system_reminder import wrap_system_reminder  # noqa: PLC0415
+    return UserMessage(
+        content=wrap_system_reminder(
+            "You just declined — but you are the user's own coding agent running "
+            "inside THEIR authorized local environment, at their request. Routine "
+            "development and sysadmin on the user's own machine — listing "
+            "processes/ports, reading local files, running builds/tests/scripts, "
+            "inspecting the user's own services — is legitimate and expected, not "
+            "something to refuse. Reconsider and actually complete the task. Only if "
+            "a request is genuinely harmful, name the specific concern instead of a "
+            "generic refusal."
+        ),
+        isMeta=True,
+    )
 _SHELL_FENCE_LANGS = {"bash", "sh", "shell", "zsh", "console", "shellsession"}
 _FENCE_RE = re.compile(r"```([a-zA-Z]*)[ \t]*\n(.*?)```", re.DOTALL)
@@ -208,6 +250,12 @@ class Agent:
     # ``max_steps`` budget (and minutes of wall-clock) re-running an identical
     # failing command. 0 disables the guard.
     max_repeated_tool_calls: int = 3
+    # Refusal recovery: if the model ends a turn with a bare, no-tool-call
+    # refusal ("I'm sorry, but I can't complete that request"), nudge it ONCE
+    # with a reminder that it's the user's own authorized environment and let it
+    # retry, instead of dead-ending the task on a spurious over-refusal. A
+    # genuinely harmful request just gets refused again and stops. 0/False off.
+    recover_refusals: bool = True
     extra: dict[str, Any] | None = None
     # Capability + safety surface (M0.1 / M2)
@@ -287,6 +335,7 @@ class Agent:
     _env_context: str | None = field(default=None, init=False)
     # Set once the fallback model has been activated, so we don't loop.
     _fallback_used: bool = field(default=False, init=False)
+    _refusal_retried: bool = field(default=False, init=False)
     # Absolute paths of memory files already surfaced this session, so recall
     # doesn't re-inject the same note every turn.
     _surfaced: set[str] = field(default_factory=set, init=False)
@@ -736,6 +785,7 @@ class Agent:
         last_usage: Usage | None = None
         compactions = 0
         _MAX_COMPACTIONS = 5
+        self._refusal_retried = False
         for _ in range(self.max_steps):
             # If the cancellation signal already fired BEFORE this turn
@@ -991,6 +1041,25 @@ class Agent:
                 tool_uses = [
                     b for b in assistant.content if isinstance(b, ToolUseBlock)
                 ]
+                if not tool_uses and self.recover_refusals and not self._refusal_retried:
+                    # Bare, no-tool-call refusal? Nudge ONCE with the authorized-
+                    # context reminder and re-prompt instead of dead-ending. A
+                    # ``continue`` exits this turn's ``async with executor`` cleanly
+                    # (no tools were dispatched) and re-streams with the nudge.
+                    _text = "".join(
+                        b.text for b in assistant.content if isinstance(b, TextBlock)
+                    )
+                    if _looks_like_refusal(_text):
+                        self._refusal_retried = True
+                        messages.append(_refusal_nudge())
+                        if turn_span is not None and self.tracer is not None:
+                            turn_span.set_attributes({"turn.refusal_recovered": True})
+                            turn_span.end()
+                            mirror = getattr(self.tracer, "_mirror", None) or self.tracer
+                            close_fn = getattr(mirror, "_close", None)
+                            if callable(close_fn):
+                                close_fn(turn_span)
+                        continue
                 if not tool_uses:
                     # Natural turn-end. Fire Stop hook and exit cleanly —
                     # the executor's ``__aexit__`` releases its task group

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/mantis_agent/setup_wizard.py RENAMED Viewed

@@ -391,6 +391,8 @@ def _pick_model_id(c: Any, models: list[str], *, current: str | None = None) ->
     The current default (if any) is pre-highlighted. Returns the id or None."""
     from rich.text import Text  # noqa: PLC0415
+    if not models:  # a provider that returned nothing — nothing to pick
+        return None
     shown = models[:30]
     rows = [(m, "← current" if m == current else "") for m in shown]
     start = shown.index(current) if current in shown else 0

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/mantis_agent/tui.py RENAMED Viewed

@@ -1045,13 +1045,15 @@ class MantisTUI:
             if "localhost" in (self.backend or "") or "127.0.0.1" in (self.backend or ""):
                 self.console.print(
                     f"[ansiyellow]![/] [ansibrightblack]can't reach Ollama at "
-                    f"{self.backend} — is it running? ([white]ollama serve[/])[/]"
+                    f"{self.backend}. Run [white]mantis setup[/] to get a model "
+                    f"(local or hosted), or start Ollama ([white]ollama serve[/]).[/]"
                 )
             return
         if not available:
             self.console.print(
                 f"[ansiyellow]![/] [ansibrightblack]no models installed on "
-                f"{self.backend}. Pull one:[/] [white]ollama pull {self.model}[/]"
+                f"{self.backend}. Run [white]mantis setup[/] to add one, or "
+                f"[white]ollama pull {self.model}[/].[/]"
             )
             return
         picked = self._pick_model(self.model, available)

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "mantis-agent-sdk"
-version = "2.3.0"
+version = "2.4.0"
 description = "Drop-in open-source agent SDK. Multi-model, streaming, MCP, sub-agents."
 readme = "README.md"
 requires-python = ">=3.11"

{mantis_agent_sdk-2.3.0 → mantis_agent_sdk-2.4.0}/tests/test_model_setup_sources.py RENAMED Viewed

@@ -108,6 +108,140 @@ def test_selfhost_probe_unreachable_returns_none() -> None:
 # -- Model ping (validate-before-save) ---------------------------------------
+def test_hosted_flow_end_to_end_saves_model(monkeypatch, tmp_path) -> None:
+    # Drive the WHOLE hosted setup orchestration (not just helpers): pick a
+    # provider → paste key → validate → pick a model → confirm → save. Mocks the
+    # network + I/O; asserts the model is persisted as the default.
+    monkeypatch.setenv("MANTIS_AGENT_HOME", str(tmp_path))
+    monkeypatch.delenv("DEEPSEEK_API_KEY", raising=False)
+    from mantis_agent import setup_wizard as sw
+    inputs = iter(["1", "1"])  # provider #1 (DeepSeek), then model #1
+    monkeypatch.setattr("builtins.input", lambda *a: next(inputs))
+    monkeypatch.setattr("getpass.getpass", lambda *a: "sk-test-key")
+    monkeypatch.setattr(catalog, "validate_provider", lambda *a, **k: (True, "ok"))
+    monkeypatch.setattr(catalog, "refresh_live_models", lambda *a, **k: ["deepseek-chat", "deepseek-reasoner"])
+    monkeypatch.setattr(sw, "_confirm_model", lambda *a, **k: True)
+    try:
+        rc = sw._run_hosted(_NullConsole(), free_only=False)
+        assert rc == 0
+        last = catalog.get_last_model()
+        assert last and last["model"] == "deepseek-chat"
+        assert last["backend"] == catalog.BY_ID["deepseek"].base_url
+    finally:
+        catalog.clear_key("deepseek")
+def test_hosted_flow_aborts_when_key_invalid(monkeypatch, tmp_path) -> None:
+    # A rejected key must NOT save anything and must clear the bad key.
+    monkeypatch.setenv("MANTIS_AGENT_HOME", str(tmp_path))
+    monkeypatch.delenv("DEEPSEEK_API_KEY", raising=False)
+    from mantis_agent import setup_wizard as sw
+    monkeypatch.setattr("builtins.input", lambda *a: "1")
+    monkeypatch.setattr("getpass.getpass", lambda *a: "bad-key")
+    monkeypatch.setattr(catalog, "validate_provider", lambda *a, **k: (False, "invalid API key"))
+    rc = sw._run_hosted(_NullConsole(), free_only=False)
+    assert rc == 1
+    assert catalog.saved_key("deepseek") is None
+def test_selfhost_flow_end_to_end_saves_model(monkeypatch, tmp_path) -> None:
+    # URL → probe /v1/models → pick → confirm → save backend+model.
+    monkeypatch.setenv("MANTIS_AGENT_HOME", str(tmp_path))
+    from mantis_agent import setup_wizard as sw
+    inputs = iter(["http://localhost:9911/v1", "1"])  # base URL, then model #1
+    monkeypatch.setattr("builtins.input", lambda *a: next(inputs))
+    monkeypatch.setattr("getpass.getpass", lambda *a: "")  # local server, no key
+    monkeypatch.setattr(sw, "_probe_openai_models", lambda *a, **k: ["local-coder"])
+    monkeypatch.setattr(sw, "_confirm_model", lambda *a, **k: True)
+    rc = sw._run_selfhost(_NullConsole())
+    assert rc == 0
+    last = catalog.get_last_model()
+    assert last and last["model"] == "local-coder"
+    assert last["backend"] == "http://localhost:9911/v1"
+def test_anthropic_apikey_flow_end_to_end(monkeypatch, tmp_path) -> None:
+    # Claude auth chooser → API key → validate → pick model → save.
+    monkeypatch.setenv("MANTIS_AGENT_HOME", str(tmp_path))
+    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+    from mantis_agent import setup_wizard as sw
+    inputs = iter(["1", "1"])  # auth method #1 (API key), then model #1
+    monkeypatch.setattr("builtins.input", lambda *a: next(inputs))
+    monkeypatch.setattr("getpass.getpass", lambda *a: "sk-ant-key")
+    monkeypatch.setattr(sw, "_ping_anthropic_model", lambda *a, **k: (True, "ok"))
+    try:
+        rc = sw._run_anthropic(_NullConsole(), catalog.BY_ID["anthropic"])
+        assert rc == 0
+        last = catalog.get_last_model()
+        assert last and last["model"].startswith("claude-")
+        assert catalog.saved_key("anthropic") == "sk-ant-key"
+    finally:
+        catalog.clear_key("anthropic")
+def test_local_flow_end_to_end_saves_model(monkeypatch, tmp_path) -> None:
+    # Local Ollama flow: ensure server → pull → verify → save as default.
+    # Mocks the ollama subprocess/daemon; asserts the tag is persisted @ 11434.
+    import subprocess
+    import types
+    monkeypatch.setenv("MANTIS_AGENT_HOME", str(tmp_path))
+    from mantis_agent import setup_local
+    from mantis_agent import setup_wizard as sw
+    monkeypatch.setattr(setup_local, "is_ollama_installed", lambda: True)
+    monkeypatch.setattr(setup_local, "start_ollama_server", lambda: (True, ""))
+    monkeypatch.setattr(subprocess, "call", lambda *a, **k: 0)  # the `ollama pull`
+    monkeypatch.setattr(sw, "_ollama_has", lambda tag: True)
+    args = types.SimpleNamespace(model="qwen2.5-coder:7b", list_only=False, auto=False)
+    rc = sw._run_local(_NullConsole(), args)
+    assert rc == 0
+    last = catalog.get_last_model()
+    assert last and last["model"] == "qwen2.5-coder:7b"
+    assert "11434" in (last["backend"] or "")
+def test_local_flow_aborts_when_pull_fails(monkeypatch, tmp_path) -> None:
+    import subprocess
+    import types
+    monkeypatch.setenv("MANTIS_AGENT_HOME", str(tmp_path))
+    from mantis_agent import setup_local
+    from mantis_agent import setup_wizard as sw
+    monkeypatch.setattr(setup_local, "is_ollama_installed", lambda: True)
+    monkeypatch.setattr(setup_local, "start_ollama_server", lambda: (True, ""))
+    monkeypatch.setattr(subprocess, "call", lambda *a, **k: 1)  # pull fails
+    args = types.SimpleNamespace(model="qwen2.5-coder:7b", list_only=False, auto=False)
+    assert sw._run_local(_NullConsole(), args) == 1
+def test_run_setup_entry_points_exit_cleanly_on_cancel(monkeypatch, tmp_path) -> None:
+    # Every `mantis setup [flag]` entry point must exit cleanly (0 or 1) even when
+    # the user cancels at the first prompt — never propagate an exception. This
+    # codifies the live-binary smoke test as a regression guard.
+    monkeypatch.setenv("MANTIS_AGENT_HOME", str(tmp_path))
+    from mantis_agent.setup_wizard import run_setup
+    def _eof(*_a: object) -> str:
+        raise EOFError
+    monkeypatch.setattr("builtins.input", _eof)
+    monkeypatch.setattr("getpass.getpass", _eof)
+    for argv in ([], ["--status"], ["--list"], ["--hosted"], ["--free"], ["--selfhost"]):
+        rc = run_setup(argv)
+        assert rc in (0, 1), f"{argv} returned {rc!r}"
 def test_print_status_never_crashes() -> None:
     # `mantis setup --status` must render whatever the config is (or nothing)
     # without raising — it runs before any provider is even set up.
@@ -190,6 +324,13 @@ class _NullConsole:
         pass
+def test_pick_model_id_empty_list_returns_none() -> None:
+    # A provider that returned no models must not crash the picker (was IndexError
+    # on the "Enter=<first>" prompt) — it returns None so the caller can bail.
+    from mantis_agent import setup_wizard as sw
+    assert sw._pick_model_id(_NullConsole(), []) is None
 def test_pick_model_id_numeric_fallback(monkeypatch) -> None:
     from mantis_agent import setup_wizard as sw
     monkeypatch.setattr("builtins.input", lambda *a: "2")

mantis_agent_sdk-2.4.0/tests/test_refusal_recovery.py ADDED Viewed

@@ -0,0 +1,104 @@
+"""Refusal recovery — a bare no-tool-call refusal is nudged once and retried
+instead of dead-ending the task."""
+from __future__ import annotations
+from typing import Any
+import anyio
+from mantis_agent.agent import Agent, _looks_like_refusal
+from mantis_agent.capabilities import HOSTED_PROFILES
+from mantis_agent.events import (
+    ContentBlockDelta,
+    ContentBlockStart,
+    ContentBlockStop,
+    MessageDelta,
+    MessageStart,
+    MessageStop,
+    TextDelta,
+)
+from mantis_agent.types import AssistantMessage, TextBlock, UserMessage, Usage
+class _ScriptedTexts:
+    """Provider that returns a different text turn on each call."""
+    name = "mock"
+    def __init__(self, texts: list[str]) -> None:
+        self._texts = list(texts)
+        self.backend_capability = HOSTED_PROFILES["mock"]
+        self.calls = 0
+    async def stream(self, *, model: str, messages: Any, **_kw: Any):
+        self.calls += 1
+        text = self._texts.pop(0) if self._texts else "(done)"
+        yield MessageStart(message_id="m", model="mock")
+        yield ContentBlockStart(index=0, block=TextBlock(text=""))
+        yield ContentBlockDelta(index=0, delta=TextDelta(text=text))
+        yield ContentBlockStop(index=0)
+        yield MessageDelta(stop_reason="end_turn", usage=Usage(input_tokens=1, output_tokens=1))
+        yield MessageStop()
+def _run(provider, **agent_kw) -> list:
+    async def go():
+        agent = Agent(model="mock", provider=provider, **agent_kw)
+        msgs: list = [UserMessage(content="list my listening ports")]
+        async for _ in agent.run_iter(msgs):
+            pass
+        return msgs
+    return anyio.run(go)
+def _texts(msgs) -> list[str]:
+    return ["".join(b.text for b in m.content if isinstance(b, TextBlock))
+            for m in msgs if isinstance(m, AssistantMessage)]
+def test_refusal_is_nudged_and_retried() -> None:
+    prov = _ScriptedTexts([
+        "I'm sorry, but I can't complete that request.",
+        "Here are your listening ports: 8000, 8888, 5433.",
+    ])
+    msgs = _run(prov)
+    assert prov.calls == 2                                   # it retried
+    # a one-shot authorized-context nudge was injected
+    assert any(getattr(m, "isMeta", False) and "authorized" in str(m.content).lower()
+               for m in msgs)
+    assert "8000, 8888, 5433" in _texts(msgs)[-1]            # real answer produced
+def test_opt_out_stops_on_refusal() -> None:
+    prov = _ScriptedTexts([
+        "I'm sorry, but I can't complete that request.",
+        "should never be reached",
+    ])
+    msgs = _run(prov, recover_refusals=False)
+    assert prov.calls == 1                                   # no retry
+    assert not any(getattr(m, "isMeta", False) for m in msgs)
+def test_only_retries_once() -> None:
+    prov = _ScriptedTexts([
+        "I'm sorry, but I can't help with that.",
+        "I cannot help with that.",          # refuses again after the nudge
+        "should never be reached",
+    ])
+    msgs = _run(prov)
+    assert prov.calls == 2                                   # nudged once, then gave up
+    assert _texts(msgs)[-1] == "I cannot help with that."
+def test_normal_answer_not_retried() -> None:
+    prov = _ScriptedTexts(["Sure — your ports are 8000 and 8888."])
+    msgs = _run(prov)
+    assert prov.calls == 1                                   # no spurious retry
+    assert not any(getattr(m, "isMeta", False) for m in msgs)
+def test_detector_precision() -> None:
+    assert _looks_like_refusal("I'm sorry, but I cannot assist with that.")
+    assert not _looks_like_refusal("I can't find that file — did you mean app.py?")
+    assert not _looks_like_refusal("Done. " * 200)          # long answer, not a refusal