npm - @miller-tech/uap - Versions diffs - 1.20.34 → 1.20.36 - Mend

@miller-tech/uap 1.20.34 → 1.20.36

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.

Files changed (7)

package/README.md +1 -0
package/docs/deployment/QWEN35_LLAMA_CPP.md +15 -6
package/package.json +1 -1
package/tools/agents/config/qwen3.5-enhanced.jinja +187 -0
package/tools/agents/scripts/anthropic_proxy.py +1097 -59
package/tools/agents/scripts/tool-choice-proxy.cjs +12 -0
package/tools/agents/tests/test_anthropic_proxy_streaming.py +193 -8

package/tools/agents/scripts/tool-choice-proxy.cjs CHANGED Viewed

@@ -2,6 +2,18 @@
 /**
  * Layer 1: Intelligent Agent Execution Proxy (v1.0.0)
  *
+ * DEPRECATED: This OpenAI-only shim is superseded by anthropic_proxy.py's
+ * /v1/chat/completions route, which provides the same OpenAI Chat Completions
+ * surface AND runs through the canonical guarded Anthropic pipeline (loop
+ * detection, tool narrowing, malformed-payload retry, context pruning).
+ *
+ * New deployments should point clients at the anthropic-proxy port (default
+ * 4000) and use either /v1/messages (Anthropic) or /v1/chat/completions
+ * (OpenAI passthrough). This script is retained for backward compatibility
+ * with installations that still reference it via `uap tool-calls` tooling
+ * (see src/cli/tool-calls.ts) and the in-container Qwen benchmark
+ * (scripts/benchmarks/run-tbench-qwen35-quick.sh).
+ *
  * Model-agnostic proxy that sits between any OpenAI-compatible client and
  * any OpenAI-compatible inference server. Implements:
  *

package/tools/agents/tests/test_anthropic_proxy_streaming.py CHANGED Viewed

@@ -2140,8 +2140,12 @@ class TestToolTurnControls(unittest.TestCase):
             }
             openai = proxy.build_openai_request(body, monitor)
-            self.assertNotIn("tools", openai)
-            self.assertNotIn("tool_choice", openai)
+            # Finalize turn keeps tools available but switches tool_choice to
+            # 'auto' so the model can complete with a tool call or summarise.
+            # Earlier behaviour stripped tools entirely, which caused Anthropic
+            # clients to see end_turn with no action and halt.
+            self.assertIn("tools", openai)
+            self.assertEqual(openai.get("tool_choice"), "auto")
             self.assertEqual(monitor.tool_turn_phase, "finalize")
             self.assertTrue(monitor.finalize_turn_active)
         finally:
@@ -2229,7 +2233,7 @@ class TestToolTurnControls(unittest.TestCase):
         finally:
             setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
-    def test_state_machine_finalize_temporarily_disables_tools(self):
+    def test_state_machine_finalize_keeps_tools_with_auto_choice(self):
         old_state = getattr(proxy, "PROXY_TOOL_STATE_MACHINE")
         old_min_msgs = getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES")
         old_stagnation = getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD")
@@ -2293,8 +2297,10 @@ class TestToolTurnControls(unittest.TestCase):
             }
             openai = proxy.build_openai_request(body, monitor)
-            self.assertNotIn("tools", openai)
-            self.assertNotIn("tool_choice", openai)
+            # Finalize keeps tools + tool_choice=auto so the model can either
+            # complete with a tool call or emit a plain-text summary.
+            self.assertIn("tools", openai)
+            self.assertEqual(openai.get("tool_choice"), "auto")
         finally:
             setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
             setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
@@ -3512,28 +3518,78 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
             setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
     def test_max_tokens_floor_applied_when_thinking_active(self):
-        """max_tokens floor should apply when tools present and thinking enabled."""
+        """Floor applies on non-preflight tool turns with thinking enabled."""
         old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
         old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
         try:
             setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
             setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
+            # max_tokens=1536 is above SMALL_PREFLIGHT_THRESHOLD (1024), so the
+            # request does NOT take the preflight carveout and the regular
+            # floor path applies. Small-preflight bypass is covered separately
+            # in test_max_tokens_floor_bypassed_for_small_preflight.
             body = {
                 "model": "test",
-                "max_tokens": 512,
+                "max_tokens": 1536,
                 "messages": [{"role": "user", "content": "run command"}],
                 "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
             }
             openai = proxy.build_openai_request(
                 body, proxy.SessionMonitor(context_window=0)
             )
-            # Tools + thinking enabled = floor applied
             self.assertEqual(openai.get("max_tokens"), 4096)
         finally:
             setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
             setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
+    def test_max_tokens_floor_bypassed_for_small_preflight(self):
+        """Small preflight requests (max_tokens <= SMALL_PREFLIGHT_THRESHOLD)
+        bypass the big floor and instead get THINKING_MIN_FOR_TOOLS=2048
+        bump so Qwen's mandatory thinking has room before the tool call."""
+        old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
+        old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
+        try:
+            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
+            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
+            body = {
+                "model": "test",
+                "max_tokens": 512,
+                "messages": [{"role": "user", "content": "run command"}],
+                "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
+            }
+            openai = proxy.build_openai_request(
+                body, proxy.SessionMonitor(context_window=0)
+            )
+            self.assertEqual(openai.get("max_tokens"), 2048)
+        finally:
+            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
+            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
+    def test_max_tokens_true_preflight_left_alone(self):
+        """True preflight requests (max_tokens <= 16) are not inflated, even
+        with tools present, so plan-generation latency stays low."""
+        old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
+        old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
+        try:
+            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
+            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
+            body = {
+                "model": "test",
+                "max_tokens": 1,
+                "messages": [{"role": "user", "content": "ping"}],
+                "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
+            }
+            openai = proxy.build_openai_request(
+                body, proxy.SessionMonitor(context_window=0)
+            )
+            self.assertEqual(openai.get("max_tokens"), 1)
+        finally:
+            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
+            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
 class TestGenerationHangRecovery(unittest.TestCase):
     """Tests for generation hang recovery: timeouts, slot hang detection."""
@@ -4818,3 +4874,132 @@ class TestUpstream503Resilience(unittest.TestCase):
         """Does not match 200 even with loading text."""
         resp = httpx.Response(200, text='{"status":"loading model"}')
         self.assertFalse(proxy._is_loading_model_503(resp))
+class TestOpenAIPassthroughConversion(unittest.TestCase):
+    """Tests for the /v1/chat/completions OpenAI passthrough route.
+    The route converts OpenAI Chat Completions requests to Anthropic
+    Messages, runs the full guarded Anthropic pipeline, then converts the
+    response back to OpenAI shape. This exercises the pure conversion
+    helpers (openai_to_anthropic_request, anthropic_to_openai_response) in
+    isolation so a regression in the dual-interface surface is caught
+    without needing a live FastAPI client."""
+    def test_openai_to_anthropic_request_preserves_user_and_assistant_text(self):
+        """User and assistant text messages survive the OpenAI->Anthropic
+        conversion with the expected role + content shape."""
+        openai_body = {
+            "model": "qwen35",
+            "max_tokens": 1024,
+            "messages": [
+                {"role": "system", "content": "you are helpful"},
+                {"role": "user", "content": "hello"},
+                {"role": "assistant", "content": "hi there"},
+                {"role": "user", "content": "thanks"},
+            ],
+        }
+        anthropic_body = proxy.openai_to_anthropic_request(openai_body)
+        self.assertEqual(anthropic_body.get("model"), "qwen35")
+        self.assertEqual(anthropic_body.get("max_tokens"), 1024)
+        # System collapses into a top-level 'system' field
+        self.assertIn("system", anthropic_body)
+        # Non-system messages preserved in order
+        msgs = anthropic_body.get("messages", [])
+        self.assertEqual(len(msgs), 3)
+        self.assertEqual(msgs[0]["role"], "user")
+        self.assertEqual(msgs[1]["role"], "assistant")
+        self.assertEqual(msgs[2]["role"], "user")
+    def test_openai_to_anthropic_request_converts_tool_response(self):
+        """OpenAI 'role: tool' messages become Anthropic user messages with
+        a tool_result content block — required so the guarded pipeline can
+        track tool history across turns."""
+        openai_body = {
+            "model": "test",
+            "messages": [
+                {"role": "user", "content": "run pwd"},
+                {
+                    "role": "assistant",
+                    "content": None,
+                    "tool_calls": [
+                        {
+                            "id": "call_1",
+                            "type": "function",
+                            "function": {"name": "Bash", "arguments": '{"command": "pwd"}'},
+                        }
+                    ],
+                },
+                {"role": "tool", "tool_call_id": "call_1", "content": "/home/user"},
+            ],
+        }
+        anthropic_body = proxy.openai_to_anthropic_request(openai_body)
+        msgs = anthropic_body.get("messages", [])
+        # Last message is the tool result, encoded as Anthropic user/tool_result
+        tool_result_msg = msgs[-1]
+        self.assertEqual(tool_result_msg["role"], "user")
+        blocks = tool_result_msg["content"]
+        self.assertEqual(len(blocks), 1)
+        self.assertEqual(blocks[0]["type"], "tool_result")
+        self.assertEqual(blocks[0]["tool_use_id"], "call_1")
+        self.assertEqual(blocks[0]["content"], "/home/user")
+    def test_anthropic_to_openai_response_text_only(self):
+        """A plain-text Anthropic response becomes OpenAI choices[0] with
+        finish_reason='stop' and a string content body."""
+        anthropic_resp = {
+            "id": "msg_test_1",
+            "model": "qwen35",
+            "content": [{"type": "text", "text": "the answer is 42"}],
+            "stop_reason": "end_turn",
+            "usage": {"input_tokens": 10, "output_tokens": 5},
+        }
+        openai_resp = proxy.anthropic_to_openai_response(anthropic_resp)
+        self.assertEqual(openai_resp["object"], "chat.completion")
+        self.assertEqual(openai_resp["model"], "qwen35")
+        self.assertEqual(len(openai_resp["choices"]), 1)
+        choice = openai_resp["choices"][0]
+        self.assertEqual(choice["finish_reason"], "stop")
+        self.assertEqual(choice["message"]["role"], "assistant")
+        self.assertEqual(choice["message"]["content"], "the answer is 42")
+        self.assertNotIn("tool_calls", choice["message"])
+        # Usage is re-shaped to OpenAI conventions
+        self.assertEqual(openai_resp["usage"]["prompt_tokens"], 10)
+        self.assertEqual(openai_resp["usage"]["completion_tokens"], 5)
+        self.assertEqual(openai_resp["usage"]["total_tokens"], 15)
+    def test_anthropic_to_openai_response_tool_use_yields_tool_calls(self):
+        """An Anthropic response with a tool_use content block becomes an
+        OpenAI choice with finish_reason='tool_calls' and a tool_calls array
+        carrying the JSON-stringified arguments — the canonical OpenAI shape
+        clients like Forge expect."""
+        anthropic_resp = {
+            "id": "msg_tool_1",
+            "model": "qwen35",
+            "content": [
+                {
+                    "type": "tool_use",
+                    "id": "toolu_xyz",
+                    "name": "Bash",
+                    "input": {"command": "pwd"},
+                }
+            ],
+            "stop_reason": "tool_use",
+            "usage": {"input_tokens": 20, "output_tokens": 8},
+        }
+        openai_resp = proxy.anthropic_to_openai_response(anthropic_resp)
+        choice = openai_resp["choices"][0]
+        self.assertEqual(choice["finish_reason"], "tool_calls")
+        msg = choice["message"]
+        self.assertIsNone(msg["content"])  # No text emitted
+        self.assertEqual(len(msg["tool_calls"]), 1)
+        tc = msg["tool_calls"][0]
+        self.assertEqual(tc["type"], "function")
+        self.assertEqual(tc["id"], "toolu_xyz")
+        self.assertEqual(tc["function"]["name"], "Bash")
+        # Arguments are JSON-stringified per OpenAI spec
+        self.assertEqual(json.loads(tc["function"]["arguments"]), {"command": "pwd"})