@miller-tech/uap 1.20.34 → 1.20.36
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/docs/deployment/QWEN35_LLAMA_CPP.md +15 -6
- package/package.json +1 -1
- package/tools/agents/config/qwen3.5-enhanced.jinja +187 -0
- package/tools/agents/scripts/anthropic_proxy.py +1097 -59
- package/tools/agents/scripts/tool-choice-proxy.cjs +12 -0
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +193 -8
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
/**
|
|
3
3
|
* Layer 1: Intelligent Agent Execution Proxy (v1.0.0)
|
|
4
4
|
*
|
|
5
|
+
* DEPRECATED: This OpenAI-only shim is superseded by anthropic_proxy.py's
|
|
6
|
+
* /v1/chat/completions route, which provides the same OpenAI Chat Completions
|
|
7
|
+
* surface AND runs through the canonical guarded Anthropic pipeline (loop
|
|
8
|
+
* detection, tool narrowing, malformed-payload retry, context pruning).
|
|
9
|
+
*
|
|
10
|
+
* New deployments should point clients at the anthropic-proxy port (default
|
|
11
|
+
* 4000) and use either /v1/messages (Anthropic) or /v1/chat/completions
|
|
12
|
+
* (OpenAI passthrough). This script is retained for backward compatibility
|
|
13
|
+
* with installations that still reference it via `uap tool-calls` tooling
|
|
14
|
+
* (see src/cli/tool-calls.ts) and the in-container Qwen benchmark
|
|
15
|
+
* (scripts/benchmarks/run-tbench-qwen35-quick.sh).
|
|
16
|
+
*
|
|
5
17
|
* Model-agnostic proxy that sits between any OpenAI-compatible client and
|
|
6
18
|
* any OpenAI-compatible inference server. Implements:
|
|
7
19
|
*
|
|
@@ -2140,8 +2140,12 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2140
2140
|
}
|
|
2141
2141
|
|
|
2142
2142
|
openai = proxy.build_openai_request(body, monitor)
|
|
2143
|
-
|
|
2144
|
-
|
|
2143
|
+
# Finalize turn keeps tools available but switches tool_choice to
|
|
2144
|
+
# 'auto' so the model can complete with a tool call or summarise.
|
|
2145
|
+
# Earlier behaviour stripped tools entirely, which caused Anthropic
|
|
2146
|
+
# clients to see end_turn with no action and halt.
|
|
2147
|
+
self.assertIn("tools", openai)
|
|
2148
|
+
self.assertEqual(openai.get("tool_choice"), "auto")
|
|
2145
2149
|
self.assertEqual(monitor.tool_turn_phase, "finalize")
|
|
2146
2150
|
self.assertTrue(monitor.finalize_turn_active)
|
|
2147
2151
|
finally:
|
|
@@ -2229,7 +2233,7 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2229
2233
|
finally:
|
|
2230
2234
|
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
2231
2235
|
|
|
2232
|
-
def
|
|
2236
|
+
def test_state_machine_finalize_keeps_tools_with_auto_choice(self):
|
|
2233
2237
|
old_state = getattr(proxy, "PROXY_TOOL_STATE_MACHINE")
|
|
2234
2238
|
old_min_msgs = getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES")
|
|
2235
2239
|
old_stagnation = getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD")
|
|
@@ -2293,8 +2297,10 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2293
2297
|
}
|
|
2294
2298
|
|
|
2295
2299
|
openai = proxy.build_openai_request(body, monitor)
|
|
2296
|
-
|
|
2297
|
-
|
|
2300
|
+
# Finalize keeps tools + tool_choice=auto so the model can either
|
|
2301
|
+
# complete with a tool call or emit a plain-text summary.
|
|
2302
|
+
self.assertIn("tools", openai)
|
|
2303
|
+
self.assertEqual(openai.get("tool_choice"), "auto")
|
|
2298
2304
|
finally:
|
|
2299
2305
|
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
2300
2306
|
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
|
|
@@ -3512,28 +3518,78 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3512
3518
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
3513
3519
|
|
|
3514
3520
|
def test_max_tokens_floor_applied_when_thinking_active(self):
|
|
3515
|
-
"""
|
|
3521
|
+
"""Floor applies on non-preflight tool turns with thinking enabled."""
|
|
3516
3522
|
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
3517
3523
|
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
3518
3524
|
try:
|
|
3519
3525
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
|
|
3520
3526
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
|
|
3521
3527
|
|
|
3528
|
+
# max_tokens=1536 is above SMALL_PREFLIGHT_THRESHOLD (1024), so the
|
|
3529
|
+
# request does NOT take the preflight carveout and the regular
|
|
3530
|
+
# floor path applies. Small-preflight bypass is covered separately
|
|
3531
|
+
# in test_max_tokens_floor_bypassed_for_small_preflight.
|
|
3522
3532
|
body = {
|
|
3523
3533
|
"model": "test",
|
|
3524
|
-
"max_tokens":
|
|
3534
|
+
"max_tokens": 1536,
|
|
3525
3535
|
"messages": [{"role": "user", "content": "run command"}],
|
|
3526
3536
|
"tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
|
|
3527
3537
|
}
|
|
3528
3538
|
openai = proxy.build_openai_request(
|
|
3529
3539
|
body, proxy.SessionMonitor(context_window=0)
|
|
3530
3540
|
)
|
|
3531
|
-
# Tools + thinking enabled = floor applied
|
|
3532
3541
|
self.assertEqual(openai.get("max_tokens"), 4096)
|
|
3533
3542
|
finally:
|
|
3534
3543
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
3535
3544
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
3536
3545
|
|
|
3546
|
+
def test_max_tokens_floor_bypassed_for_small_preflight(self):
|
|
3547
|
+
"""Small preflight requests (max_tokens <= SMALL_PREFLIGHT_THRESHOLD)
|
|
3548
|
+
bypass the big floor and instead get THINKING_MIN_FOR_TOOLS=2048
|
|
3549
|
+
bump so Qwen's mandatory thinking has room before the tool call."""
|
|
3550
|
+
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
3551
|
+
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
3552
|
+
try:
|
|
3553
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
|
|
3554
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
|
|
3555
|
+
|
|
3556
|
+
body = {
|
|
3557
|
+
"model": "test",
|
|
3558
|
+
"max_tokens": 512,
|
|
3559
|
+
"messages": [{"role": "user", "content": "run command"}],
|
|
3560
|
+
"tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
|
|
3561
|
+
}
|
|
3562
|
+
openai = proxy.build_openai_request(
|
|
3563
|
+
body, proxy.SessionMonitor(context_window=0)
|
|
3564
|
+
)
|
|
3565
|
+
self.assertEqual(openai.get("max_tokens"), 2048)
|
|
3566
|
+
finally:
|
|
3567
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
3568
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
3569
|
+
|
|
3570
|
+
def test_max_tokens_true_preflight_left_alone(self):
|
|
3571
|
+
"""True preflight requests (max_tokens <= 16) are not inflated, even
|
|
3572
|
+
with tools present, so plan-generation latency stays low."""
|
|
3573
|
+
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
3574
|
+
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
3575
|
+
try:
|
|
3576
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
|
|
3577
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
|
|
3578
|
+
|
|
3579
|
+
body = {
|
|
3580
|
+
"model": "test",
|
|
3581
|
+
"max_tokens": 1,
|
|
3582
|
+
"messages": [{"role": "user", "content": "ping"}],
|
|
3583
|
+
"tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
|
|
3584
|
+
}
|
|
3585
|
+
openai = proxy.build_openai_request(
|
|
3586
|
+
body, proxy.SessionMonitor(context_window=0)
|
|
3587
|
+
)
|
|
3588
|
+
self.assertEqual(openai.get("max_tokens"), 1)
|
|
3589
|
+
finally:
|
|
3590
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
3591
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
3592
|
+
|
|
3537
3593
|
|
|
3538
3594
|
class TestGenerationHangRecovery(unittest.TestCase):
|
|
3539
3595
|
"""Tests for generation hang recovery: timeouts, slot hang detection."""
|
|
@@ -4818,3 +4874,132 @@ class TestUpstream503Resilience(unittest.TestCase):
|
|
|
4818
4874
|
"""Does not match 200 even with loading text."""
|
|
4819
4875
|
resp = httpx.Response(200, text='{"status":"loading model"}')
|
|
4820
4876
|
self.assertFalse(proxy._is_loading_model_503(resp))
|
|
4877
|
+
|
|
4878
|
+
|
|
4879
|
+
class TestOpenAIPassthroughConversion(unittest.TestCase):
|
|
4880
|
+
"""Tests for the /v1/chat/completions OpenAI passthrough route.
|
|
4881
|
+
|
|
4882
|
+
The route converts OpenAI Chat Completions requests to Anthropic
|
|
4883
|
+
Messages, runs the full guarded Anthropic pipeline, then converts the
|
|
4884
|
+
response back to OpenAI shape. This exercises the pure conversion
|
|
4885
|
+
helpers (openai_to_anthropic_request, anthropic_to_openai_response) in
|
|
4886
|
+
isolation so a regression in the dual-interface surface is caught
|
|
4887
|
+
without needing a live FastAPI client."""
|
|
4888
|
+
|
|
4889
|
+
def test_openai_to_anthropic_request_preserves_user_and_assistant_text(self):
|
|
4890
|
+
"""User and assistant text messages survive the OpenAI->Anthropic
|
|
4891
|
+
conversion with the expected role + content shape."""
|
|
4892
|
+
openai_body = {
|
|
4893
|
+
"model": "qwen35",
|
|
4894
|
+
"max_tokens": 1024,
|
|
4895
|
+
"messages": [
|
|
4896
|
+
{"role": "system", "content": "you are helpful"},
|
|
4897
|
+
{"role": "user", "content": "hello"},
|
|
4898
|
+
{"role": "assistant", "content": "hi there"},
|
|
4899
|
+
{"role": "user", "content": "thanks"},
|
|
4900
|
+
],
|
|
4901
|
+
}
|
|
4902
|
+
anthropic_body = proxy.openai_to_anthropic_request(openai_body)
|
|
4903
|
+
|
|
4904
|
+
self.assertEqual(anthropic_body.get("model"), "qwen35")
|
|
4905
|
+
self.assertEqual(anthropic_body.get("max_tokens"), 1024)
|
|
4906
|
+
# System collapses into a top-level 'system' field
|
|
4907
|
+
self.assertIn("system", anthropic_body)
|
|
4908
|
+
# Non-system messages preserved in order
|
|
4909
|
+
msgs = anthropic_body.get("messages", [])
|
|
4910
|
+
self.assertEqual(len(msgs), 3)
|
|
4911
|
+
self.assertEqual(msgs[0]["role"], "user")
|
|
4912
|
+
self.assertEqual(msgs[1]["role"], "assistant")
|
|
4913
|
+
self.assertEqual(msgs[2]["role"], "user")
|
|
4914
|
+
|
|
4915
|
+
def test_openai_to_anthropic_request_converts_tool_response(self):
|
|
4916
|
+
"""OpenAI 'role: tool' messages become Anthropic user messages with
|
|
4917
|
+
a tool_result content block — required so the guarded pipeline can
|
|
4918
|
+
track tool history across turns."""
|
|
4919
|
+
openai_body = {
|
|
4920
|
+
"model": "test",
|
|
4921
|
+
"messages": [
|
|
4922
|
+
{"role": "user", "content": "run pwd"},
|
|
4923
|
+
{
|
|
4924
|
+
"role": "assistant",
|
|
4925
|
+
"content": None,
|
|
4926
|
+
"tool_calls": [
|
|
4927
|
+
{
|
|
4928
|
+
"id": "call_1",
|
|
4929
|
+
"type": "function",
|
|
4930
|
+
"function": {"name": "Bash", "arguments": '{"command": "pwd"}'},
|
|
4931
|
+
}
|
|
4932
|
+
],
|
|
4933
|
+
},
|
|
4934
|
+
{"role": "tool", "tool_call_id": "call_1", "content": "/home/user"},
|
|
4935
|
+
],
|
|
4936
|
+
}
|
|
4937
|
+
anthropic_body = proxy.openai_to_anthropic_request(openai_body)
|
|
4938
|
+
msgs = anthropic_body.get("messages", [])
|
|
4939
|
+
|
|
4940
|
+
# Last message is the tool result, encoded as Anthropic user/tool_result
|
|
4941
|
+
tool_result_msg = msgs[-1]
|
|
4942
|
+
self.assertEqual(tool_result_msg["role"], "user")
|
|
4943
|
+
blocks = tool_result_msg["content"]
|
|
4944
|
+
self.assertEqual(len(blocks), 1)
|
|
4945
|
+
self.assertEqual(blocks[0]["type"], "tool_result")
|
|
4946
|
+
self.assertEqual(blocks[0]["tool_use_id"], "call_1")
|
|
4947
|
+
self.assertEqual(blocks[0]["content"], "/home/user")
|
|
4948
|
+
|
|
4949
|
+
def test_anthropic_to_openai_response_text_only(self):
|
|
4950
|
+
"""A plain-text Anthropic response becomes OpenAI choices[0] with
|
|
4951
|
+
finish_reason='stop' and a string content body."""
|
|
4952
|
+
anthropic_resp = {
|
|
4953
|
+
"id": "msg_test_1",
|
|
4954
|
+
"model": "qwen35",
|
|
4955
|
+
"content": [{"type": "text", "text": "the answer is 42"}],
|
|
4956
|
+
"stop_reason": "end_turn",
|
|
4957
|
+
"usage": {"input_tokens": 10, "output_tokens": 5},
|
|
4958
|
+
}
|
|
4959
|
+
openai_resp = proxy.anthropic_to_openai_response(anthropic_resp)
|
|
4960
|
+
|
|
4961
|
+
self.assertEqual(openai_resp["object"], "chat.completion")
|
|
4962
|
+
self.assertEqual(openai_resp["model"], "qwen35")
|
|
4963
|
+
self.assertEqual(len(openai_resp["choices"]), 1)
|
|
4964
|
+
choice = openai_resp["choices"][0]
|
|
4965
|
+
self.assertEqual(choice["finish_reason"], "stop")
|
|
4966
|
+
self.assertEqual(choice["message"]["role"], "assistant")
|
|
4967
|
+
self.assertEqual(choice["message"]["content"], "the answer is 42")
|
|
4968
|
+
self.assertNotIn("tool_calls", choice["message"])
|
|
4969
|
+
# Usage is re-shaped to OpenAI conventions
|
|
4970
|
+
self.assertEqual(openai_resp["usage"]["prompt_tokens"], 10)
|
|
4971
|
+
self.assertEqual(openai_resp["usage"]["completion_tokens"], 5)
|
|
4972
|
+
self.assertEqual(openai_resp["usage"]["total_tokens"], 15)
|
|
4973
|
+
|
|
4974
|
+
def test_anthropic_to_openai_response_tool_use_yields_tool_calls(self):
|
|
4975
|
+
"""An Anthropic response with a tool_use content block becomes an
|
|
4976
|
+
OpenAI choice with finish_reason='tool_calls' and a tool_calls array
|
|
4977
|
+
carrying the JSON-stringified arguments — the canonical OpenAI shape
|
|
4978
|
+
clients like Forge expect."""
|
|
4979
|
+
anthropic_resp = {
|
|
4980
|
+
"id": "msg_tool_1",
|
|
4981
|
+
"model": "qwen35",
|
|
4982
|
+
"content": [
|
|
4983
|
+
{
|
|
4984
|
+
"type": "tool_use",
|
|
4985
|
+
"id": "toolu_xyz",
|
|
4986
|
+
"name": "Bash",
|
|
4987
|
+
"input": {"command": "pwd"},
|
|
4988
|
+
}
|
|
4989
|
+
],
|
|
4990
|
+
"stop_reason": "tool_use",
|
|
4991
|
+
"usage": {"input_tokens": 20, "output_tokens": 8},
|
|
4992
|
+
}
|
|
4993
|
+
openai_resp = proxy.anthropic_to_openai_response(anthropic_resp)
|
|
4994
|
+
|
|
4995
|
+
choice = openai_resp["choices"][0]
|
|
4996
|
+
self.assertEqual(choice["finish_reason"], "tool_calls")
|
|
4997
|
+
msg = choice["message"]
|
|
4998
|
+
self.assertIsNone(msg["content"]) # No text emitted
|
|
4999
|
+
self.assertEqual(len(msg["tool_calls"]), 1)
|
|
5000
|
+
tc = msg["tool_calls"][0]
|
|
5001
|
+
self.assertEqual(tc["type"], "function")
|
|
5002
|
+
self.assertEqual(tc["id"], "toolu_xyz")
|
|
5003
|
+
self.assertEqual(tc["function"]["name"], "Bash")
|
|
5004
|
+
# Arguments are JSON-stringified per OpenAI spec
|
|
5005
|
+
self.assertEqual(json.loads(tc["function"]["arguments"]), {"command": "pwd"})
|