@miller-tech/uap 1.20.34 → 1.20.35

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -2140,8 +2140,12 @@ class TestToolTurnControls(unittest.TestCase):
2140
2140
  }
2141
2141
 
2142
2142
  openai = proxy.build_openai_request(body, monitor)
2143
- self.assertNotIn("tools", openai)
2144
- self.assertNotIn("tool_choice", openai)
2143
+ # Finalize turn keeps tools available but switches tool_choice to
2144
+ # 'auto' so the model can complete with a tool call or summarise.
2145
+ # Earlier behaviour stripped tools entirely, which caused Anthropic
2146
+ # clients to see end_turn with no action and halt.
2147
+ self.assertIn("tools", openai)
2148
+ self.assertEqual(openai.get("tool_choice"), "auto")
2145
2149
  self.assertEqual(monitor.tool_turn_phase, "finalize")
2146
2150
  self.assertTrue(monitor.finalize_turn_active)
2147
2151
  finally:
@@ -2229,7 +2233,7 @@ class TestToolTurnControls(unittest.TestCase):
2229
2233
  finally:
2230
2234
  setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
2231
2235
 
2232
- def test_state_machine_finalize_temporarily_disables_tools(self):
2236
+ def test_state_machine_finalize_keeps_tools_with_auto_choice(self):
2233
2237
  old_state = getattr(proxy, "PROXY_TOOL_STATE_MACHINE")
2234
2238
  old_min_msgs = getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES")
2235
2239
  old_stagnation = getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD")
@@ -2293,8 +2297,10 @@ class TestToolTurnControls(unittest.TestCase):
2293
2297
  }
2294
2298
 
2295
2299
  openai = proxy.build_openai_request(body, monitor)
2296
- self.assertNotIn("tools", openai)
2297
- self.assertNotIn("tool_choice", openai)
2300
+ # Finalize keeps tools + tool_choice=auto so the model can either
2301
+ # complete with a tool call or emit a plain-text summary.
2302
+ self.assertIn("tools", openai)
2303
+ self.assertEqual(openai.get("tool_choice"), "auto")
2298
2304
  finally:
2299
2305
  setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
2300
2306
  setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
@@ -3512,28 +3518,78 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
3512
3518
  setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
3513
3519
 
3514
3520
  def test_max_tokens_floor_applied_when_thinking_active(self):
3515
- """max_tokens floor should apply when tools present and thinking enabled."""
3521
+ """Floor applies on non-preflight tool turns with thinking enabled."""
3516
3522
  old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
3517
3523
  old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
3518
3524
  try:
3519
3525
  setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
3520
3526
  setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
3521
3527
 
3528
+ # max_tokens=1536 is above SMALL_PREFLIGHT_THRESHOLD (1024), so the
3529
+ # request does NOT take the preflight carveout and the regular
3530
+ # floor path applies. Small-preflight bypass is covered separately
3531
+ # in test_max_tokens_floor_bypassed_for_small_preflight.
3522
3532
  body = {
3523
3533
  "model": "test",
3524
- "max_tokens": 512,
3534
+ "max_tokens": 1536,
3525
3535
  "messages": [{"role": "user", "content": "run command"}],
3526
3536
  "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
3527
3537
  }
3528
3538
  openai = proxy.build_openai_request(
3529
3539
  body, proxy.SessionMonitor(context_window=0)
3530
3540
  )
3531
- # Tools + thinking enabled = floor applied
3532
3541
  self.assertEqual(openai.get("max_tokens"), 4096)
3533
3542
  finally:
3534
3543
  setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
3535
3544
  setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
3536
3545
 
3546
+ def test_max_tokens_floor_bypassed_for_small_preflight(self):
3547
+ """Small preflight requests (max_tokens <= SMALL_PREFLIGHT_THRESHOLD)
3548
+ bypass the big floor and instead get THINKING_MIN_FOR_TOOLS=2048
3549
+ bump so Qwen's mandatory thinking has room before the tool call."""
3550
+ old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
3551
+ old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
3552
+ try:
3553
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
3554
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
3555
+
3556
+ body = {
3557
+ "model": "test",
3558
+ "max_tokens": 512,
3559
+ "messages": [{"role": "user", "content": "run command"}],
3560
+ "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
3561
+ }
3562
+ openai = proxy.build_openai_request(
3563
+ body, proxy.SessionMonitor(context_window=0)
3564
+ )
3565
+ self.assertEqual(openai.get("max_tokens"), 2048)
3566
+ finally:
3567
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
3568
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
3569
+
3570
+ def test_max_tokens_true_preflight_left_alone(self):
3571
+ """True preflight requests (max_tokens <= 16) are not inflated, even
3572
+ with tools present, so plan-generation latency stays low."""
3573
+ old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
3574
+ old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
3575
+ try:
3576
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
3577
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
3578
+
3579
+ body = {
3580
+ "model": "test",
3581
+ "max_tokens": 1,
3582
+ "messages": [{"role": "user", "content": "ping"}],
3583
+ "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
3584
+ }
3585
+ openai = proxy.build_openai_request(
3586
+ body, proxy.SessionMonitor(context_window=0)
3587
+ )
3588
+ self.assertEqual(openai.get("max_tokens"), 1)
3589
+ finally:
3590
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
3591
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
3592
+
3537
3593
 
3538
3594
  class TestGenerationHangRecovery(unittest.TestCase):
3539
3595
  """Tests for generation hang recovery: timeouts, slot hang detection."""