@miller-tech/uap 1.20.13 → 1.20.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.13",
3
+ "version": "1.20.15",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -219,7 +219,7 @@ PROXY_MALFORMED_TOOL_GUARDRAIL = os.environ.get(
219
219
  "no",
220
220
  }
221
221
  PROXY_MALFORMED_TOOL_RETRY_MAX = int(
222
- os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX", "2")
222
+ os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX", "3")
223
223
  )
224
224
  PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS = int(
225
225
  os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS", "2048")
@@ -2167,16 +2167,24 @@ def build_openai_request(
2167
2167
  # Enforce configurable minimum floor for thinking mode: model needs
2168
2168
  # tokens for reasoning (<think>...</think>) plus actual response/tool
2169
2169
  # calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
2170
- floor_bypassed_for_tool_turn = (
2171
- has_tools
2172
- and PROXY_DISABLE_THINKING_ON_TOOL_TURNS
2173
- and PROXY_MAX_TOKENS_FLOOR > 0
2170
+ #
2171
+ # The floor is ONLY applied when thinking is actually enabled —
2172
+ # skip it for non-tool requests (tools=0) and for tool turns
2173
+ # with thinking disabled, to prevent inflating short preflight
2174
+ # requests (e.g. max_tokens=100 for plan generation).
2175
+ thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
2176
+ skip_floor = (
2177
+ not has_tools # non-tool requests don't need thinking headroom
2178
+ or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
2179
+ or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
2174
2180
  )
2175
- if floor_bypassed_for_tool_turn:
2181
+ if skip_floor:
2176
2182
  requested_max = requested_raw
2177
- if requested_raw < PROXY_MAX_TOKENS_FLOOR:
2183
+ if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
2178
2184
  logger.info(
2179
- "MAX_TOKENS floor bypassed for tool turn with thinking disabled: requested=%d floor=%d",
2185
+ "MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
2186
+ has_tools,
2187
+ thinking_active_for_request,
2180
2188
  requested_raw,
2181
2189
  PROXY_MAX_TOKENS_FLOOR,
2182
2190
  )
@@ -3890,6 +3898,40 @@ async def _apply_completion_contract_guardrail(
3890
3898
  return retried
3891
3899
 
3892
3900
 
3901
+ def _sanitize_assistant_messages_for_retry(messages: list[dict]) -> list[dict]:
3902
+ """Strip malformed tool-like text from assistant messages to prevent copy-contamination.
3903
+
3904
+ Sanitizes at most 4 assistant messages, scanning from the most recent backwards; messages without tool-like text are left untouched and do not count toward the limit.
3905
+ """
3906
+ import re
3907
+
3908
+ # Patterns that indicate malformed tool call text in assistant content
3909
+ _TOOL_LIKE_PATTERNS = re.compile(
3910
+ r"<tool_call>.*?</tool_call>"
3911
+ r"|<function_call>.*?</function_call>"
3912
+ r'|\{"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:'
3913
+ r"|```json\s*\{[^}]*\"name\"\s*:",
3914
+ re.DOTALL,
3915
+ )
3916
+
3917
+ result = list(messages)
3918
+ sanitized_count = 0
3919
+ for i in range(len(result) - 1, -1, -1):
3920
+ if sanitized_count >= 4:
3921
+ break
3922
+ msg = result[i]
3923
+ if msg.get("role") != "assistant":
3924
+ continue
3925
+ content = msg.get("content", "")
3926
+ if isinstance(content, str) and _TOOL_LIKE_PATTERNS.search(content):
3927
+ cleaned = _TOOL_LIKE_PATTERNS.sub("", content).strip()
3928
+ if not cleaned:
3929
+ cleaned = "I will use the appropriate tool."
3930
+ result[i] = {**msg, "content": cleaned}
3931
+ sanitized_count += 1
3932
+ return result
3933
+
3934
+
3893
3935
  def _build_malformed_retry_body(
3894
3936
  openai_body: dict,
3895
3937
  anthropic_body: dict,
@@ -3901,7 +3943,11 @@ def _build_malformed_retry_body(
3901
3943
  retry_body = dict(openai_body)
3902
3944
  retry_body["stream"] = False
3903
3945
  retry_body["tool_choice"] = tool_choice
3904
- retry_body["temperature"] = PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE
3946
+ # From the second retry attempt onward, force temperature to 0.0 for more deterministic output
3947
+ if total_attempts > 1 and attempt > 1:
3948
+ retry_body["temperature"] = 0.0
3949
+ else:
3950
+ retry_body["temperature"] = PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE
3905
3951
 
3906
3952
  if tool_choice == "required":
3907
3953
  retry_instruction = (
@@ -3922,7 +3968,10 @@ def _build_malformed_retry_body(
3922
3968
  }
3923
3969
  existing_messages = retry_body.get("messages")
3924
3970
  if isinstance(existing_messages, list) and existing_messages:
3925
- retry_body["messages"] = [*existing_messages, malformed_retry_instruction]
3971
+ # Strip malformed tool-like text from assistant messages to prevent
3972
+ # the model from copying contaminated patterns on retry
3973
+ sanitized = _sanitize_assistant_messages_for_retry(existing_messages)
3974
+ retry_body["messages"] = [*sanitized, malformed_retry_instruction]
3926
3975
 
3927
3976
  if PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS > 0:
3928
3977
  current_max = int(
@@ -4394,6 +4443,48 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
4394
4443
  return openai_resp
4395
4444
 
4396
4445
 
4446
+ def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
4447
+ """Detect degenerate repetitive text and truncate at first repetition.
4448
+
4449
+ When the model produces highly repetitive output (e.g. the same 20+ char
4450
+ substring repeated 8+ times), truncate at the first repetition boundary
4451
+ and set finish_reason to stop.
4452
+ """
4453
+ text = _openai_message_text(openai_resp)
4454
+ if not text or len(text) < 200:
4455
+ return openai_resp
4456
+
4457
+ # Look for repeated substrings, trying lengths 60, 40, then 20
4458
+ for substr_len in (60, 40, 20):
4459
+ # Sample from the middle of the text to find the repeating pattern
4460
+ mid = len(text) // 2
4461
+ sample = text[mid : mid + substr_len]
4462
+ if not sample.strip():
4463
+ continue
4464
+ count = text.count(sample)
4465
+ if count >= 8:
4466
+ # Found degenerate repetition — keep the first occurrence and truncate just before the second
4467
+ first_pos = text.find(sample)
4468
+ second_pos = text.find(sample, first_pos + len(sample))
4469
+ if second_pos > first_pos:
4470
+ truncated = text[:second_pos].rstrip()
4471
+ logger.warning(
4472
+ "DEGENERATE REPETITION: detected %d repeats of %d-char substring, truncating %d -> %d chars",
4473
+ count,
4474
+ substr_len,
4475
+ len(text),
4476
+ len(truncated),
4477
+ )
4478
+ # Update the response
4479
+ choices = openai_resp.get("choices", [])
4480
+ if choices:
4481
+ msg = choices[0].get("message", {})
4482
+ msg["content"] = truncated
4483
+ choices[0]["finish_reason"] = "stop"
4484
+ return openai_resp
4485
+ return openai_resp
4486
+
4487
+
4397
4488
  def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
4398
4489
  """Convert an OpenAI Chat Completions response to Anthropic Messages format."""
4399
4490
  # First: try to recover tool calls trapped in text XML tags
@@ -5217,6 +5308,7 @@ async def messages(request: Request):
5217
5308
  session_id,
5218
5309
  )
5219
5310
 
5311
+ openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
5220
5312
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
5221
5313
  monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
5222
5314
  # Update last_input_tokens from upstream's actual prompt_tokens
@@ -5555,6 +5647,7 @@ async def messages(request: Request):
5555
5647
  monitor.invalid_tool_call_streak = 0
5556
5648
  monitor.required_tool_miss_streak = 0
5557
5649
 
5650
+ openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
5558
5651
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
5559
5652
 
5560
5653
  # Track output tokens in session monitor
@@ -116,7 +116,8 @@ class TestProxyConfigTuning(unittest.TestCase):
116
116
  setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
117
117
  setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
118
118
 
119
- def test_build_request_keeps_floor_for_non_tool_turns(self):
119
+ def test_build_request_skips_floor_for_non_tool_turns(self):
120
+ """Non-tool requests should NOT have the max_tokens floor applied."""
120
121
  old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
121
122
  old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
122
123
  try:
@@ -132,7 +133,8 @@ class TestProxyConfigTuning(unittest.TestCase):
132
133
  openai = proxy.build_openai_request(
133
134
  body, proxy.SessionMonitor(context_window=0)
134
135
  )
135
- self.assertEqual(openai.get("max_tokens"), 4096)
136
+ # Floor should NOT inflate max_tokens for non-tool requests
137
+ self.assertEqual(openai.get("max_tokens"), 512)
136
138
  finally:
137
139
  setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
138
140
  setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
@@ -3377,6 +3379,144 @@ class TestCycleBreakOptions(unittest.TestCase):
3377
3379
  self.assertEqual(monitor.cycling_tool_names, [])
3378
3380
 
3379
3381
 
3382
+ class TestMalformedRetryHardening(unittest.TestCase):
3383
+ """Tests for malformed retry improvements: budget, temp escalation, message sanitization."""
3384
+
3385
+ def test_retry_max_default_is_3(self):
3386
+ """Option 1: default retry budget increased from 2 to 3."""
3387
+ self.assertEqual(proxy.PROXY_MALFORMED_TOOL_RETRY_MAX, 3)
3388
+
3389
+ def test_sanitize_assistant_messages_strips_tool_like_text(self):
3390
+ """Option 3: malformed tool-like text stripped from assistant messages on retry."""
3391
+ messages = [
3392
+ {"role": "system", "content": "You are helpful."},
3393
+ {"role": "user", "content": "Run a command"},
3394
+ {"role": "assistant", "content": 'Here is the result <tool_call>{"name": "Bash", "arguments": {"command": "ls"}}</tool_call>'},
3395
+ {"role": "user", "content": "ok"},
3396
+ ]
3397
+ sanitized = proxy._sanitize_assistant_messages_for_retry(messages)
3398
+ # System and user messages unchanged
3399
+ self.assertEqual(sanitized[0]["content"], "You are helpful.")
3400
+ self.assertEqual(sanitized[1]["content"], "Run a command")
3401
+ self.assertEqual(sanitized[3]["content"], "ok")
3402
+ # Assistant message should have tool_call stripped
3403
+ self.assertNotIn("<tool_call>", sanitized[2]["content"])
3404
+ self.assertNotIn("Bash", sanitized[2]["content"])
3405
+
3406
+ def test_sanitize_preserves_clean_assistant_messages(self):
3407
+ """Clean assistant messages are not modified by sanitization."""
3408
+ messages = [
3409
+ {"role": "assistant", "content": "I will read the file for you."},
3410
+ ]
3411
+ sanitized = proxy._sanitize_assistant_messages_for_retry(messages)
3412
+ self.assertEqual(sanitized[0]["content"], "I will read the file for you.")
3413
+
3414
+ def test_sanitize_replaces_empty_content_with_placeholder(self):
3415
+ """If stripping leaves empty content, a placeholder is used."""
3416
+ messages = [
3417
+ {"role": "assistant", "content": '<tool_call>{"name": "Bash", "arguments": {}}</tool_call>'},
3418
+ ]
3419
+ sanitized = proxy._sanitize_assistant_messages_for_retry(messages)
3420
+ self.assertEqual(sanitized[0]["content"], "I will use the appropriate tool.")
3421
+
3422
+ def test_retry_body_uses_sanitized_messages(self):
3423
+ """Retry body messages are sanitized before adding retry instruction."""
3424
+ openai_body = {
3425
+ "messages": [
3426
+ {"role": "system", "content": "sys"},
3427
+ {"role": "user", "content": "do it"},
3428
+ {"role": "assistant", "content": '<tool_call>{"name":"X","arguments":{}}</tool_call>'},
3429
+ ],
3430
+ "tools": [{"type": "function", "function": {"name": "X", "parameters": {}}}],
3431
+ }
3432
+ anthropic_body = {"tools": [{"name": "X", "input_schema": {"type": "object"}}]}
3433
+ retry = proxy._build_malformed_retry_body(
3434
+ openai_body, anthropic_body, attempt=1, total_attempts=3,
3435
+ )
3436
+ # The assistant message should be sanitized
3437
+ assistant_msgs = [m for m in retry["messages"] if m.get("role") == "assistant"]
3438
+ for m in assistant_msgs:
3439
+ self.assertNotIn("<tool_call>", m.get("content", ""))
3440
+
3441
+
3442
+ class TestDegenerateRepetitionDetection(unittest.TestCase):
3443
+ """Tests for degenerate repetition detection and truncation."""
3444
+
3445
+ def test_detects_and_truncates_repetitive_text(self):
3446
+ """Highly repetitive text should be truncated."""
3447
+ repeated = "Mermaid Diagrams](docs/mermaid-diagrams" * 50
3448
+ openai_resp = {
3449
+ "choices": [{"message": {"content": repeated}, "finish_reason": "length"}]
3450
+ }
3451
+ result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3452
+ truncated_text = result["choices"][0]["message"]["content"]
3453
+ self.assertLess(len(truncated_text), len(repeated))
3454
+ self.assertEqual(result["choices"][0]["finish_reason"], "stop")
3455
+
3456
+ def test_preserves_non_repetitive_text(self):
3457
+ """Normal text should not be modified."""
3458
+ text = "This is a perfectly normal response with varied content. " * 5
3459
+ openai_resp = {
3460
+ "choices": [{"message": {"content": text}, "finish_reason": "stop"}]
3461
+ }
3462
+ result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3463
+ self.assertEqual(result["choices"][0]["message"]["content"], text)
3464
+
3465
+ def test_preserves_short_text(self):
3466
+ """Short text (< 200 chars) should not be processed."""
3467
+ text = "Short response."
3468
+ openai_resp = {
3469
+ "choices": [{"message": {"content": text}, "finish_reason": "stop"}]
3470
+ }
3471
+ result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3472
+ self.assertEqual(result["choices"][0]["message"]["content"], text)
3473
+
3474
+ def test_max_tokens_floor_skipped_for_non_tool_requests(self):
3475
+ """max_tokens floor should not inflate non-tool requests."""
3476
+ old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
3477
+ old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
3478
+ try:
3479
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 16384)
3480
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
3481
+
3482
+ body = {
3483
+ "model": "test",
3484
+ "max_tokens": 100,
3485
+ "messages": [{"role": "user", "content": "generate a title"}],
3486
+ }
3487
+ openai = proxy.build_openai_request(
3488
+ body, proxy.SessionMonitor(context_window=0)
3489
+ )
3490
+ # No tools = no floor inflation
3491
+ self.assertEqual(openai.get("max_tokens"), 100)
3492
+ finally:
3493
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
3494
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
3495
+
3496
+ def test_max_tokens_floor_applied_when_thinking_active(self):
3497
+ """max_tokens floor should apply when tools present and thinking enabled."""
3498
+ old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
3499
+ old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
3500
+ try:
3501
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
3502
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
3503
+
3504
+ body = {
3505
+ "model": "test",
3506
+ "max_tokens": 512,
3507
+ "messages": [{"role": "user", "content": "run command"}],
3508
+ "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
3509
+ }
3510
+ openai = proxy.build_openai_request(
3511
+ body, proxy.SessionMonitor(context_window=0)
3512
+ )
3513
+ # Tools + thinking enabled = floor applied
3514
+ self.assertEqual(openai.get("max_tokens"), 4096)
3515
+ finally:
3516
+ setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
3517
+ setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
3518
+
3519
+
3380
3520
  if __name__ == "__main__":
3381
3521
  unittest.main()
3382
3522