@miller-tech/uap 1.20.39 → 1.20.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.39",
3
+ "version": "1.20.41",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -6502,6 +6502,15 @@ def _extract_thinking_block(text: str) -> tuple[str | None, str]:
6502
6502
  present, returns ``(None, text)`` unchanged. Multiple thinking blocks
6503
6503
  are concatenated. Trailing whitespace after each block is consumed so
6504
6504
  the remaining text starts cleanly with the model's actual answer.
6505
+
6506
+ Truncated / unclosed ``<think>`` blocks (max_tokens cutting off
6507
+ mid-thinking) are also handled: everything from the dangling
6508
+ ``<think>`` to end-of-text is treated as partial thinking content,
6509
+ and anything before it is preserved as the body. Without this, the
6510
+ open tag and the model's partial reasoning would leak into the
6511
+ Anthropic-spec ``text`` content block — a clear Anthropic-compatibility
6512
+ violation since real Anthropic responses never embed ``<think>`` in
6513
+ ``text``.
6505
6514
  """
6506
6515
  if "<think>" not in text:
6507
6516
  return None, text
@@ -6510,8 +6519,24 @@ def _extract_thinking_block(text: str) -> tuple[str | None, str]:
6510
6519
  parts.append(m.group(1).strip())
6511
6520
  return ""
6512
6521
  remaining = _THINKING_BLOCK_RE.sub(collect, text)
6522
+ # After stripping balanced pairs, check for a dangling unclosed
6523
+ # <think>... open tag and treat it as partial thinking content.
6524
+ # First occurrence wins; any further '<think>' substrings in the
6525
+ # captured partial are folded into the same partial block.
6526
+ if "<think>" in remaining:
6527
+ idx = remaining.find("<think>")
6528
+ partial = remaining[idx + len("<think>"):].strip()
6529
+ if partial:
6530
+ parts.append(partial)
6531
+ # rstrip mirrors the balanced regex's \s* consumption after </think>:
6532
+ # whitespace separating body from thinking is structural, not part of
6533
+ # the body.
6534
+ remaining = remaining[:idx].rstrip()
6513
6535
  if not parts:
6514
- return None, text
6536
+ # Saw "<think>" in original text but no extractable content (e.g.
6537
+ # bare "<think>" alone or "<think></think>"). Return cleaned body
6538
+ # so the open tag does not leak.
6539
+ return None, remaining.lstrip()
6515
6540
  return "\n\n".join(p for p in parts if p), remaining.lstrip()
6516
6541
 
6517
6542
 
@@ -8225,7 +8250,7 @@ async def models():
8225
8250
  {"id": "claude-haiku-4-5-20251001", "object": "model"},
8226
8251
  {"id": "claude-sonnet-4-6", "object": "model"},
8227
8252
  {"id": "claude-opus-4-7", "object": "model"},
8228
- {"id": "qwen35-a3b-iq4xs", "object": "model"},
8253
+ {"id": "qwen36-27b-iq4xs", "object": "model"},
8229
8254
  ]
8230
8255
  }
8231
8256
 
@@ -5028,9 +5028,13 @@ class TestModelsEndpoint(unittest.TestCase):
5028
5028
  self.assertIn("claude-sonnet-4-6", ids)
5029
5029
  self.assertIn("claude-opus-4-7", ids)
5030
5030
 
5031
- # Local model (kept because requests for it actually route locally
5032
- # even with __local_only__ passthrough sentinel set)
5033
- self.assertIn("qwen35-a3b-iq4xs", ids)
5031
+ # Local model that llama-server actually serves. Updated
5032
+ # 2026-05-15 from qwen35-a3b-iq4xs after the switch from 35B-A3B
5033
+ # MoE to Qwen3.6-27B dense (see project_active_server memory).
5034
+ # Requests for this ID route locally even with __local_only__
5035
+ # passthrough sentinel set.
5036
+ self.assertIn("qwen36-27b-iq4xs", ids)
5037
+ self.assertNotIn("qwen35-a3b-iq4xs", ids)
5034
5038
 
5035
5039
  def test_models_endpoint_drops_stale_4_6_dated_variants(self):
5036
5040
  """The pre-2026-05 list advertised claude-opus-4-6-20260101,
@@ -5057,3 +5061,77 @@ class TestModelsEndpoint(unittest.TestCase):
5057
5061
  for entry in result["data"]:
5058
5062
  self.assertIn("id", entry)
5059
5063
  self.assertEqual(entry["object"], "model")
5064
+
5065
+
5066
+ class TestThinkingBlockExtraction(unittest.TestCase):
5067
+ """Tests for _extract_thinking_block — guarantees Anthropic-spec text
5068
+ content blocks never contain Qwen <think> tags, including the truncated
5069
+ / unclosed case (max_tokens cutting off mid-thinking).
5070
+
5071
+ Reproduces the OK_HAIKU regression observed 2026-05-14: a request with
5072
+ max_tokens=40 against a Qwen upstream produced an Anthropic response
5073
+ whose text block started with `<think>\\nHere's a thinking process:...`
5074
+ because the model never reached `</think>` before being cut off."""
5075
+
5076
+ def test_balanced_single_block_extracts_to_thinking(self):
5077
+ text = "<think>let me reason</think>The answer is 42."
5078
+ thinking, body = proxy._extract_thinking_block(text)
5079
+ self.assertEqual(thinking, "let me reason")
5080
+ self.assertEqual(body, "The answer is 42.")
5081
+
5082
+ def test_balanced_multiple_blocks_concatenate(self):
5083
+ text = "<think>step one</think>partial<think>step two</think>final"
5084
+ thinking, body = proxy._extract_thinking_block(text)
5085
+ self.assertEqual(thinking, "step one\n\nstep two")
5086
+ # Both blocks stripped; body is the residual prose joined.
5087
+ self.assertEqual(body, "partialfinal")
5088
+
5089
+ def test_no_think_tag_returns_text_unchanged(self):
5090
+ text = "Hello world."
5091
+ thinking, body = proxy._extract_thinking_block(text)
5092
+ self.assertIsNone(thinking)
5093
+ self.assertEqual(body, text)
5094
+
5095
+ def test_unclosed_think_captures_partial_and_strips_open_tag(self):
5096
+ """Truncation case (max_tokens cuts mid-thinking). The open
5097
+ <think> tag and partial reasoning MUST NOT leak into the
5098
+ Anthropic-spec text content block."""
5099
+ text = "<think>\nHere's a thinking process:\n\n1. Analyze user input"
5100
+ thinking, body = proxy._extract_thinking_block(text)
5101
+ # Partial reasoning captured as thinking content
5102
+ self.assertIsNotNone(thinking)
5103
+ self.assertIn("thinking process", thinking)
5104
+ # CRITICAL: body must be empty / contain no <think> tag
5105
+ self.assertEqual(body, "")
5106
+ self.assertNotIn("<think>", body)
5107
+
5108
+ def test_pre_text_then_unclosed_think_preserves_pre_text(self):
5109
+ text = "Pre-thinking prose. <think>partial reasoning"
5110
+ thinking, body = proxy._extract_thinking_block(text)
5111
+ self.assertEqual(thinking, "partial reasoning")
5112
+ # Pre-think prose is preserved as body
5113
+ self.assertEqual(body, "Pre-thinking prose.")
5114
+ self.assertNotIn("<think>", body)
5115
+
5116
+ def test_bare_open_tag_alone_strips_cleanly(self):
5117
+ """Edge: response is literally just '<think>' with nothing else
5118
+ (extremely degenerate truncation). The tag must be stripped from
5119
+ the body so it doesn't appear in the client-facing response."""
5120
+ text = "<think>"
5121
+ thinking, body = proxy._extract_thinking_block(text)
5122
+ # No partial content -> no thinking block; body empty, no tag leak
5123
+ self.assertIsNone(thinking)
5124
+ self.assertEqual(body, "")
5125
+ self.assertNotIn("<think>", body)
5126
+
5127
+ def test_balanced_then_dangling_unclosed_handles_both(self):
5128
+ """Multi-step truncation: model emitted one complete <think>
5129
+ block, started a second one, then was cut off."""
5130
+ text = "<think>first thoughts</think>partial answer<think>second thought, cut off mid-way"
5131
+ thinking, body = proxy._extract_thinking_block(text)
5132
+ # Both balanced and unclosed contributions appear
5133
+ self.assertIn("first thoughts", thinking)
5134
+ self.assertIn("second thought, cut off mid-way", thinking)
5135
+ # Body has the prose between the two blocks
5136
+ self.assertEqual(body, "partial answer")
5137
+ self.assertNotIn("<think>", body)