@miller-tech/uap 1.20.38 → 1.20.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -6502,6 +6502,15 @@ def _extract_thinking_block(text: str) -> tuple[str | None, str]:
|
|
|
6502
6502
|
present, returns ``(None, text)`` unchanged. Multiple thinking blocks
|
|
6503
6503
|
are concatenated. Trailing whitespace after each block is consumed so
|
|
6504
6504
|
the remaining text starts cleanly with the model's actual answer.
|
|
6505
|
+
|
|
6506
|
+
Truncated / unclosed ``<think>`` blocks (max_tokens cutting off
|
|
6507
|
+
mid-thinking) are also handled: everything from the dangling
|
|
6508
|
+
``<think>`` to end-of-text is treated as partial thinking content,
|
|
6509
|
+
and anything before it is preserved as the body. Without this, the
|
|
6510
|
+
open tag and the model's partial reasoning would leak into the
|
|
6511
|
+
Anthropic-spec ``text`` content block — a 100% Anthropic-compatibility
|
|
6512
|
+
violation since real Anthropic responses never embed ``<think>`` in
|
|
6513
|
+
``text``.
|
|
6505
6514
|
"""
|
|
6506
6515
|
if "<think>" not in text:
|
|
6507
6516
|
return None, text
|
|
@@ -6510,8 +6519,24 @@ def _extract_thinking_block(text: str) -> tuple[str | None, str]:
|
|
|
6510
6519
|
parts.append(m.group(1).strip())
|
|
6511
6520
|
return ""
|
|
6512
6521
|
remaining = _THINKING_BLOCK_RE.sub(collect, text)
|
|
6522
|
+
# After stripping balanced pairs, check for a dangling unclosed
|
|
6523
|
+
# <think>... open tag and treat it as partial thinking content.
|
|
6524
|
+
# First occurrence wins; any further '<think>' substrings in the
|
|
6525
|
+
# captured partial are folded into the same partial block.
|
|
6526
|
+
if "<think>" in remaining:
|
|
6527
|
+
idx = remaining.find("<think>")
|
|
6528
|
+
partial = remaining[idx + len("<think>"):].strip()
|
|
6529
|
+
if partial:
|
|
6530
|
+
parts.append(partial)
|
|
6531
|
+
# rstrip mirrors the balanced regex's \s* consumption after </think>:
|
|
6532
|
+
# whitespace separating body from thinking is structural, not part of
|
|
6533
|
+
# the body.
|
|
6534
|
+
remaining = remaining[:idx].rstrip()
|
|
6513
6535
|
if not parts:
|
|
6514
|
-
|
|
6536
|
+
# Saw "<think>" in original text but no extractable content (e.g.
|
|
6537
|
+
# bare "<think>" alone or "<think></think>"). Return cleaned body
|
|
6538
|
+
# so the open tag does not leak.
|
|
6539
|
+
return None, remaining.lstrip()
|
|
6515
6540
|
return "\n\n".join(p for p in parts if p), remaining.lstrip()
|
|
6516
6541
|
|
|
6517
6542
|
|
|
@@ -8206,14 +8231,25 @@ def _parse_anthropic_sse_to_message(raw: bytes) -> dict | None:
|
|
|
8206
8231
|
|
|
8207
8232
|
@app.get("/v1/models")
async def models():
    """Return the static list of model IDs this proxy advertises.

    The payload is ``{"data": [...]}`` where every entry has the shape
    ``{"id": <model id>, "object": "model"}``. Three Claude IDs (haiku 4.5,
    sonnet 4.6, opus 4.7) are listed for client compatibility, followed by
    the local ``qwen35-a3b-iq4xs`` model.

    This handler only advertises IDs; it does not route anything. Whether a
    request for one of the Claude IDs is forwarded upstream or served
    locally is decided elsewhere (ANTHROPIC_PASSTHROUGH_MODELS /
    DEFAULT_PASSTHROUGH_MODEL_PATTERNS, including the ``__local_only__``
    sentinel) -- NOTE(review): that logic is not visible from this
    function; confirm against the routing code.
    """
    # Order is kept stable: Claude IDs first, then the local backend model.
    advertised_ids = (
        "claude-haiku-4-5-20251001",
        "claude-sonnet-4-6",
        "claude-opus-4-7",
        "qwen35-a3b-iq4xs",
    )
    return {
        "data": [{"id": model_id, "object": "model"} for model_id in advertised_ids]
    }
|
|
@@ -5003,3 +5003,131 @@ class TestOpenAIPassthroughConversion(unittest.TestCase):
|
|
|
5003
5003
|
self.assertEqual(tc["function"]["name"], "Bash")
|
|
5004
5004
|
# Arguments are JSON-stringified per OpenAI spec
|
|
5005
5005
|
self.assertEqual(json.loads(tc["function"]["arguments"]), {"command": "pwd"})
|
|
5006
|
+
|
|
5007
|
+
|
|
5008
|
+
class TestModelsEndpoint(unittest.TestCase):
    """Tests for the /v1/models discovery endpoint.

    Guards the model IDs the proxy advertises so an accidental list rewrite
    that drops a Shannon-required ID (or resurrects a retired one) fails CI
    loudly. Driven by Shannon-keygraph's .env defaults
    (ANTHROPIC_SMALL/MEDIUM/LARGE_MODEL), which expect haiku-4-5,
    sonnet-4-6 and opus-4-7 to be discoverable.

    Only membership is asserted: neither ordering nor the exact set is
    pinned, so adding a new ID does not break these tests.
    """

    # IDs that must always be advertised: the three Shannon-required Claude
    # IDs plus the local model.
    REQUIRED_IDS = (
        "claude-haiku-4-5-20251001",
        "claude-sonnet-4-6",
        "claude-opus-4-7",
        "qwen35-a3b-iq4xs",
    )

    # IDs from the earlier list that must not reappear: dated 4-6 variants
    # and the gpt-5 entries.
    RETIRED_IDS = (
        "claude-opus-4-6-20260101",
        "claude-sonnet-4-6-20250514",
        "claude-opus-4-6-20250616",
        "gpt-5.4",
        "gpt-5.3-codex",
    )

    @staticmethod
    def _fetch_models():
        """Run the async ``proxy.models()`` handler and return its payload."""
        # Local import keeps this class self-contained within the test module.
        import asyncio

        return asyncio.run(proxy.models())

    def _advertised_ids(self):
        """Return the set of advertised IDs, failing clearly if ``data`` is absent."""
        payload = self._fetch_models()
        self.assertIn("data", payload)
        return {entry["id"] for entry in payload["data"]}

    def test_models_endpoint_returns_shannon_canonical_set(self):
        """The advertised list must contain Shannon's canonical Claude IDs
        plus the local model. Order is not asserted (clients shouldn't
        depend on it), but membership is."""
        ids = self._advertised_ids()

        # subTest reports every missing ID instead of stopping at the first.
        for required in self.REQUIRED_IDS:
            with self.subTest(model_id=required):
                self.assertIn(required, ids)

    def test_models_endpoint_drops_stale_4_6_dated_variants(self):
        """Retired entries (dated 4-6 variants and the gpt-5 IDs) must not
        reappear in the advertised list."""
        ids = self._advertised_ids()

        for retired in self.RETIRED_IDS:
            with self.subTest(model_id=retired):
                self.assertNotIn(retired, ids)

    def test_models_endpoint_returns_object_model_for_each_entry(self):
        """Each entry must follow the {id, object: 'model'} shape the
        Anthropic and OpenAI SDKs both expect."""
        payload = self._fetch_models()
        self.assertIn("data", payload)
        entries = payload["data"]

        # Without this guard an empty list would make the loop below pass
        # vacuously and the shape check would verify nothing.
        self.assertTrue(entries, "/v1/models returned no entries")

        for entry in entries:
            with self.subTest(entry=entry):
                self.assertIn("id", entry)
                # .get() turns a missing key into an assertion failure
                # rather than a KeyError.
                self.assertEqual(entry.get("object"), "model")
|
|
5060
|
+
|
|
5061
|
+
|
|
5062
|
+
class TestThinkingBlockExtraction(unittest.TestCase):
    """Behavioural tests for ``proxy._extract_thinking_block``.

    The function returns ``(thinking, body)``. These tests check that the
    body never carries a Qwen ``<think>`` tag, both for balanced blocks and
    for the truncated / unclosed case where max_tokens cuts the model off
    before it emits the closing tag.

    Background: the OK_HAIKU regression observed 2026-05-14, where a
    max_tokens=40 request against a Qwen upstream produced a text block
    that began with the open tag and the model's partial reasoning.
    """

    def test_balanced_single_block_extracts_to_thinking(self):
        # One complete block followed directly by the answer.
        reasoning, answer = proxy._extract_thinking_block(
            "<think>let me reason</think>The answer is 42."
        )
        self.assertEqual(reasoning, "let me reason")
        self.assertEqual(answer, "The answer is 42.")

    def test_balanced_multiple_blocks_concatenate(self):
        # Two complete blocks with prose between and after them.
        reasoning, answer = proxy._extract_thinking_block(
            "<think>step one</think>partial<think>step two</think>final"
        )
        # Block contents are joined by a blank line.
        self.assertEqual(reasoning, "step one\n\nstep two")
        # With both blocks removed, the leftover prose runs together.
        self.assertEqual(answer, "partialfinal")

    def test_no_think_tag_returns_text_unchanged(self):
        original = "Hello world."
        reasoning, answer = proxy._extract_thinking_block(original)
        self.assertIsNone(reasoning)
        self.assertEqual(answer, original)

    def test_unclosed_think_captures_partial_and_strips_open_tag(self):
        """Truncated output: the open tag and the partial reasoning must
        stay out of the text content block."""
        reasoning, answer = proxy._extract_thinking_block(
            "<think>\nHere's a thinking process:\n\n1. Analyze user input"
        )
        # The cut-off reasoning is still reported as thinking content.
        self.assertIsNotNone(reasoning)
        self.assertIn("thinking process", reasoning)
        # Nothing preceded the tag, so the body is empty and tag-free.
        self.assertEqual(answer, "")
        self.assertNotIn("<think>", answer)

    def test_pre_text_then_unclosed_think_preserves_pre_text(self):
        reasoning, answer = proxy._extract_thinking_block(
            "Pre-thinking prose. <think>partial reasoning"
        )
        self.assertEqual(reasoning, "partial reasoning")
        # Prose ahead of the dangling tag survives as the body.
        self.assertEqual(answer, "Pre-thinking prose.")
        self.assertNotIn("<think>", answer)

    def test_bare_open_tag_alone_strips_cleanly(self):
        """Degenerate truncation: the whole response is just the open tag.
        It must be removed from the body rather than shown to the client."""
        reasoning, answer = proxy._extract_thinking_block("<think>")
        # No content after the tag means no thinking block at all.
        self.assertIsNone(reasoning)
        self.assertEqual(answer, "")
        self.assertNotIn("<think>", answer)

    def test_balanced_then_dangling_unclosed_handles_both(self):
        """One complete block, then a second block that is cut off."""
        reasoning, answer = proxy._extract_thinking_block(
            "<think>first thoughts</think>partial answer<think>second thought, cut off mid-way"
        )
        # The complete and the truncated block both contribute.
        self.assertIn("first thoughts", reasoning)
        self.assertIn("second thought, cut off mid-way", reasoning)
        # Only the prose between the two blocks is left as the body.
        self.assertEqual(answer, "partial answer")
        self.assertNotIn("<think>", answer)
|