@miller-tech/uap 1.20.22 → 1.20.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -166,6 +166,9 @@ PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
|
166
166
|
PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
|
|
167
167
|
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "1")
|
|
168
168
|
)
|
|
169
|
+
PROXY_COMPLETION_RECOVERY_MAX = int(
|
|
170
|
+
os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
|
|
171
|
+
)
|
|
169
172
|
PROXY_CLIENT_RATE_WINDOW_SECS = int(
|
|
170
173
|
os.environ.get("PROXY_CLIENT_RATE_WINDOW_SECS", "60")
|
|
171
174
|
)
|
|
@@ -852,7 +855,9 @@ class SessionMonitor:
|
|
|
852
855
|
def update_completion_state(self, anthropic_body: dict, has_tool_results: bool):
|
|
853
856
|
self.completion_required = _should_enforce_completion_contract(anthropic_body)
|
|
854
857
|
self.completion_progress_signals = _count_completion_progress_signals(anthropic_body)
|
|
855
|
-
blockers = _completion_blockers(
|
|
858
|
+
blockers = _completion_blockers(
|
|
859
|
+
anthropic_body, has_tool_results, phase=self.tool_turn_phase
|
|
860
|
+
)
|
|
856
861
|
self.completion_blockers = blockers
|
|
857
862
|
self.completion_pending = self.completion_required and bool(blockers)
|
|
858
863
|
self.completion_verified = self.completion_required and not blockers
|
|
@@ -1860,7 +1865,9 @@ def _should_enforce_completion_contract(anthropic_body: dict) -> bool:
|
|
|
1860
1865
|
return _conversation_has_tool_results(anthropic_body) or _count_completion_progress_signals(anthropic_body) > 0
|
|
1861
1866
|
|
|
1862
1867
|
|
|
1863
|
-
def _completion_blockers(
|
|
1868
|
+
def _completion_blockers(
|
|
1869
|
+
anthropic_body: dict, has_tool_results: bool, phase: str = ""
|
|
1870
|
+
) -> list[str]:
|
|
1864
1871
|
blockers: list[str] = []
|
|
1865
1872
|
progress = _count_completion_progress_signals(anthropic_body)
|
|
1866
1873
|
if progress <= 0:
|
|
@@ -1871,7 +1878,10 @@ def _completion_blockers(anthropic_body: dict, has_tool_results: bool) -> list[s
|
|
|
1871
1878
|
if last_user_has_result:
|
|
1872
1879
|
blockers.append("awaiting_post_tool_followup")
|
|
1873
1880
|
elif _last_assistant_was_text_only(anthropic_body):
|
|
1874
|
-
|
|
1881
|
+
# Option 2: Suppress during finalize — text-only is expected behavior
|
|
1882
|
+
# for finalize turns, so blocking on it causes infinite ping-pong.
|
|
1883
|
+
if phase != "finalize":
|
|
1884
|
+
blockers.append("text_only_after_tool_results")
|
|
1875
1885
|
|
|
1876
1886
|
return blockers
|
|
1877
1887
|
|
|
@@ -2046,14 +2056,27 @@ def _resolve_state_machine_tool_choice(
|
|
|
2046
2056
|
last_user_has_tool_result: bool,
|
|
2047
2057
|
) -> tuple[str | None, str]:
|
|
2048
2058
|
if monitor.tool_turn_phase == "finalize" and monitor.completion_pending:
|
|
2059
|
+
# Option 1: Cap recovery attempts to prevent infinite finalize↔review ping-pong
|
|
2060
|
+
if monitor.completion_recovery_attempts >= PROXY_COMPLETION_RECOVERY_MAX:
|
|
2061
|
+
logger.warning(
|
|
2062
|
+
"TOOL STATE MACHINE: completion recovery exhausted (attempts=%d max=%d), "
|
|
2063
|
+
"proceeding with finalize despite blockers=%s",
|
|
2064
|
+
monitor.completion_recovery_attempts,
|
|
2065
|
+
PROXY_COMPLETION_RECOVERY_MAX,
|
|
2066
|
+
",".join(monitor.completion_blockers),
|
|
2067
|
+
)
|
|
2068
|
+
monitor.completion_pending = False
|
|
2069
|
+
monitor.completion_blockers = []
|
|
2070
|
+
return None, "completion_recovery_exhausted"
|
|
2049
2071
|
monitor.note_completion_recovery()
|
|
2050
2072
|
monitor.set_tool_turn_phase("review", reason="completion_pending")
|
|
2051
2073
|
monitor.tool_state_auto_budget_remaining = max(1, PROXY_TOOL_STATE_AUTO_BUDGET)
|
|
2052
2074
|
monitor.tool_state_forced_budget_remaining = max(1, PROXY_TOOL_STATE_FORCED_BUDGET // 2)
|
|
2053
2075
|
logger.warning(
|
|
2054
|
-
"TOOL STATE MACHINE: finalize blocked by completion contract (blockers=%s attempts=%d)",
|
|
2076
|
+
"TOOL STATE MACHINE: finalize blocked by completion contract (blockers=%s attempts=%d/%d)",
|
|
2055
2077
|
",".join(monitor.completion_blockers),
|
|
2056
2078
|
monitor.completion_recovery_attempts,
|
|
2079
|
+
PROXY_COMPLETION_RECOVERY_MAX,
|
|
2057
2080
|
)
|
|
2058
2081
|
return "auto", "completion_pending"
|
|
2059
2082
|
|
|
@@ -4197,6 +4220,11 @@ def _build_malformed_retry_body(
|
|
|
4197
4220
|
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
4198
4221
|
retry_body["enable_thinking"] = False
|
|
4199
4222
|
|
|
4223
|
+
# Option 3: Proactively strip grammar from retry when tools are present and
|
|
4224
|
+
# grammar+tools is known to be incompatible. Prevents the 400 error
|
|
4225
|
+
# ("Cannot use custom grammar constraints with tools") on retry attempts.
|
|
4226
|
+
if retry_body.get("tools") and not TOOL_CALL_GRAMMAR_TOOLS_COMPATIBLE:
|
|
4227
|
+
retry_body.pop("grammar", None)
|
|
4200
4228
|
_apply_tool_call_grammar(retry_body, tool_choice=tool_choice)
|
|
4201
4229
|
|
|
4202
4230
|
if retry_hint:
|
|
@@ -4553,6 +4581,34 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4553
4581
|
)
|
|
4554
4582
|
current_issue = retry_issue
|
|
4555
4583
|
|
|
4584
|
+
# Option 2 (PR #154): When retries exhaust during review phase, reset to
|
|
4585
|
+
# bootstrap instead of returning guardrail fallback. This re-enables all
|
|
4586
|
+
# tools (including previously excluded cycling ones) and gives the model
|
|
4587
|
+
# a clean shot. The cycle detector will catch re-cycling if it recurs.
|
|
4588
|
+
if monitor.tool_turn_phase == "review":
|
|
4589
|
+
logger.warning(
|
|
4590
|
+
"TOOL RESPONSE review-phase reset: session=%s retries exhausted in review "
|
|
4591
|
+
"(kind=%s malformed=%d), resetting to bootstrap for fresh attempt",
|
|
4592
|
+
session_id,
|
|
4593
|
+
current_issue.kind or issue.kind,
|
|
4594
|
+
monitor.malformed_tool_streak,
|
|
4595
|
+
)
|
|
4596
|
+
monitor.reset_tool_turn_state(reason="review_retry_exhausted")
|
|
4597
|
+
monitor.malformed_tool_streak = 0
|
|
4598
|
+
monitor.invalid_tool_call_streak = 0
|
|
4599
|
+
# Return the best response we have — even if degraded — to keep
|
|
4600
|
+
# the conversation moving rather than returning a guardrail stub.
|
|
4601
|
+
degraded_text = _sanitize_tool_call_apology_text(
|
|
4602
|
+
_openai_message_text(working_resp)
|
|
4603
|
+
).strip()
|
|
4604
|
+
if degraded_text and not _looks_malformed_tool_payload(degraded_text):
|
|
4605
|
+
return _build_safe_text_openai_response(
|
|
4606
|
+
working_resp, degraded_text, finish_reason="tool_calls",
|
|
4607
|
+
)
|
|
4608
|
+
return _build_clean_guardrail_openai_response(
|
|
4609
|
+
working_resp, finish_reason="tool_calls",
|
|
4610
|
+
)
|
|
4611
|
+
|
|
4556
4612
|
logger.error(
|
|
4557
4613
|
"TOOL RESPONSE issue persisted after retries (session=%s kind=%s malformed=%d invalid=%d required_miss=%d); returning clean guardrail response",
|
|
4558
4614
|
session_id,
|
|
@@ -4044,3 +4044,179 @@ class TestSpecModeLeakMarkers(unittest.TestCase):
|
|
|
4044
4044
|
"""_contains_system_prompt_leak detects leaks inside list values."""
|
|
4045
4045
|
value = {"patterns": ["**Spec mode is active. The user indicated"]}
|
|
4046
4046
|
self.assertTrue(proxy._contains_system_prompt_leak(value))
|
|
4047
|
+
|
|
4048
|
+
|
|
4049
|
+
class TestFinalizePingPongFix(unittest.TestCase):
    """Tests for the review↔finalize ping-pong infinite loop fix (PR #153).

    Covers three independent mitigations:

    * Option 1 - ``_resolve_state_machine_tool_choice`` stops demoting
      finalize to review once ``PROXY_COMPLETION_RECOVERY_MAX`` attempts
      have been used.
    * Option 2 - ``_completion_blockers`` does not report
      ``text_only_after_tool_results`` when ``phase == "finalize"``.
    * Option 3 - ``_build_malformed_retry_body`` drops ``grammar`` from the
      retry body when tools are present and grammar+tools is incompatible.
    """

    def _make_monitor(self):
        """Return a fresh ``SessionMonitor`` already moved to the finalize phase."""
        m = proxy.SessionMonitor()
        m.set_tool_turn_phase("finalize", reason="test")
        return m

    def _override(self, name, value):
        """Set ``proxy.<name>`` for this test only.

        The previous value is restored through ``addCleanup`` so the module
        global is put back even when the test fails part-way.
        """
        self.addCleanup(setattr, proxy, name, getattr(proxy, name))
        setattr(proxy, name, value)

    @staticmethod
    def _text_only_after_tool_body():
        """Conversation whose last assistant turn is plain text after a tool result."""
        return {
            "messages": [
                {"role": "user", "content": "do stuff"},
                {"role": "assistant", "content": [{"type": "tool_use", "id": "t1", "name": "Bash", "input": {}}]},
                {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "t1", "content": "ok"}]},
                {"role": "assistant", "content": "All done."},
                {"role": "user", "content": "thanks"},
            ],
        }

    @staticmethod
    def _retry_bodies(grammar):
        """Return ``(openai_body, anthropic_body)`` fixtures for the retry-body tests."""
        openai_body = {
            "messages": [{"role": "user", "content": "test"}],
            "tools": [{"type": "function", "function": {"name": "Read", "parameters": {}}}],
            "grammar": grammar,
            "stream": True,
            "max_tokens": 8192,
        }
        anthropic_body = {
            "messages": [{"role": "user", "content": "test"}],
            "tools": [{"name": "Read", "input_schema": {"type": "object"}}],
        }
        return openai_body, anthropic_body

    def test_completion_recovery_cap_breaks_loop(self):
        """Option 1: After PROXY_COMPLETION_RECOVERY_MAX attempts, finalize proceeds."""
        m = self._make_monitor()
        m.completion_pending = True
        m.completion_blockers = ["no_progress_evidence", "text_only_after_tool_results"]
        m.completion_recovery_attempts = proxy.PROXY_COMPLETION_RECOVERY_MAX

        body = {
            "messages": [
                {"role": "user", "content": "hello"},
                {"role": "assistant", "content": "I'll help"},
                {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "t1", "content": "ok"}]},
                {"role": "assistant", "content": "Done."},
            ],
            "tools": [{"name": "Read"}],
        }
        choice, reason = proxy._resolve_state_machine_tool_choice(body, m, True, False)

        # The exhausted branch returns no forced tool choice ...
        self.assertIsNone(choice)
        self.assertEqual(reason, "completion_recovery_exhausted")
        # ... clears the pending completion state ...
        self.assertFalse(m.completion_pending)
        self.assertEqual(m.completion_blockers, [])
        # ... and returns before another recovery is recorded or the phase
        # is demoted, which is what actually breaks the ping-pong.
        self.assertEqual(m.tool_turn_phase, "finalize")
        self.assertEqual(m.completion_recovery_attempts, proxy.PROXY_COMPLETION_RECOVERY_MAX)

    def test_completion_recovery_below_cap_demotes_to_review(self):
        """Below the cap, finalize is still demoted to review."""
        # Pin the cap so an ambient PROXY_COMPLETION_RECOVERY_MAX=0 in the
        # environment cannot turn "0 attempts" into "already exhausted".
        self._override("PROXY_COMPLETION_RECOVERY_MAX", 3)
        m = self._make_monitor()
        m.completion_pending = True
        m.completion_blockers = ["no_progress_evidence"]
        m.completion_recovery_attempts = 0

        body = {
            "messages": [
                {"role": "user", "content": "hello"},
                {"role": "assistant", "content": "text"},
            ],
            "tools": [{"name": "Read"}],
        }
        choice, reason = proxy._resolve_state_machine_tool_choice(body, m, True, False)
        self.assertEqual(reason, "completion_pending")
        self.assertEqual(choice, "auto")
        self.assertEqual(m.tool_turn_phase, "review")

    def test_text_only_blocker_suppressed_during_finalize(self):
        """Option 2: text_only_after_tool_results not reported when phase=finalize."""
        body = self._text_only_after_tool_body()
        blockers_finalize = proxy._completion_blockers(body, True, phase="finalize")
        blockers_normal = proxy._completion_blockers(body, True, phase="act")

        # Baseline: the fixture must trigger the blocker outside finalize,
        # otherwise the suppression check below proves nothing.
        self.assertIn("text_only_after_tool_results", blockers_normal)
        self.assertNotIn("text_only_after_tool_results", blockers_finalize)
        # The phase only gates this one blocker; everything else is unchanged.
        self.assertEqual(
            blockers_finalize,
            [b for b in blockers_normal if b != "text_only_after_tool_results"],
        )

    def test_text_only_blocker_still_fires_in_act_phase(self):
        """Option 2: text_only_after_tool_results still reported in act/review phases."""
        body = self._text_only_after_tool_body()
        for phase in ("act", "review"):
            with self.subTest(phase=phase):
                blockers = proxy._completion_blockers(body, True, phase=phase)
                self.assertIn("text_only_after_tool_results", blockers)

    def test_grammar_stripped_from_retry_when_incompatible(self):
        """Option 3: Grammar is removed from retry when tools+grammar known incompatible."""
        self._override("TOOL_CALL_GRAMMAR_TOOLS_COMPATIBLE", False)
        openai_body, anthropic_body = self._retry_bodies("root ::= ...")

        retry_body = proxy._build_malformed_retry_body(openai_body, anthropic_body)

        self.assertNotIn("grammar", retry_body)
        # Tools must survive: only the grammar constraint is dropped.
        self.assertTrue(retry_body.get("tools"))

    def test_grammar_kept_when_tools_compatible(self):
        """Option 3: Grammar preserved when tools+grammar is compatible."""
        self._override("TOOL_CALL_GRAMMAR_TOOLS_COMPATIBLE", True)
        self._override("PROXY_TOOL_CALL_GRAMMAR", True)
        self._override("TOOL_CALL_GBNF", "root ::= test")
        openai_body, anthropic_body = self._retry_bodies("root ::= test")

        retry_body = proxy._build_malformed_retry_body(openai_body, anthropic_body)

        # When compatible, grammar should be present (applied by _apply_tool_call_grammar)
        self.assertIn("grammar", retry_body)
|
|
4181
|
+
|
|
4182
|
+
|
|
4183
|
+
class TestReviewPhaseBootstrapReset(unittest.TestCase):
    """Tests for bootstrap reset after exhausted retries in review phase (PR #154).

    NOTE(review): these tests replay the monitor mutations by hand instead of
    driving ``_apply_malformed_tool_guardrail``; they pin the state
    ``SessionMonitor`` is expected to reach, not the guardrail's branching.
    """

    def _build_monitor(self, phase, **attrs):
        """Create a ``SessionMonitor`` in *phase*, then apply *attrs* in order."""
        monitor = proxy.SessionMonitor()
        monitor.set_tool_turn_phase(phase, reason="test")
        for attr_name, attr_value in attrs.items():
            setattr(monitor, attr_name, attr_value)
        return monitor

    def _make_monitor_in_review(self):
        """Monitor in review phase with three malformed tool responses recorded."""
        return self._build_monitor(
            "review",
            malformed_tool_streak=3,
            invalid_tool_call_streak=0,
        )

    def _make_monitor_in_act(self):
        """Monitor in act phase with three malformed tool responses recorded."""
        return self._build_monitor("act", malformed_tool_streak=3)

    def test_review_phase_resets_to_bootstrap(self):
        """After retries exhaust in review, monitor resets to bootstrap."""
        monitor = self._make_monitor_in_review()
        self.assertEqual(monitor.tool_turn_phase, "review")
        self.assertEqual(monitor.malformed_tool_streak, 3)

        # Replay the steps taken once retries are exhausted. The phase was
        # asserted to be "review" just above, so no extra guard is needed.
        monitor.reset_tool_turn_state(reason="review_retry_exhausted")
        monitor.malformed_tool_streak = 0
        monitor.invalid_tool_call_streak = 0

        expected_state = (
            ("tool_turn_phase", "bootstrap"),
            ("malformed_tool_streak", 0),
            ("tool_state_stagnation_streak", 0),
            ("cycling_tool_names", []),
        )
        for attr_name, expected_value in expected_state:
            self.assertEqual(getattr(monitor, attr_name), expected_value)

    def test_act_phase_does_not_reset(self):
        """In act phase, retries exhaustion should NOT trigger bootstrap reset."""
        monitor = self._make_monitor_in_act()
        # Only the precondition is checked here: the reset branch is keyed on
        # the review phase, and an act-phase monitor is not in it.
        # NOTE(review): the normal guardrail fallback path itself is not
        # exercised by this test.
        self.assertNotEqual(monitor.tool_turn_phase, "review")
|