@miller-tech/uap 1.20.11 → 1.20.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -143,7 +143,7 @@ PROXY_TOOL_STATE_MIN_MESSAGES = int(
|
|
|
143
143
|
os.environ.get("PROXY_TOOL_STATE_MIN_MESSAGES", "6")
|
|
144
144
|
)
|
|
145
145
|
PROXY_TOOL_STATE_FORCED_BUDGET = int(
|
|
146
|
-
os.environ.get("PROXY_TOOL_STATE_FORCED_BUDGET", "24")
|
|
146
|
+
os.environ.get("PROXY_TOOL_STATE_FORCED_BUDGET", "12")
|
|
147
147
|
)
|
|
148
148
|
PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
|
|
149
149
|
PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
|
|
@@ -156,7 +156,7 @@ PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
|
156
156
|
os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
|
|
157
157
|
)
|
|
158
158
|
PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
|
|
159
|
-
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "2")
|
|
159
|
+
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "1")
|
|
160
160
|
)
|
|
161
161
|
PROXY_CLIENT_RATE_WINDOW_SECS = int(
|
|
162
162
|
os.environ.get("PROXY_CLIENT_RATE_WINDOW_SECS", "60")
|
|
@@ -628,6 +628,7 @@ class SessionMonitor:
|
|
|
628
628
|
tool_state_transitions: int = 0
|
|
629
629
|
tool_state_review_cycles: int = 0
|
|
630
630
|
last_tool_fingerprint: str = ""
|
|
631
|
+
cycling_tool_names: list = field(default_factory=list)
|
|
631
632
|
finalize_turn_active: bool = False
|
|
632
633
|
completion_required: bool = False
|
|
633
634
|
completion_pending: bool = False
|
|
@@ -832,6 +833,7 @@ class SessionMonitor:
|
|
|
832
833
|
self.tool_state_auto_budget_remaining = 0
|
|
833
834
|
self.tool_state_stagnation_streak = 0
|
|
834
835
|
self.tool_state_review_cycles = 0
|
|
836
|
+
self.cycling_tool_names = []
|
|
835
837
|
self.last_tool_fingerprint = ""
|
|
836
838
|
|
|
837
839
|
def update_completion_state(self, anthropic_body: dict, has_tool_results: bool):
|
|
@@ -1104,7 +1106,10 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
1104
1106
|
|
|
1105
1107
|
|
|
1106
1108
|
def prune_conversation(
|
|
1107
|
-
anthropic_body: dict,
|
|
1109
|
+
anthropic_body: dict,
|
|
1110
|
+
context_window: int,
|
|
1111
|
+
target_fraction: float = 0.65,
|
|
1112
|
+
keep_last: int = 8,
|
|
1108
1113
|
) -> dict:
|
|
1109
1114
|
"""Prune the conversation to fit within the context window.
|
|
1110
1115
|
|
|
@@ -1119,6 +1124,7 @@ def prune_conversation(
|
|
|
1119
1124
|
anthropic_body: The full Anthropic request body
|
|
1120
1125
|
context_window: Maximum context window in tokens
|
|
1121
1126
|
target_fraction: Target utilization after pruning (0.0-1.0)
|
|
1127
|
+
keep_last: Number of recent messages to always keep (default 8)
|
|
1122
1128
|
|
|
1123
1129
|
Returns:
|
|
1124
1130
|
Modified anthropic_body with pruned messages
|
|
@@ -1131,6 +1137,8 @@ def prune_conversation(
|
|
|
1131
1137
|
target_tokens = int(context_window * target_fraction)
|
|
1132
1138
|
|
|
1133
1139
|
# Estimate non-message tokens (system, tools, agentic supplement)
|
|
1140
|
+
# Apply a 1.5x safety factor to account for chat template overhead
|
|
1141
|
+
# and tokenization differences between local estimate and upstream
|
|
1134
1142
|
overhead_tokens = 0
|
|
1135
1143
|
system = anthropic_body.get("system", "")
|
|
1136
1144
|
if isinstance(system, str):
|
|
@@ -1144,6 +1152,7 @@ def prune_conversation(
|
|
|
1144
1152
|
tools = anthropic_body.get("tools", [])
|
|
1145
1153
|
if tools:
|
|
1146
1154
|
overhead_tokens += estimate_tokens(json.dumps(tools))
|
|
1155
|
+
overhead_tokens = int(overhead_tokens * 1.5) # Safety factor for template overhead
|
|
1147
1156
|
|
|
1148
1157
|
# Budget for messages
|
|
1149
1158
|
message_budget = target_tokens - overhead_tokens
|
|
@@ -1152,7 +1161,7 @@ def prune_conversation(
|
|
|
1152
1161
|
return anthropic_body
|
|
1153
1162
|
|
|
1154
1163
|
# Always keep the first user message and the last N messages
|
|
1155
|
-
KEEP_LAST = 8
|
|
1164
|
+
KEEP_LAST = keep_last
|
|
1156
1165
|
protected_head = messages[:1] # First user message
|
|
1157
1166
|
protected_tail = (
|
|
1158
1167
|
messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
|
|
@@ -2046,12 +2055,17 @@ def _resolve_state_machine_tool_choice(
|
|
|
2046
2055
|
monitor.tool_state_forced_budget_remaining = max(
|
|
2047
2056
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2048
2057
|
)
|
|
2058
|
+
# Capture which tools are cycling for narrowing/hint injection
|
|
2059
|
+
window = max(2, PROXY_TOOL_STATE_CYCLE_WINDOW)
|
|
2060
|
+
recent = [fp for fp in monitor.tool_call_history[-window:] if fp]
|
|
2061
|
+
monitor.cycling_tool_names = list(dict.fromkeys(recent))
|
|
2049
2062
|
logger.warning(
|
|
2050
|
-
"TOOL STATE MACHINE: entering review (cycle=%s repeat=%d stagnation=%d cycles=%d)",
|
|
2063
|
+
"TOOL STATE MACHINE: entering review (cycle=%s repeat=%d stagnation=%d cycles=%d cycling_tools=%s)",
|
|
2051
2064
|
cycle_looping,
|
|
2052
2065
|
cycle_repeat,
|
|
2053
2066
|
monitor.tool_state_stagnation_streak,
|
|
2054
2067
|
monitor.tool_state_review_cycles,
|
|
2068
|
+
monitor.cycling_tool_names,
|
|
2055
2069
|
)
|
|
2056
2070
|
return "required", reason
|
|
2057
2071
|
|
|
@@ -2342,6 +2356,49 @@ def build_openai_request(
|
|
|
2342
2356
|
monitor.no_progress_streak = (
|
|
2343
2357
|
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
2344
2358
|
)
|
|
2359
|
+
# Option 1: Inject cycle-break instruction when entering review
|
|
2360
|
+
if (
|
|
2361
|
+
monitor.tool_turn_phase == "review"
|
|
2362
|
+
and state_reason in {"cycle_detected", "stagnation"}
|
|
2363
|
+
and monitor.cycling_tool_names
|
|
2364
|
+
):
|
|
2365
|
+
cycling_names = ", ".join(monitor.cycling_tool_names)
|
|
2366
|
+
cycle_hint = (
|
|
2367
|
+
f"You have been repeatedly calling the same tool(s): {cycling_names}. "
|
|
2368
|
+
"This is not making progress. Use a DIFFERENT tool to advance the task, "
|
|
2369
|
+
"or call a tool that produces your final answer."
|
|
2370
|
+
)
|
|
2371
|
+
messages = openai_body.get("messages", [])
|
|
2372
|
+
messages.append({"role": "user", "content": cycle_hint})
|
|
2373
|
+
openai_body["messages"] = messages
|
|
2374
|
+
logger.warning(
|
|
2375
|
+
"CYCLE BREAK: injected hint about cycling tools: %s",
|
|
2376
|
+
cycling_names,
|
|
2377
|
+
)
|
|
2378
|
+
# Option 2: Narrow tools during review to exclude cycling tools
|
|
2379
|
+
if (
|
|
2380
|
+
monitor.tool_turn_phase == "review"
|
|
2381
|
+
and monitor.cycling_tool_names
|
|
2382
|
+
and "tools" in openai_body
|
|
2383
|
+
):
|
|
2384
|
+
original_count = len(openai_body["tools"])
|
|
2385
|
+
narrowed = [
|
|
2386
|
+
t
|
|
2387
|
+
for t in openai_body["tools"]
|
|
2388
|
+
if t.get("function", {}).get("name") not in monitor.cycling_tool_names
|
|
2389
|
+
]
|
|
2390
|
+
if narrowed:
|
|
2391
|
+
openai_body["tools"] = narrowed
|
|
2392
|
+
logger.warning(
|
|
2393
|
+
"CYCLE BREAK: narrowed tools from %d to %d (excluded %s)",
|
|
2394
|
+
original_count,
|
|
2395
|
+
len(narrowed),
|
|
2396
|
+
monitor.cycling_tool_names,
|
|
2397
|
+
)
|
|
2398
|
+
else:
|
|
2399
|
+
logger.warning(
|
|
2400
|
+
"CYCLE BREAK: cannot narrow tools — all tools are cycling, keeping original set",
|
|
2401
|
+
)
|
|
2345
2402
|
logger.info(
|
|
2346
2403
|
"tool_choice forced to 'required' by TOOL STATE MACHINE (phase=%s reason=%s forced_budget=%d)",
|
|
2347
2404
|
monitor.tool_turn_phase,
|
|
@@ -4962,28 +5019,86 @@ async def messages(request: Request):
|
|
|
4962
5019
|
monitor.log_status()
|
|
4963
5020
|
|
|
4964
5021
|
# --- Option C: Prune conversation if approaching context limit ---
|
|
5022
|
+
# Option 1: Prefer upstream actual token count over local estimate
|
|
4965
5023
|
ctx_window = monitor.context_window
|
|
4966
5024
|
if ctx_window > 0:
|
|
4967
|
-
|
|
5025
|
+
# Use the upstream's actual prompt_tokens if available and higher
|
|
5026
|
+
# than the local estimate (the upstream counts chat template overhead,
|
|
5027
|
+
# tool schema tokenization, etc. that local heuristics miss).
|
|
5028
|
+
effective_tokens = estimated_tokens
|
|
5029
|
+
if monitor.last_input_tokens > estimated_tokens:
|
|
5030
|
+
effective_tokens = monitor.last_input_tokens
|
|
5031
|
+
logger.info(
|
|
5032
|
+
"Using upstream token count %d (local estimate %d) for prune decision",
|
|
5033
|
+
effective_tokens,
|
|
5034
|
+
estimated_tokens,
|
|
5035
|
+
)
|
|
5036
|
+
utilization = effective_tokens / ctx_window
|
|
4968
5037
|
if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
4969
5038
|
logger.warning(
|
|
4970
5039
|
"Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
|
|
4971
5040
|
utilization * 100,
|
|
4972
5041
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
4973
5042
|
)
|
|
5043
|
+
# Option 3: Aggressive pruning at critical utilization
|
|
5044
|
+
target_frac = _resolve_prune_target_fraction()
|
|
5045
|
+
keep_last = 8
|
|
5046
|
+
if utilization >= 0.90:
|
|
5047
|
+
keep_last = 4
|
|
5048
|
+
target_frac = min(target_frac, 0.40)
|
|
5049
|
+
logger.warning(
|
|
5050
|
+
"CRITICAL PRUNE: utilization %.1f%% >= 90%%, using keep_last=%d target=%.0f%%",
|
|
5051
|
+
utilization * 100,
|
|
5052
|
+
keep_last,
|
|
5053
|
+
target_frac * 100,
|
|
5054
|
+
)
|
|
4974
5055
|
body = prune_conversation(
|
|
4975
|
-
body, ctx_window, target_fraction=
|
|
5056
|
+
body, ctx_window, target_fraction=target_frac, keep_last=keep_last
|
|
4976
5057
|
)
|
|
4977
5058
|
monitor.prune_count += 1
|
|
4978
|
-
#
|
|
5059
|
+
# Option 4: Post-prune validation — verify actual reduction
|
|
4979
5060
|
estimated_tokens = estimate_total_tokens(body)
|
|
4980
5061
|
monitor.record_request(estimated_tokens)
|
|
5062
|
+
post_util = estimated_tokens / ctx_window
|
|
4981
5063
|
n_messages = len(body.get("messages", []))
|
|
4982
5064
|
logger.info(
|
|
4983
|
-
"After pruning: ~%d tokens (%d messages)",
|
|
5065
|
+
"After pruning: ~%d tokens (%d messages), utilization %.1f%%",
|
|
4984
5066
|
estimated_tokens,
|
|
4985
5067
|
n_messages,
|
|
5068
|
+
post_util * 100,
|
|
4986
5069
|
)
|
|
5070
|
+
# If still above threshold after first prune, do aggressive second pass
|
|
5071
|
+
if post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
5072
|
+
logger.warning(
|
|
5073
|
+
"POST-PRUNE VALIDATION: still at %.1f%% after prune, doing aggressive pass",
|
|
5074
|
+
post_util * 100,
|
|
5075
|
+
)
|
|
5076
|
+
body = prune_conversation(
|
|
5077
|
+
body, ctx_window, target_fraction=0.35, keep_last=4
|
|
5078
|
+
)
|
|
5079
|
+
monitor.prune_count += 1
|
|
5080
|
+
estimated_tokens = estimate_total_tokens(body)
|
|
5081
|
+
monitor.record_request(estimated_tokens)
|
|
5082
|
+
post_util = estimated_tokens / ctx_window
|
|
5083
|
+
n_messages = len(body.get("messages", []))
|
|
5084
|
+
logger.info(
|
|
5085
|
+
"After aggressive prune: ~%d tokens (%d messages), utilization %.1f%%",
|
|
5086
|
+
estimated_tokens,
|
|
5087
|
+
n_messages,
|
|
5088
|
+
post_util * 100,
|
|
5089
|
+
)
|
|
5090
|
+
# Option 2: Circuit breaker — if 3+ consecutive prunes and still above,
|
|
5091
|
+
# force finalize (drop tools, let model wrap up)
|
|
5092
|
+
if monitor.prune_count >= 3 and post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
5093
|
+
logger.error(
|
|
5094
|
+
"PRUNE CIRCUIT BREAKER: %d consecutive prunes, still at %.1f%%. "
|
|
5095
|
+
"Forcing finalize to prevent death spiral.",
|
|
5096
|
+
monitor.prune_count,
|
|
5097
|
+
post_util * 100,
|
|
5098
|
+
)
|
|
5099
|
+
monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
|
|
5100
|
+
monitor.tool_state_auto_budget_remaining = 1
|
|
5101
|
+
monitor.reset_completion_recovery()
|
|
4987
5102
|
|
|
4988
5103
|
openai_body = build_openai_request(
|
|
4989
5104
|
body,
|
|
@@ -5104,6 +5219,10 @@ async def messages(request: Request):
|
|
|
5104
5219
|
|
|
5105
5220
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5106
5221
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5222
|
+
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5223
|
+
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
5224
|
+
if upstream_input > 0:
|
|
5225
|
+
monitor.last_input_tokens = upstream_input
|
|
5107
5226
|
if PROXY_FORCE_NON_STREAM:
|
|
5108
5227
|
logger.info(
|
|
5109
5228
|
"FORCED NON-STREAM: served stream response via guarded non-stream path"
|
|
@@ -5441,6 +5560,10 @@ async def messages(request: Request):
|
|
|
5441
5560
|
# Track output tokens in session monitor
|
|
5442
5561
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
5443
5562
|
monitor.record_response(output_tokens)
|
|
5563
|
+
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5564
|
+
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
5565
|
+
if upstream_input > 0:
|
|
5566
|
+
monitor.last_input_tokens = upstream_input
|
|
5444
5567
|
|
|
5445
5568
|
return anthropic_resp
|
|
5446
5569
|
|
|
@@ -3169,6 +3169,214 @@ class TestToolStarvationBreaker(unittest.TestCase):
|
|
|
3169
3169
|
self.assertIn("tools", result)
|
|
3170
3170
|
|
|
3171
3171
|
|
|
3172
|
+
class TestPruningImprovements(unittest.TestCase):
|
|
3173
|
+
"""Tests for pruning death spiral fixes."""
|
|
3174
|
+
|
|
3175
|
+
def test_prune_uses_upstream_tokens_when_higher(self):
|
|
3176
|
+
"""Option 1: upstream last_input_tokens used when higher than local estimate."""
|
|
3177
|
+
monitor = proxy.SessionMonitor(context_window=10000)
|
|
3178
|
+
# Simulate upstream reporting higher token count than local estimate
|
|
3179
|
+
monitor.last_input_tokens = 9000 # 90% - above 85% threshold
|
|
3180
|
+
body = {
|
|
3181
|
+
"model": "test",
|
|
3182
|
+
"messages": [
|
|
3183
|
+
{"role": "user", "content": "start"},
|
|
3184
|
+
{"role": "assistant", "content": "ok"},
|
|
3185
|
+
{"role": "user", "content": "a" * 100},
|
|
3186
|
+
{"role": "assistant", "content": "b" * 100},
|
|
3187
|
+
{"role": "user", "content": "c" * 100},
|
|
3188
|
+
{"role": "assistant", "content": "d" * 100},
|
|
3189
|
+
{"role": "user", "content": "e" * 100},
|
|
3190
|
+
{"role": "assistant", "content": "f" * 100},
|
|
3191
|
+
{"role": "user", "content": "g" * 100},
|
|
3192
|
+
{"role": "assistant", "content": "h" * 100},
|
|
3193
|
+
{"role": "user", "content": "continue"},
|
|
3194
|
+
],
|
|
3195
|
+
}
|
|
3196
|
+
# Local estimate_total_tokens will be much lower than 9000
|
|
3197
|
+
local_est = proxy.estimate_total_tokens(body)
|
|
3198
|
+
self.assertLess(local_est, 9000)
|
|
3199
|
+
# The pruning code should use upstream's 9000 for the decision
|
|
3200
|
+
|
|
3201
|
+
def test_prune_conversation_accepts_keep_last(self):
|
|
3202
|
+
"""Option 3: prune_conversation accepts keep_last parameter."""
|
|
3203
|
+
body = {
|
|
3204
|
+
"messages": [
|
|
3205
|
+
{"role": "user", "content": "first"},
|
|
3206
|
+
{"role": "assistant", "content": "a" * 500},
|
|
3207
|
+
{"role": "user", "content": "b" * 500},
|
|
3208
|
+
{"role": "assistant", "content": "c" * 500},
|
|
3209
|
+
{"role": "user", "content": "d" * 500},
|
|
3210
|
+
{"role": "assistant", "content": "e" * 500},
|
|
3211
|
+
{"role": "user", "content": "f" * 500},
|
|
3212
|
+
{"role": "assistant", "content": "g" * 500},
|
|
3213
|
+
{"role": "user", "content": "h" * 500},
|
|
3214
|
+
{"role": "assistant", "content": "i" * 500},
|
|
3215
|
+
{"role": "user", "content": "last"},
|
|
3216
|
+
],
|
|
3217
|
+
}
|
|
3218
|
+
# With keep_last=4, more middle messages should be prunable
|
|
3219
|
+
result_8 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=8)
|
|
3220
|
+
result_4 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=4)
|
|
3221
|
+
# keep_last=4 should result in fewer or equal messages
|
|
3222
|
+
self.assertLessEqual(
|
|
3223
|
+
len(result_4.get("messages", [])),
|
|
3224
|
+
len(result_8.get("messages", [])),
|
|
3225
|
+
)
|
|
3226
|
+
|
|
3227
|
+
def test_prune_circuit_breaker_sets_finalize(self):
|
|
3228
|
+
"""Option 2: circuit breaker forces finalize after repeated prunes."""
|
|
3229
|
+
monitor = proxy.SessionMonitor(context_window=10000)
|
|
3230
|
+
monitor.prune_count = 3 # Already pruned 3 times
|
|
3231
|
+
# After the pruning code runs and still exceeds threshold,
|
|
3232
|
+
# it should set finalize phase
|
|
3233
|
+
monitor.set_tool_turn_phase("act", reason="test")
|
|
3234
|
+
# Simulate the circuit breaker logic
|
|
3235
|
+
monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
|
|
3236
|
+
self.assertEqual(monitor.tool_turn_phase, "finalize")
|
|
3237
|
+
|
|
3238
|
+
|
|
3239
|
+
class TestCycleBreakOptions(unittest.TestCase):
|
|
3240
|
+
"""Tests for cycle-break options: hint injection, tool narrowing, reduced budgets."""
|
|
3241
|
+
|
|
3242
|
+
def test_cycle_break_injects_hint_message(self):
|
|
3243
|
+
"""Option 1: cycle detection injects a user hint about the cycling tools."""
|
|
3244
|
+
old_state = getattr(proxy, "PROXY_TOOL_STATE_MACHINE")
|
|
3245
|
+
old_min_msgs = getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES")
|
|
3246
|
+
old_forced = getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET")
|
|
3247
|
+
old_auto = getattr(proxy, "PROXY_TOOL_STATE_AUTO_BUDGET")
|
|
3248
|
+
old_stagnation = getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD")
|
|
3249
|
+
old_cycle_window = getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW")
|
|
3250
|
+
try:
|
|
3251
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
3252
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
3253
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
|
|
3254
|
+
setattr(proxy, "PROXY_TOOL_STATE_AUTO_BUDGET", 2)
|
|
3255
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 99)
|
|
3256
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 4)
|
|
3257
|
+
|
|
3258
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3259
|
+
monitor.tool_turn_phase = "act"
|
|
3260
|
+
monitor.tool_state_forced_budget_remaining = 20
|
|
3261
|
+
monitor.tool_call_history = ["Bash", "Bash", "Bash", "Bash"]
|
|
3262
|
+
monitor.last_tool_fingerprint = "Bash"
|
|
3263
|
+
|
|
3264
|
+
body = {
|
|
3265
|
+
"model": "test",
|
|
3266
|
+
"messages": [
|
|
3267
|
+
{"role": "user", "content": "start"},
|
|
3268
|
+
{
|
|
3269
|
+
"role": "assistant",
|
|
3270
|
+
"content": [
|
|
3271
|
+
{"type": "tool_use", "id": "t1", "name": "Bash", "input": {"command": "ls"}},
|
|
3272
|
+
],
|
|
3273
|
+
},
|
|
3274
|
+
{
|
|
3275
|
+
"role": "user",
|
|
3276
|
+
"content": [
|
|
3277
|
+
{"type": "tool_result", "tool_use_id": "t1", "content": "ok"},
|
|
3278
|
+
],
|
|
3279
|
+
},
|
|
3280
|
+
],
|
|
3281
|
+
"tools": [
|
|
3282
|
+
{"name": "Bash", "description": "Run command", "input_schema": {"type": "object"}},
|
|
3283
|
+
{"name": "Read", "description": "Read file", "input_schema": {"type": "object"}},
|
|
3284
|
+
],
|
|
3285
|
+
}
|
|
3286
|
+
|
|
3287
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
3288
|
+
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
3289
|
+
# Check that a cycle-break hint was injected
|
|
3290
|
+
messages = openai.get("messages", [])
|
|
3291
|
+
last_msg = messages[-1] if messages else {}
|
|
3292
|
+
self.assertEqual(last_msg.get("role"), "user")
|
|
3293
|
+
self.assertIn("Bash", last_msg.get("content", ""))
|
|
3294
|
+
self.assertIn("DIFFERENT tool", last_msg.get("content", ""))
|
|
3295
|
+
finally:
|
|
3296
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
3297
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
|
|
3298
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", old_forced)
|
|
3299
|
+
setattr(proxy, "PROXY_TOOL_STATE_AUTO_BUDGET", old_auto)
|
|
3300
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", old_stagnation)
|
|
3301
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", old_cycle_window)
|
|
3302
|
+
|
|
3303
|
+
def test_cycle_break_narrows_tools(self):
|
|
3304
|
+
"""Option 2: cycling tools are excluded from the tools array during review."""
|
|
3305
|
+
old_state = getattr(proxy, "PROXY_TOOL_STATE_MACHINE")
|
|
3306
|
+
old_min_msgs = getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES")
|
|
3307
|
+
old_forced = getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET")
|
|
3308
|
+
old_auto = getattr(proxy, "PROXY_TOOL_STATE_AUTO_BUDGET")
|
|
3309
|
+
old_stagnation = getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD")
|
|
3310
|
+
old_cycle_window = getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW")
|
|
3311
|
+
try:
|
|
3312
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
3313
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
3314
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
|
|
3315
|
+
setattr(proxy, "PROXY_TOOL_STATE_AUTO_BUDGET", 2)
|
|
3316
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 99)
|
|
3317
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 4)
|
|
3318
|
+
|
|
3319
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3320
|
+
monitor.tool_turn_phase = "act"
|
|
3321
|
+
monitor.tool_state_forced_budget_remaining = 20
|
|
3322
|
+
monitor.tool_call_history = ["Bash", "Bash", "Bash", "Bash"]
|
|
3323
|
+
monitor.last_tool_fingerprint = "Bash"
|
|
3324
|
+
|
|
3325
|
+
body = {
|
|
3326
|
+
"model": "test",
|
|
3327
|
+
"messages": [
|
|
3328
|
+
{"role": "user", "content": "start"},
|
|
3329
|
+
{
|
|
3330
|
+
"role": "assistant",
|
|
3331
|
+
"content": [
|
|
3332
|
+
{"type": "tool_use", "id": "t1", "name": "Bash", "input": {"command": "ls"}},
|
|
3333
|
+
],
|
|
3334
|
+
},
|
|
3335
|
+
{
|
|
3336
|
+
"role": "user",
|
|
3337
|
+
"content": [
|
|
3338
|
+
{"type": "tool_result", "tool_use_id": "t1", "content": "ok"},
|
|
3339
|
+
],
|
|
3340
|
+
},
|
|
3341
|
+
],
|
|
3342
|
+
"tools": [
|
|
3343
|
+
{"name": "Bash", "description": "Run command", "input_schema": {"type": "object"}},
|
|
3344
|
+
{"name": "Read", "description": "Read file", "input_schema": {"type": "object"}},
|
|
3345
|
+
{"name": "Write", "description": "Write file", "input_schema": {"type": "object"}},
|
|
3346
|
+
],
|
|
3347
|
+
}
|
|
3348
|
+
|
|
3349
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
3350
|
+
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
3351
|
+
# Bash should be excluded, Read and Write should remain
|
|
3352
|
+
tool_names = [t["function"]["name"] for t in openai.get("tools", [])]
|
|
3353
|
+
self.assertNotIn("Bash", tool_names)
|
|
3354
|
+
self.assertIn("Read", tool_names)
|
|
3355
|
+
self.assertIn("Write", tool_names)
|
|
3356
|
+
finally:
|
|
3357
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
3358
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
|
|
3359
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", old_forced)
|
|
3360
|
+
setattr(proxy, "PROXY_TOOL_STATE_AUTO_BUDGET", old_auto)
|
|
3361
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", old_stagnation)
|
|
3362
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", old_cycle_window)
|
|
3363
|
+
|
|
3364
|
+
def test_forced_budget_default_is_12(self):
|
|
3365
|
+
"""Option 3: default forced budget reduced from 24 to 12."""
|
|
3366
|
+
self.assertEqual(proxy.PROXY_TOOL_STATE_FORCED_BUDGET, 12)
|
|
3367
|
+
|
|
3368
|
+
def test_review_cycle_limit_default_is_1(self):
|
|
3369
|
+
"""Option 4: default review cycle limit reduced from 2 to 1."""
|
|
3370
|
+
self.assertEqual(proxy.PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT, 1)
|
|
3371
|
+
|
|
3372
|
+
def test_cycling_tool_names_cleared_on_reset(self):
|
|
3373
|
+
"""cycling_tool_names is cleared when tool turn state resets."""
|
|
3374
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3375
|
+
monitor.cycling_tool_names = ["Bash", "Read"]
|
|
3376
|
+
monitor.reset_tool_turn_state(reason="test")
|
|
3377
|
+
self.assertEqual(monitor.cycling_tool_names, [])
|
|
3378
|
+
|
|
3379
|
+
|
|
3172
3380
|
if __name__ == "__main__":
|
|
3173
3381
|
unittest.main()
|
|
3174
3382
|
|