@miller-tech/uap 1.20.10 → 1.20.12
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -147,10 +147,10 @@ PROXY_TOOL_STATE_FORCED_BUDGET = int(
|
|
|
147
147
|
)
|
|
148
148
|
PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
|
|
149
149
|
PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
|
|
150
|
-
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "
|
|
150
|
+
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "5")
|
|
151
151
|
)
|
|
152
152
|
PROXY_TOOL_STATE_CYCLE_WINDOW = int(
|
|
153
|
-
os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "
|
|
153
|
+
os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "4")
|
|
154
154
|
)
|
|
155
155
|
PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
156
156
|
os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
|
|
@@ -1104,7 +1104,10 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
1104
1104
|
|
|
1105
1105
|
|
|
1106
1106
|
def prune_conversation(
|
|
1107
|
-
anthropic_body: dict,
|
|
1107
|
+
anthropic_body: dict,
|
|
1108
|
+
context_window: int,
|
|
1109
|
+
target_fraction: float = 0.65,
|
|
1110
|
+
keep_last: int = 8,
|
|
1108
1111
|
) -> dict:
|
|
1109
1112
|
"""Prune the conversation to fit within the context window.
|
|
1110
1113
|
|
|
@@ -1119,6 +1122,7 @@ def prune_conversation(
|
|
|
1119
1122
|
anthropic_body: The full Anthropic request body
|
|
1120
1123
|
context_window: Maximum context window in tokens
|
|
1121
1124
|
target_fraction: Target utilization after pruning (0.0-1.0)
|
|
1125
|
+
keep_last: Number of recent messages to always keep (default 8)
|
|
1122
1126
|
|
|
1123
1127
|
Returns:
|
|
1124
1128
|
Modified anthropic_body with pruned messages
|
|
@@ -1131,6 +1135,8 @@ def prune_conversation(
|
|
|
1131
1135
|
target_tokens = int(context_window * target_fraction)
|
|
1132
1136
|
|
|
1133
1137
|
# Estimate non-message tokens (system, tools, agentic supplement)
|
|
1138
|
+
# Apply a 1.5x safety factor to account for chat template overhead
|
|
1139
|
+
# and tokenization differences between local estimate and upstream
|
|
1134
1140
|
overhead_tokens = 0
|
|
1135
1141
|
system = anthropic_body.get("system", "")
|
|
1136
1142
|
if isinstance(system, str):
|
|
@@ -1144,6 +1150,7 @@ def prune_conversation(
|
|
|
1144
1150
|
tools = anthropic_body.get("tools", [])
|
|
1145
1151
|
if tools:
|
|
1146
1152
|
overhead_tokens += estimate_tokens(json.dumps(tools))
|
|
1153
|
+
overhead_tokens = int(overhead_tokens * 1.5) # Safety factor for template overhead
|
|
1147
1154
|
|
|
1148
1155
|
# Budget for messages
|
|
1149
1156
|
message_budget = target_tokens - overhead_tokens
|
|
@@ -1152,7 +1159,7 @@ def prune_conversation(
|
|
|
1152
1159
|
return anthropic_body
|
|
1153
1160
|
|
|
1154
1161
|
# Always keep the first user message and the last N messages
|
|
1155
|
-
KEEP_LAST =
|
|
1162
|
+
KEEP_LAST = keep_last
|
|
1156
1163
|
protected_head = messages[:1] # First user message
|
|
1157
1164
|
protected_tail = (
|
|
1158
1165
|
messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
|
|
@@ -2053,7 +2060,7 @@ def _resolve_state_machine_tool_choice(
|
|
|
2053
2060
|
monitor.tool_state_stagnation_streak,
|
|
2054
2061
|
monitor.tool_state_review_cycles,
|
|
2055
2062
|
)
|
|
2056
|
-
return "
|
|
2063
|
+
return "required", reason
|
|
2057
2064
|
|
|
2058
2065
|
if monitor.tool_state_forced_budget_remaining <= 0:
|
|
2059
2066
|
monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
|
|
@@ -2068,7 +2075,7 @@ def _resolve_state_machine_tool_choice(
|
|
|
2068
2075
|
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d)",
|
|
2069
2076
|
monitor.tool_state_review_cycles,
|
|
2070
2077
|
)
|
|
2071
|
-
return "
|
|
2078
|
+
return "required", "forced_budget_exhausted"
|
|
2072
2079
|
|
|
2073
2080
|
monitor.tool_state_forced_budget_remaining -= 1
|
|
2074
2081
|
return "required", "act"
|
|
@@ -2088,7 +2095,7 @@ def _resolve_state_machine_tool_choice(
|
|
|
2088
2095
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2089
2096
|
)
|
|
2090
2097
|
return "required", "review_complete"
|
|
2091
|
-
return "
|
|
2098
|
+
return "required", "review"
|
|
2092
2099
|
|
|
2093
2100
|
if monitor.tool_turn_phase == "finalize":
|
|
2094
2101
|
if monitor.tool_state_auto_budget_remaining <= 0:
|
|
@@ -4962,28 +4969,86 @@ async def messages(request: Request):
|
|
|
4962
4969
|
monitor.log_status()
|
|
4963
4970
|
|
|
4964
4971
|
# --- Option C: Prune conversation if approaching context limit ---
|
|
4972
|
+
# Option 1: Prefer upstream actual token count over local estimate
|
|
4965
4973
|
ctx_window = monitor.context_window
|
|
4966
4974
|
if ctx_window > 0:
|
|
4967
|
-
|
|
4975
|
+
# Use the upstream's actual prompt_tokens if available and higher
|
|
4976
|
+
# than the local estimate (the upstream counts chat template overhead,
|
|
4977
|
+
# tool schema tokenization, etc. that local heuristics miss).
|
|
4978
|
+
effective_tokens = estimated_tokens
|
|
4979
|
+
if monitor.last_input_tokens > estimated_tokens:
|
|
4980
|
+
effective_tokens = monitor.last_input_tokens
|
|
4981
|
+
logger.info(
|
|
4982
|
+
"Using upstream token count %d (local estimate %d) for prune decision",
|
|
4983
|
+
effective_tokens,
|
|
4984
|
+
estimated_tokens,
|
|
4985
|
+
)
|
|
4986
|
+
utilization = effective_tokens / ctx_window
|
|
4968
4987
|
if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
4969
4988
|
logger.warning(
|
|
4970
4989
|
"Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
|
|
4971
4990
|
utilization * 100,
|
|
4972
4991
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
4973
4992
|
)
|
|
4993
|
+
# Option 3: Aggressive pruning at critical utilization
|
|
4994
|
+
target_frac = _resolve_prune_target_fraction()
|
|
4995
|
+
keep_last = 8
|
|
4996
|
+
if utilization >= 0.90:
|
|
4997
|
+
keep_last = 4
|
|
4998
|
+
target_frac = min(target_frac, 0.40)
|
|
4999
|
+
logger.warning(
|
|
5000
|
+
"CRITICAL PRUNE: utilization %.1f%% >= 90%%, using keep_last=%d target=%.0f%%",
|
|
5001
|
+
utilization * 100,
|
|
5002
|
+
keep_last,
|
|
5003
|
+
target_frac * 100,
|
|
5004
|
+
)
|
|
4974
5005
|
body = prune_conversation(
|
|
4975
|
-
body, ctx_window, target_fraction=
|
|
5006
|
+
body, ctx_window, target_fraction=target_frac, keep_last=keep_last
|
|
4976
5007
|
)
|
|
4977
5008
|
monitor.prune_count += 1
|
|
4978
|
-
#
|
|
5009
|
+
# Option 4: Post-prune validation — verify actual reduction
|
|
4979
5010
|
estimated_tokens = estimate_total_tokens(body)
|
|
4980
5011
|
monitor.record_request(estimated_tokens)
|
|
5012
|
+
post_util = estimated_tokens / ctx_window
|
|
4981
5013
|
n_messages = len(body.get("messages", []))
|
|
4982
5014
|
logger.info(
|
|
4983
|
-
"After pruning: ~%d tokens
|
|
5015
|
+
"After pruning: ~%d tokens (%d messages), utilization %.1f%%",
|
|
4984
5016
|
estimated_tokens,
|
|
4985
5017
|
n_messages,
|
|
5018
|
+
post_util * 100,
|
|
4986
5019
|
)
|
|
5020
|
+
# If still above threshold after first prune, do aggressive second pass
|
|
5021
|
+
if post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
5022
|
+
logger.warning(
|
|
5023
|
+
"POST-PRUNE VALIDATION: still at %.1f%% after prune, doing aggressive pass",
|
|
5024
|
+
post_util * 100,
|
|
5025
|
+
)
|
|
5026
|
+
body = prune_conversation(
|
|
5027
|
+
body, ctx_window, target_fraction=0.35, keep_last=4
|
|
5028
|
+
)
|
|
5029
|
+
monitor.prune_count += 1
|
|
5030
|
+
estimated_tokens = estimate_total_tokens(body)
|
|
5031
|
+
monitor.record_request(estimated_tokens)
|
|
5032
|
+
post_util = estimated_tokens / ctx_window
|
|
5033
|
+
n_messages = len(body.get("messages", []))
|
|
5034
|
+
logger.info(
|
|
5035
|
+
"After aggressive prune: ~%d tokens (%d messages), utilization %.1f%%",
|
|
5036
|
+
estimated_tokens,
|
|
5037
|
+
n_messages,
|
|
5038
|
+
post_util * 100,
|
|
5039
|
+
)
|
|
5040
|
+
# Option 2: Circuit breaker — if 3+ consecutive prunes and still above,
|
|
5041
|
+
# force finalize (drop tools, let model wrap up)
|
|
5042
|
+
if monitor.prune_count >= 3 and post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
5043
|
+
logger.error(
|
|
5044
|
+
"PRUNE CIRCUIT BREAKER: %d consecutive prunes, still at %.1f%%. "
|
|
5045
|
+
"Forcing finalize to prevent death spiral.",
|
|
5046
|
+
monitor.prune_count,
|
|
5047
|
+
post_util * 100,
|
|
5048
|
+
)
|
|
5049
|
+
monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
|
|
5050
|
+
monitor.tool_state_auto_budget_remaining = 1
|
|
5051
|
+
monitor.reset_completion_recovery()
|
|
4987
5052
|
|
|
4988
5053
|
openai_body = build_openai_request(
|
|
4989
5054
|
body,
|
|
@@ -5104,6 +5169,10 @@ async def messages(request: Request):
|
|
|
5104
5169
|
|
|
5105
5170
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5106
5171
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5172
|
+
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5173
|
+
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
5174
|
+
if upstream_input > 0:
|
|
5175
|
+
monitor.last_input_tokens = upstream_input
|
|
5107
5176
|
if PROXY_FORCE_NON_STREAM:
|
|
5108
5177
|
logger.info(
|
|
5109
5178
|
"FORCED NON-STREAM: served stream response via guarded non-stream path"
|
|
@@ -5441,6 +5510,10 @@ async def messages(request: Request):
|
|
|
5441
5510
|
# Track output tokens in session monitor
|
|
5442
5511
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
5443
5512
|
monitor.record_response(output_tokens)
|
|
5513
|
+
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5514
|
+
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
5515
|
+
if upstream_input > 0:
|
|
5516
|
+
monitor.last_input_tokens = upstream_input
|
|
5444
5517
|
|
|
5445
5518
|
return anthropic_resp
|
|
5446
5519
|
|
|
@@ -1861,7 +1861,8 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
1861
1861
|
|
|
1862
1862
|
self.assertEqual(openai_1.get("tool_choice"), "required")
|
|
1863
1863
|
self.assertEqual(openai_2.get("tool_choice"), "required")
|
|
1864
|
-
|
|
1864
|
+
# Review phase now keeps required to prevent end-turn escape
|
|
1865
|
+
self.assertEqual(openai_3.get("tool_choice"), "required")
|
|
1865
1866
|
finally:
|
|
1866
1867
|
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
1867
1868
|
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
|
|
@@ -1938,7 +1939,8 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
1938
1939
|
}
|
|
1939
1940
|
|
|
1940
1941
|
openai = proxy.build_openai_request(body, monitor)
|
|
1941
|
-
|
|
1942
|
+
# Review phase now keeps required to prevent end-turn escape
|
|
1943
|
+
self.assertEqual(openai.get("tool_choice"), "required")
|
|
1942
1944
|
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
1943
1945
|
finally:
|
|
1944
1946
|
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
@@ -2067,7 +2069,8 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2067
2069
|
}
|
|
2068
2070
|
|
|
2069
2071
|
openai = proxy.build_openai_request(body, monitor)
|
|
2070
|
-
|
|
2072
|
+
# Review phase now keeps required to prevent end-turn escape
|
|
2073
|
+
self.assertEqual(openai.get("tool_choice"), "required")
|
|
2071
2074
|
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
2072
2075
|
self.assertEqual(monitor.tool_state_review_cycles, 1)
|
|
2073
2076
|
finally:
|
|
@@ -3166,6 +3169,73 @@ class TestToolStarvationBreaker(unittest.TestCase):
|
|
|
3166
3169
|
self.assertIn("tools", result)
|
|
3167
3170
|
|
|
3168
3171
|
|
|
3172
|
+
class TestPruningImprovements(unittest.TestCase):
|
|
3173
|
+
"""Tests for pruning death spiral fixes."""
|
|
3174
|
+
|
|
3175
|
+
def test_prune_uses_upstream_tokens_when_higher(self):
|
|
3176
|
+
"""Option 1: upstream last_input_tokens used when higher than local estimate."""
|
|
3177
|
+
monitor = proxy.SessionMonitor(context_window=10000)
|
|
3178
|
+
# Simulate upstream reporting higher token count than local estimate
|
|
3179
|
+
monitor.last_input_tokens = 9000 # 90% - above 85% threshold
|
|
3180
|
+
body = {
|
|
3181
|
+
"model": "test",
|
|
3182
|
+
"messages": [
|
|
3183
|
+
{"role": "user", "content": "start"},
|
|
3184
|
+
{"role": "assistant", "content": "ok"},
|
|
3185
|
+
{"role": "user", "content": "a" * 100},
|
|
3186
|
+
{"role": "assistant", "content": "b" * 100},
|
|
3187
|
+
{"role": "user", "content": "c" * 100},
|
|
3188
|
+
{"role": "assistant", "content": "d" * 100},
|
|
3189
|
+
{"role": "user", "content": "e" * 100},
|
|
3190
|
+
{"role": "assistant", "content": "f" * 100},
|
|
3191
|
+
{"role": "user", "content": "g" * 100},
|
|
3192
|
+
{"role": "assistant", "content": "h" * 100},
|
|
3193
|
+
{"role": "user", "content": "continue"},
|
|
3194
|
+
],
|
|
3195
|
+
}
|
|
3196
|
+
# Local estimate_total_tokens will be much lower than 9000
|
|
3197
|
+
local_est = proxy.estimate_total_tokens(body)
|
|
3198
|
+
self.assertLess(local_est, 9000)
|
|
3199
|
+
# The pruning code should use upstream's 9000 for the decision
|
|
3200
|
+
|
|
3201
|
+
def test_prune_conversation_accepts_keep_last(self):
|
|
3202
|
+
"""Option 3: prune_conversation accepts keep_last parameter."""
|
|
3203
|
+
body = {
|
|
3204
|
+
"messages": [
|
|
3205
|
+
{"role": "user", "content": "first"},
|
|
3206
|
+
{"role": "assistant", "content": "a" * 500},
|
|
3207
|
+
{"role": "user", "content": "b" * 500},
|
|
3208
|
+
{"role": "assistant", "content": "c" * 500},
|
|
3209
|
+
{"role": "user", "content": "d" * 500},
|
|
3210
|
+
{"role": "assistant", "content": "e" * 500},
|
|
3211
|
+
{"role": "user", "content": "f" * 500},
|
|
3212
|
+
{"role": "assistant", "content": "g" * 500},
|
|
3213
|
+
{"role": "user", "content": "h" * 500},
|
|
3214
|
+
{"role": "assistant", "content": "i" * 500},
|
|
3215
|
+
{"role": "user", "content": "last"},
|
|
3216
|
+
],
|
|
3217
|
+
}
|
|
3218
|
+
# With keep_last=4, more middle messages should be prunable
|
|
3219
|
+
result_8 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=8)
|
|
3220
|
+
result_4 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=4)
|
|
3221
|
+
# keep_last=4 should result in fewer or equal messages
|
|
3222
|
+
self.assertLessEqual(
|
|
3223
|
+
len(result_4.get("messages", [])),
|
|
3224
|
+
len(result_8.get("messages", [])),
|
|
3225
|
+
)
|
|
3226
|
+
|
|
3227
|
+
def test_prune_circuit_breaker_sets_finalize(self):
|
|
3228
|
+
"""Option 2: circuit breaker forces finalize after repeated prunes."""
|
|
3229
|
+
monitor = proxy.SessionMonitor(context_window=10000)
|
|
3230
|
+
monitor.prune_count = 3 # Already pruned 3 times
|
|
3231
|
+
# After the pruning code runs and still exceeds threshold,
|
|
3232
|
+
# it should set finalize phase
|
|
3233
|
+
monitor.set_tool_turn_phase("act", reason="test")
|
|
3234
|
+
# Simulate the circuit breaker logic
|
|
3235
|
+
monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
|
|
3236
|
+
self.assertEqual(monitor.tool_turn_phase, "finalize")
|
|
3237
|
+
|
|
3238
|
+
|
|
3169
3239
|
if __name__ == "__main__":
|
|
3170
3240
|
unittest.main()
|
|
3171
3241
|
|