@miller-tech/uap 1.20.11 → 1.20.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1104,7 +1104,10 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
1104
1104
|
|
|
1105
1105
|
|
|
1106
1106
|
def prune_conversation(
|
|
1107
|
-
anthropic_body: dict,
|
|
1107
|
+
anthropic_body: dict,
|
|
1108
|
+
context_window: int,
|
|
1109
|
+
target_fraction: float = 0.65,
|
|
1110
|
+
keep_last: int = 8,
|
|
1108
1111
|
) -> dict:
|
|
1109
1112
|
"""Prune the conversation to fit within the context window.
|
|
1110
1113
|
|
|
@@ -1119,6 +1122,7 @@ def prune_conversation(
|
|
|
1119
1122
|
anthropic_body: The full Anthropic request body
|
|
1120
1123
|
context_window: Maximum context window in tokens
|
|
1121
1124
|
target_fraction: Target utilization after pruning (0.0-1.0)
|
|
1125
|
+
keep_last: Number of recent messages to always keep (default 8)
|
|
1122
1126
|
|
|
1123
1127
|
Returns:
|
|
1124
1128
|
Modified anthropic_body with pruned messages
|
|
@@ -1131,6 +1135,8 @@ def prune_conversation(
|
|
|
1131
1135
|
target_tokens = int(context_window * target_fraction)
|
|
1132
1136
|
|
|
1133
1137
|
# Estimate non-message tokens (system, tools, agentic supplement)
|
|
1138
|
+
# Apply a 1.5x safety factor to account for chat template overhead
|
|
1139
|
+
# and tokenization differences between local estimate and upstream
|
|
1134
1140
|
overhead_tokens = 0
|
|
1135
1141
|
system = anthropic_body.get("system", "")
|
|
1136
1142
|
if isinstance(system, str):
|
|
@@ -1144,6 +1150,7 @@ def prune_conversation(
|
|
|
1144
1150
|
tools = anthropic_body.get("tools", [])
|
|
1145
1151
|
if tools:
|
|
1146
1152
|
overhead_tokens += estimate_tokens(json.dumps(tools))
|
|
1153
|
+
overhead_tokens = int(overhead_tokens * 1.5) # Safety factor for template overhead
|
|
1147
1154
|
|
|
1148
1155
|
# Budget for messages
|
|
1149
1156
|
message_budget = target_tokens - overhead_tokens
|
|
@@ -1152,7 +1159,7 @@ def prune_conversation(
|
|
|
1152
1159
|
return anthropic_body
|
|
1153
1160
|
|
|
1154
1161
|
# Always keep the first user message and the last N messages
|
|
1155
|
-
KEEP_LAST =
|
|
1162
|
+
KEEP_LAST = keep_last
|
|
1156
1163
|
protected_head = messages[:1] # First user message
|
|
1157
1164
|
protected_tail = (
|
|
1158
1165
|
messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
|
|
@@ -4962,28 +4969,86 @@ async def messages(request: Request):
|
|
|
4962
4969
|
monitor.log_status()
|
|
4963
4970
|
|
|
4964
4971
|
# --- Option C: Prune conversation if approaching context limit ---
|
|
4972
|
+
# Option 1: Prefer upstream actual token count over local estimate
|
|
4965
4973
|
ctx_window = monitor.context_window
|
|
4966
4974
|
if ctx_window > 0:
|
|
4967
|
-
|
|
4975
|
+
# Use the upstream's actual prompt_tokens if available and higher
|
|
4976
|
+
# than the local estimate (the upstream counts chat template overhead,
|
|
4977
|
+
# tool schema tokenization, etc. that local heuristics miss).
|
|
4978
|
+
effective_tokens = estimated_tokens
|
|
4979
|
+
if monitor.last_input_tokens > estimated_tokens:
|
|
4980
|
+
effective_tokens = monitor.last_input_tokens
|
|
4981
|
+
logger.info(
|
|
4982
|
+
"Using upstream token count %d (local estimate %d) for prune decision",
|
|
4983
|
+
effective_tokens,
|
|
4984
|
+
estimated_tokens,
|
|
4985
|
+
)
|
|
4986
|
+
utilization = effective_tokens / ctx_window
|
|
4968
4987
|
if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
4969
4988
|
logger.warning(
|
|
4970
4989
|
"Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
|
|
4971
4990
|
utilization * 100,
|
|
4972
4991
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
4973
4992
|
)
|
|
4993
|
+
# Option 3: Aggressive pruning at critical utilization
|
|
4994
|
+
target_frac = _resolve_prune_target_fraction()
|
|
4995
|
+
keep_last = 8
|
|
4996
|
+
if utilization >= 0.90:
|
|
4997
|
+
keep_last = 4
|
|
4998
|
+
target_frac = min(target_frac, 0.40)
|
|
4999
|
+
logger.warning(
|
|
5000
|
+
"CRITICAL PRUNE: utilization %.1f%% >= 90%%, using keep_last=%d target=%.0f%%",
|
|
5001
|
+
utilization * 100,
|
|
5002
|
+
keep_last,
|
|
5003
|
+
target_frac * 100,
|
|
5004
|
+
)
|
|
4974
5005
|
body = prune_conversation(
|
|
4975
|
-
body, ctx_window, target_fraction=
|
|
5006
|
+
body, ctx_window, target_fraction=target_frac, keep_last=keep_last
|
|
4976
5007
|
)
|
|
4977
5008
|
monitor.prune_count += 1
|
|
4978
|
-
#
|
|
5009
|
+
# Option 4: Post-prune validation — verify actual reduction
|
|
4979
5010
|
estimated_tokens = estimate_total_tokens(body)
|
|
4980
5011
|
monitor.record_request(estimated_tokens)
|
|
5012
|
+
post_util = estimated_tokens / ctx_window
|
|
4981
5013
|
n_messages = len(body.get("messages", []))
|
|
4982
5014
|
logger.info(
|
|
4983
|
-
"After pruning: ~%d tokens
|
|
5015
|
+
"After pruning: ~%d tokens (%d messages), utilization %.1f%%",
|
|
4984
5016
|
estimated_tokens,
|
|
4985
5017
|
n_messages,
|
|
5018
|
+
post_util * 100,
|
|
4986
5019
|
)
|
|
5020
|
+
# If still above threshold after first prune, do aggressive second pass
|
|
5021
|
+
if post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
5022
|
+
logger.warning(
|
|
5023
|
+
"POST-PRUNE VALIDATION: still at %.1f%% after prune, doing aggressive pass",
|
|
5024
|
+
post_util * 100,
|
|
5025
|
+
)
|
|
5026
|
+
body = prune_conversation(
|
|
5027
|
+
body, ctx_window, target_fraction=0.35, keep_last=4
|
|
5028
|
+
)
|
|
5029
|
+
monitor.prune_count += 1
|
|
5030
|
+
estimated_tokens = estimate_total_tokens(body)
|
|
5031
|
+
monitor.record_request(estimated_tokens)
|
|
5032
|
+
post_util = estimated_tokens / ctx_window
|
|
5033
|
+
n_messages = len(body.get("messages", []))
|
|
5034
|
+
logger.info(
|
|
5035
|
+
"After aggressive prune: ~%d tokens (%d messages), utilization %.1f%%",
|
|
5036
|
+
estimated_tokens,
|
|
5037
|
+
n_messages,
|
|
5038
|
+
post_util * 100,
|
|
5039
|
+
)
|
|
5040
|
+
# Option 2: Circuit breaker — if 3+ consecutive prunes and still above,
|
|
5041
|
+
# force finalize (drop tools, let model wrap up)
|
|
5042
|
+
if monitor.prune_count >= 3 and post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
|
|
5043
|
+
logger.error(
|
|
5044
|
+
"PRUNE CIRCUIT BREAKER: %d consecutive prunes, still at %.1f%%. "
|
|
5045
|
+
"Forcing finalize to prevent death spiral.",
|
|
5046
|
+
monitor.prune_count,
|
|
5047
|
+
post_util * 100,
|
|
5048
|
+
)
|
|
5049
|
+
monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
|
|
5050
|
+
monitor.tool_state_auto_budget_remaining = 1
|
|
5051
|
+
monitor.reset_completion_recovery()
|
|
4987
5052
|
|
|
4988
5053
|
openai_body = build_openai_request(
|
|
4989
5054
|
body,
|
|
@@ -5104,6 +5169,10 @@ async def messages(request: Request):
|
|
|
5104
5169
|
|
|
5105
5170
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5106
5171
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5172
|
+
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5173
|
+
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
5174
|
+
if upstream_input > 0:
|
|
5175
|
+
monitor.last_input_tokens = upstream_input
|
|
5107
5176
|
if PROXY_FORCE_NON_STREAM:
|
|
5108
5177
|
logger.info(
|
|
5109
5178
|
"FORCED NON-STREAM: served stream response via guarded non-stream path"
|
|
@@ -5441,6 +5510,10 @@ async def messages(request: Request):
|
|
|
5441
5510
|
# Track output tokens in session monitor
|
|
5442
5511
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
5443
5512
|
monitor.record_response(output_tokens)
|
|
5513
|
+
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5514
|
+
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
5515
|
+
if upstream_input > 0:
|
|
5516
|
+
monitor.last_input_tokens = upstream_input
|
|
5444
5517
|
|
|
5445
5518
|
return anthropic_resp
|
|
5446
5519
|
|
|
@@ -3169,6 +3169,73 @@ class TestToolStarvationBreaker(unittest.TestCase):
|
|
|
3169
3169
|
self.assertIn("tools", result)
|
|
3170
3170
|
|
|
3171
3171
|
|
|
3172
|
+
class TestPruningImprovements(unittest.TestCase):
|
|
3173
|
+
"""Tests for pruning death spiral fixes."""
|
|
3174
|
+
|
|
3175
|
+
def test_prune_uses_upstream_tokens_when_higher(self):
|
|
3176
|
+
"""Option 1: upstream last_input_tokens used when higher than local estimate."""
|
|
3177
|
+
monitor = proxy.SessionMonitor(context_window=10000)
|
|
3178
|
+
# Simulate upstream reporting higher token count than local estimate
|
|
3179
|
+
monitor.last_input_tokens = 9000 # 90% - above 85% threshold
|
|
3180
|
+
body = {
|
|
3181
|
+
"model": "test",
|
|
3182
|
+
"messages": [
|
|
3183
|
+
{"role": "user", "content": "start"},
|
|
3184
|
+
{"role": "assistant", "content": "ok"},
|
|
3185
|
+
{"role": "user", "content": "a" * 100},
|
|
3186
|
+
{"role": "assistant", "content": "b" * 100},
|
|
3187
|
+
{"role": "user", "content": "c" * 100},
|
|
3188
|
+
{"role": "assistant", "content": "d" * 100},
|
|
3189
|
+
{"role": "user", "content": "e" * 100},
|
|
3190
|
+
{"role": "assistant", "content": "f" * 100},
|
|
3191
|
+
{"role": "user", "content": "g" * 100},
|
|
3192
|
+
{"role": "assistant", "content": "h" * 100},
|
|
3193
|
+
{"role": "user", "content": "continue"},
|
|
3194
|
+
],
|
|
3195
|
+
}
|
|
3196
|
+
# Local estimate_total_tokens will be much lower than 9000
|
|
3197
|
+
local_est = proxy.estimate_total_tokens(body)
|
|
3198
|
+
self.assertLess(local_est, 9000)
|
|
3199
|
+
# The pruning code should use upstream's 9000 for the decision
|
|
3200
|
+
|
|
3201
|
+
def test_prune_conversation_accepts_keep_last(self):
|
|
3202
|
+
"""Option 3: prune_conversation accepts keep_last parameter."""
|
|
3203
|
+
body = {
|
|
3204
|
+
"messages": [
|
|
3205
|
+
{"role": "user", "content": "first"},
|
|
3206
|
+
{"role": "assistant", "content": "a" * 500},
|
|
3207
|
+
{"role": "user", "content": "b" * 500},
|
|
3208
|
+
{"role": "assistant", "content": "c" * 500},
|
|
3209
|
+
{"role": "user", "content": "d" * 500},
|
|
3210
|
+
{"role": "assistant", "content": "e" * 500},
|
|
3211
|
+
{"role": "user", "content": "f" * 500},
|
|
3212
|
+
{"role": "assistant", "content": "g" * 500},
|
|
3213
|
+
{"role": "user", "content": "h" * 500},
|
|
3214
|
+
{"role": "assistant", "content": "i" * 500},
|
|
3215
|
+
{"role": "user", "content": "last"},
|
|
3216
|
+
],
|
|
3217
|
+
}
|
|
3218
|
+
# With keep_last=4, more middle messages should be prunable
|
|
3219
|
+
result_8 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=8)
|
|
3220
|
+
result_4 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=4)
|
|
3221
|
+
# keep_last=4 should result in fewer or equal messages
|
|
3222
|
+
self.assertLessEqual(
|
|
3223
|
+
len(result_4.get("messages", [])),
|
|
3224
|
+
len(result_8.get("messages", [])),
|
|
3225
|
+
)
|
|
3226
|
+
|
|
3227
|
+
def test_prune_circuit_breaker_sets_finalize(self):
|
|
3228
|
+
"""Option 2: circuit breaker forces finalize after repeated prunes."""
|
|
3229
|
+
monitor = proxy.SessionMonitor(context_window=10000)
|
|
3230
|
+
monitor.prune_count = 3 # Already pruned 3 times
|
|
3231
|
+
# After the pruning code runs and still exceeds threshold,
|
|
3232
|
+
# it should set finalize phase
|
|
3233
|
+
monitor.set_tool_turn_phase("act", reason="test")
|
|
3234
|
+
# Simulate the circuit breaker logic
|
|
3235
|
+
monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
|
|
3236
|
+
self.assertEqual(monitor.tool_turn_phase, "finalize")
|
|
3237
|
+
|
|
3238
|
+
|
|
3172
3239
|
if __name__ == "__main__":
|
|
3173
3240
|
unittest.main()
|
|
3174
3241
|
|