@miller-tech/uap 1.20.10 → 1.20.12

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.10",
3
+ "version": "1.20.12",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -147,10 +147,10 @@ PROXY_TOOL_STATE_FORCED_BUDGET = int(
147
147
  )
148
148
  PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
149
149
  PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
150
- os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "9")
150
+ os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "5")
151
151
  )
152
152
  PROXY_TOOL_STATE_CYCLE_WINDOW = int(
153
- os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "8")
153
+ os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "4")
154
154
  )
155
155
  PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
156
156
  os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
@@ -1104,7 +1104,10 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
1104
1104
 
1105
1105
 
1106
1106
  def prune_conversation(
1107
- anthropic_body: dict, context_window: int, target_fraction: float = 0.65
1107
+ anthropic_body: dict,
1108
+ context_window: int,
1109
+ target_fraction: float = 0.65,
1110
+ keep_last: int = 8,
1108
1111
  ) -> dict:
1109
1112
  """Prune the conversation to fit within the context window.
1110
1113
 
@@ -1119,6 +1122,7 @@ def prune_conversation(
1119
1122
  anthropic_body: The full Anthropic request body
1120
1123
  context_window: Maximum context window in tokens
1121
1124
  target_fraction: Target utilization after pruning (0.0-1.0)
1125
+ keep_last: Number of recent messages to always keep (default 8)
1122
1126
 
1123
1127
  Returns:
1124
1128
  Modified anthropic_body with pruned messages
@@ -1131,6 +1135,8 @@ def prune_conversation(
1131
1135
  target_tokens = int(context_window * target_fraction)
1132
1136
 
1133
1137
  # Estimate non-message tokens (system, tools, agentic supplement)
1138
+ # Apply a 1.5x safety factor to account for chat template overhead
1139
+ # and tokenization differences between local estimate and upstream
1134
1140
  overhead_tokens = 0
1135
1141
  system = anthropic_body.get("system", "")
1136
1142
  if isinstance(system, str):
@@ -1144,6 +1150,7 @@ def prune_conversation(
1144
1150
  tools = anthropic_body.get("tools", [])
1145
1151
  if tools:
1146
1152
  overhead_tokens += estimate_tokens(json.dumps(tools))
1153
+ overhead_tokens = int(overhead_tokens * 1.5) # Safety factor for template overhead
1147
1154
 
1148
1155
  # Budget for messages
1149
1156
  message_budget = target_tokens - overhead_tokens
@@ -1152,7 +1159,7 @@ def prune_conversation(
1152
1159
  return anthropic_body
1153
1160
 
1154
1161
  # Always keep the first user message and the last N messages
1155
- KEEP_LAST = 8 # Keep the last 8 messages (recent context)
1162
+ KEEP_LAST = keep_last
1156
1163
  protected_head = messages[:1] # First user message
1157
1164
  protected_tail = (
1158
1165
  messages[-KEEP_LAST:] if len(messages) > KEEP_LAST else messages[1:]
@@ -2053,7 +2060,7 @@ def _resolve_state_machine_tool_choice(
2053
2060
  monitor.tool_state_stagnation_streak,
2054
2061
  monitor.tool_state_review_cycles,
2055
2062
  )
2056
- return "auto", reason
2063
+ return "required", reason
2057
2064
 
2058
2065
  if monitor.tool_state_forced_budget_remaining <= 0:
2059
2066
  monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
@@ -2068,7 +2075,7 @@ def _resolve_state_machine_tool_choice(
2068
2075
  "TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d)",
2069
2076
  monitor.tool_state_review_cycles,
2070
2077
  )
2071
- return "auto", "forced_budget_exhausted"
2078
+ return "required", "forced_budget_exhausted"
2072
2079
 
2073
2080
  monitor.tool_state_forced_budget_remaining -= 1
2074
2081
  return "required", "act"
@@ -2088,7 +2095,7 @@ def _resolve_state_machine_tool_choice(
2088
2095
  1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
2089
2096
  )
2090
2097
  return "required", "review_complete"
2091
- return "auto", "review"
2098
+ return "required", "review"
2092
2099
 
2093
2100
  if monitor.tool_turn_phase == "finalize":
2094
2101
  if monitor.tool_state_auto_budget_remaining <= 0:
@@ -4962,28 +4969,86 @@ async def messages(request: Request):
4962
4969
  monitor.log_status()
4963
4970
 
4964
4971
  # --- Option C: Prune conversation if approaching context limit ---
4972
+ # Option 1: Prefer upstream actual token count over local estimate
4965
4973
  ctx_window = monitor.context_window
4966
4974
  if ctx_window > 0:
4967
- utilization = estimated_tokens / ctx_window
4975
+ # Use the upstream's actual prompt_tokens if available and higher
4976
+ # than the local estimate (the upstream counts chat template overhead,
4977
+ # tool schema tokenization, etc. that local heuristics miss).
4978
+ effective_tokens = estimated_tokens
4979
+ if monitor.last_input_tokens > estimated_tokens:
4980
+ effective_tokens = monitor.last_input_tokens
4981
+ logger.info(
4982
+ "Using upstream token count %d (local estimate %d) for prune decision",
4983
+ effective_tokens,
4984
+ estimated_tokens,
4985
+ )
4986
+ utilization = effective_tokens / ctx_window
4968
4987
  if utilization >= PROXY_CONTEXT_PRUNE_THRESHOLD:
4969
4988
  logger.warning(
4970
4989
  "Context utilization %.1f%% exceeds threshold %.1f%% -- pruning conversation",
4971
4990
  utilization * 100,
4972
4991
  PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
4973
4992
  )
4993
+ # Option 3: Aggressive pruning at critical utilization
4994
+ target_frac = _resolve_prune_target_fraction()
4995
+ keep_last = 8
4996
+ if utilization >= 0.90:
4997
+ keep_last = 4
4998
+ target_frac = min(target_frac, 0.40)
4999
+ logger.warning(
5000
+ "CRITICAL PRUNE: utilization %.1f%% >= 90%%, using keep_last=%d target=%.0f%%",
5001
+ utilization * 100,
5002
+ keep_last,
5003
+ target_frac * 100,
5004
+ )
4974
5005
  body = prune_conversation(
4975
- body, ctx_window, target_fraction=_resolve_prune_target_fraction()
5006
+ body, ctx_window, target_fraction=target_frac, keep_last=keep_last
4976
5007
  )
4977
5008
  monitor.prune_count += 1
4978
- # Re-estimate after pruning
5009
+ # Option 4: Post-prune validation — verify actual reduction
4979
5010
  estimated_tokens = estimate_total_tokens(body)
4980
5011
  monitor.record_request(estimated_tokens)
5012
+ post_util = estimated_tokens / ctx_window
4981
5013
  n_messages = len(body.get("messages", []))
4982
5014
  logger.info(
4983
- "After pruning: ~%d tokens, %d messages",
5015
+ "After pruning: ~%d tokens (%d messages), utilization %.1f%%",
4984
5016
  estimated_tokens,
4985
5017
  n_messages,
5018
+ post_util * 100,
4986
5019
  )
5020
+ # If still above threshold after first prune, do aggressive second pass
5021
+ if post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
5022
+ logger.warning(
5023
+ "POST-PRUNE VALIDATION: still at %.1f%% after prune, doing aggressive pass",
5024
+ post_util * 100,
5025
+ )
5026
+ body = prune_conversation(
5027
+ body, ctx_window, target_fraction=0.35, keep_last=4
5028
+ )
5029
+ monitor.prune_count += 1
5030
+ estimated_tokens = estimate_total_tokens(body)
5031
+ monitor.record_request(estimated_tokens)
5032
+ post_util = estimated_tokens / ctx_window
5033
+ n_messages = len(body.get("messages", []))
5034
+ logger.info(
5035
+ "After aggressive prune: ~%d tokens (%d messages), utilization %.1f%%",
5036
+ estimated_tokens,
5037
+ n_messages,
5038
+ post_util * 100,
5039
+ )
5040
+ # Option 2: Circuit breaker — if 3+ consecutive prunes and still above,
5041
+ # force finalize (drop tools, let model wrap up)
5042
+ if monitor.prune_count >= 3 and post_util >= PROXY_CONTEXT_PRUNE_THRESHOLD:
5043
+ logger.error(
5044
+ "PRUNE CIRCUIT BREAKER: %d consecutive prunes, still at %.1f%%. "
5045
+ "Forcing finalize to prevent death spiral.",
5046
+ monitor.prune_count,
5047
+ post_util * 100,
5048
+ )
5049
+ monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
5050
+ monitor.tool_state_auto_budget_remaining = 1
5051
+ monitor.reset_completion_recovery()
4987
5052
 
4988
5053
  openai_body = build_openai_request(
4989
5054
  body,
@@ -5104,6 +5169,10 @@ async def messages(request: Request):
5104
5169
 
5105
5170
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
5106
5171
  monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
5172
+ # Update last_input_tokens from upstream's actual prompt_tokens
5173
+ upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
5174
+ if upstream_input > 0:
5175
+ monitor.last_input_tokens = upstream_input
5107
5176
  if PROXY_FORCE_NON_STREAM:
5108
5177
  logger.info(
5109
5178
  "FORCED NON-STREAM: served stream response via guarded non-stream path"
@@ -5441,6 +5510,10 @@ async def messages(request: Request):
5441
5510
  # Track output tokens in session monitor
5442
5511
  output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
5443
5512
  monitor.record_response(output_tokens)
5513
+ # Update last_input_tokens from upstream's actual prompt_tokens
5514
+ upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
5515
+ if upstream_input > 0:
5516
+ monitor.last_input_tokens = upstream_input
5444
5517
 
5445
5518
  return anthropic_resp
5446
5519
 
@@ -1861,7 +1861,8 @@ class TestToolTurnControls(unittest.TestCase):
1861
1861
 
1862
1862
  self.assertEqual(openai_1.get("tool_choice"), "required")
1863
1863
  self.assertEqual(openai_2.get("tool_choice"), "required")
1864
- self.assertEqual(openai_3.get("tool_choice"), "auto")
1864
+ # Review phase now keeps required to prevent end-turn escape
1865
+ self.assertEqual(openai_3.get("tool_choice"), "required")
1865
1866
  finally:
1866
1867
  setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
1867
1868
  setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
@@ -1938,7 +1939,8 @@ class TestToolTurnControls(unittest.TestCase):
1938
1939
  }
1939
1940
 
1940
1941
  openai = proxy.build_openai_request(body, monitor)
1941
- self.assertEqual(openai.get("tool_choice"), "auto")
1942
+ # Review phase now keeps required to prevent end-turn escape
1943
+ self.assertEqual(openai.get("tool_choice"), "required")
1942
1944
  self.assertEqual(monitor.tool_turn_phase, "review")
1943
1945
  finally:
1944
1946
  setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
@@ -2067,7 +2069,8 @@ class TestToolTurnControls(unittest.TestCase):
2067
2069
  }
2068
2070
 
2069
2071
  openai = proxy.build_openai_request(body, monitor)
2070
- self.assertEqual(openai.get("tool_choice"), "auto")
2072
+ # Review phase now keeps required to prevent end-turn escape
2073
+ self.assertEqual(openai.get("tool_choice"), "required")
2071
2074
  self.assertEqual(monitor.tool_turn_phase, "review")
2072
2075
  self.assertEqual(monitor.tool_state_review_cycles, 1)
2073
2076
  finally:
@@ -3166,6 +3169,73 @@ class TestToolStarvationBreaker(unittest.TestCase):
3166
3169
  self.assertIn("tools", result)
3167
3170
 
3168
3171
 
3172
+ class TestPruningImprovements(unittest.TestCase):
3173
+ """Tests for pruning death spiral fixes."""
3174
+
3175
+ def test_prune_uses_upstream_tokens_when_higher(self):
3176
+ """Option 1: upstream last_input_tokens used when higher than local estimate."""
3177
+ monitor = proxy.SessionMonitor(context_window=10000)
3178
+ # Simulate upstream reporting higher token count than local estimate
3179
+ monitor.last_input_tokens = 9000 # 90% - above 85% threshold
3180
+ body = {
3181
+ "model": "test",
3182
+ "messages": [
3183
+ {"role": "user", "content": "start"},
3184
+ {"role": "assistant", "content": "ok"},
3185
+ {"role": "user", "content": "a" * 100},
3186
+ {"role": "assistant", "content": "b" * 100},
3187
+ {"role": "user", "content": "c" * 100},
3188
+ {"role": "assistant", "content": "d" * 100},
3189
+ {"role": "user", "content": "e" * 100},
3190
+ {"role": "assistant", "content": "f" * 100},
3191
+ {"role": "user", "content": "g" * 100},
3192
+ {"role": "assistant", "content": "h" * 100},
3193
+ {"role": "user", "content": "continue"},
3194
+ ],
3195
+ }
3196
+ # Local estimate_total_tokens will be much lower than 9000
3197
+ local_est = proxy.estimate_total_tokens(body)
3198
+ self.assertLess(local_est, 9000)
3199
+ # The pruning code should use upstream's 9000 for the decision
3200
+
3201
+ def test_prune_conversation_accepts_keep_last(self):
3202
+ """Option 3: prune_conversation accepts keep_last parameter."""
3203
+ body = {
3204
+ "messages": [
3205
+ {"role": "user", "content": "first"},
3206
+ {"role": "assistant", "content": "a" * 500},
3207
+ {"role": "user", "content": "b" * 500},
3208
+ {"role": "assistant", "content": "c" * 500},
3209
+ {"role": "user", "content": "d" * 500},
3210
+ {"role": "assistant", "content": "e" * 500},
3211
+ {"role": "user", "content": "f" * 500},
3212
+ {"role": "assistant", "content": "g" * 500},
3213
+ {"role": "user", "content": "h" * 500},
3214
+ {"role": "assistant", "content": "i" * 500},
3215
+ {"role": "user", "content": "last"},
3216
+ ],
3217
+ }
3218
+ # With keep_last=4, more middle messages should be prunable
3219
+ result_8 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=8)
3220
+ result_4 = proxy.prune_conversation(dict(body), 2000, target_fraction=0.50, keep_last=4)
3221
+ # keep_last=4 should result in fewer or equal messages
3222
+ self.assertLessEqual(
3223
+ len(result_4.get("messages", [])),
3224
+ len(result_8.get("messages", [])),
3225
+ )
3226
+
3227
+ def test_prune_circuit_breaker_sets_finalize(self):
3228
+ """Option 2: circuit breaker forces finalize after repeated prunes."""
3229
+ monitor = proxy.SessionMonitor(context_window=10000)
3230
+ monitor.prune_count = 3 # Already pruned 3 times
3231
+ # After the pruning code runs and still exceeds threshold,
3232
+ # it should set finalize phase
3233
+ monitor.set_tool_turn_phase("act", reason="test")
3234
+ # Simulate the circuit breaker logic
3235
+ monitor.set_tool_turn_phase("finalize", reason="prune_circuit_breaker")
3236
+ self.assertEqual(monitor.tool_turn_phase, "finalize")
3237
+
3238
+
3169
3239
  if __name__ == "__main__":
3170
3240
  unittest.main()
3171
3241