headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
@@ -0,0 +1,1579 @@
1
+ """Tool Output Intelligence Network (TOIN) - Cross-user learning for compression.
2
+
3
+ TOIN aggregates anonymized compression patterns across all Headroom users to
4
+ create a network effect: every user's compression decisions improve the
5
+ recommendations for everyone.
6
+
7
+ Key concepts:
8
+ - ToolPattern: Aggregated intelligence about a tool type (by structure hash)
9
+ - CompressionHint: Recommendations for how to compress a specific tool output
10
+ - ToolIntelligenceNetwork: Central aggregator that learns from all users
11
+
12
+ How it works:
13
+ 1. When SmartCrusher compresses data, it records the outcome via telemetry
14
+ 2. When LLM retrieves compressed data, TOIN tracks what was needed
15
+ 3. TOIN learns: "For tools with structure X, retrieval rate is high when
16
+ compressing field Y - preserve it"
17
+ 4. Next time: SmartCrusher asks TOIN for hints before compressing
18
+
19
+ Privacy:
20
+ - No actual data values are stored
21
+ - Tool names are structure hashes
22
+ - Field names are SHA256[:8] hashes
23
+ - No user identifiers
24
+
25
+ Network Effect:
26
+ - More users → more compression events → better recommendations
27
+ - Cross-user patterns reveal universal tool behaviors
28
+ - Federated learning: aggregate patterns, not data
29
+
30
+ Usage:
31
+ from headroom.telemetry.toin import get_toin
32
+
33
+ # Before compression, get recommendations
34
+ hint = get_toin().get_recommendation(tool_signature, query_context)
35
+
36
+ # Apply hint
37
+ if hint.skip_compression:
38
+ return original_data
39
+ config.preserve_fields = hint.preserve_fields
40
+ config.max_items = hint.max_items
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import hashlib
46
+ import json
47
+ import logging
48
+ import threading
49
+ import time
50
+ from collections.abc import Callable
51
+ from dataclasses import dataclass, field
52
+ from pathlib import Path
53
+ from typing import Any, ClassVar, Literal
54
+
55
+ from .models import FieldSemantics, ToolSignature
56
+
57
+ logger = logging.getLogger(__name__)
58
+
59
+ # LOW FIX #22: Define callback types for metrics/monitoring hooks
60
+ # These allow users to plug in their own metrics collection (Prometheus, StatsD, etc.)
61
+ MetricsCallback = Callable[[str, dict[str, Any]], None] # (event_name, event_data) -> None
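+ # Illustrative sketch (not shipped with the package): a minimal MetricsCallback that counts
+ # TOIN events in memory. Any callable with this (event_name, event_data) signature can be
+ # passed as TOINConfig.metrics_callback; a production setup would typically forward events
+ # to Prometheus, StatsD, or OpenTelemetry instead.
+ from collections import Counter
+ from typing import Any
+
+ _toin_event_counts = Counter()
+
+ def count_toin_events(event_name: str, event_data: dict[str, Any]) -> None:
+     # Tally events such as "toin.compression", "toin.retrieval", "toin.recommendation".
+     _toin_event_counts[event_name] += 1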
62
+
63
+
64
+ @dataclass
65
+ class ToolPattern:
66
+ """Aggregated intelligence about a tool type across all users.
67
+
68
+ This is the core TOIN data structure. It represents everything we've
69
+ learned about how to compress outputs from tools with a specific structure.
70
+ """
71
+
72
+ tool_signature_hash: str
73
+
74
+ # === Compression Statistics ===
75
+ total_compressions: int = 0
76
+ total_items_seen: int = 0
77
+ total_items_kept: int = 0
78
+ avg_compression_ratio: float = 0.0
79
+ avg_token_reduction: float = 0.0
80
+
81
+ # === Retrieval Statistics ===
82
+ total_retrievals: int = 0
83
+ full_retrievals: int = 0 # Retrieved everything
84
+ search_retrievals: int = 0 # Used search filter
85
+
86
+ @property
87
+ def retrieval_rate(self) -> float:
88
+ """Fraction of compressions that triggered retrieval."""
89
+ if self.total_compressions == 0:
90
+ return 0.0
91
+ return self.total_retrievals / self.total_compressions
92
+
93
+ @property
94
+ def full_retrieval_rate(self) -> float:
95
+ """Fraction of retrievals that were full (not search)."""
96
+ if self.total_retrievals == 0:
97
+ return 0.0
98
+ return self.full_retrievals / self.total_retrievals
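+ # Worked example (illustrative numbers): with total_compressions=100, total_retrievals=30
+ # and full_retrievals=24, retrieval_rate is 30/100 = 0.30 and full_retrieval_rate is
+ # 24/30 = 0.80 -- retrievals are fairly rare overall, but when they happen they are
+ # usually full retrievals, which is the stronger "compressed too much" signal.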
99
+
100
+ # === Learned Patterns ===
101
+ # Fields that are frequently retrieved (should preserve)
102
+ commonly_retrieved_fields: list[str] = field(default_factory=list)
103
+ field_retrieval_frequency: dict[str, int] = field(default_factory=dict)
104
+
105
+ # Query patterns that trigger retrieval
106
+ common_query_patterns: list[str] = field(default_factory=list)
107
+ # MEDIUM FIX #10: Track query pattern frequency to keep most common, not just recent
108
+ query_pattern_frequency: dict[str, int] = field(default_factory=dict)
109
+
110
+ # Best compression strategy for this tool type
111
+ optimal_strategy: str = "default"
112
+ strategy_success_rates: dict[str, float] = field(default_factory=dict)
113
+
114
+ # === Learned Recommendations ===
115
+ optimal_max_items: int = 20
116
+ skip_compression_recommended: bool = False
117
+ preserve_fields: list[str] = field(default_factory=list)
118
+
119
+ # === Field-Level Semantics (TOIN Evolution) ===
120
+ # Learned semantic types for each field based on retrieval patterns
121
+ # This enables zero-latency signal detection without hardcoded patterns
122
+ field_semantics: dict[str, FieldSemantics] = field(default_factory=dict)
123
+
124
+ # === Confidence ===
125
+ sample_size: int = 0
126
+ user_count: int = 0 # Number of unique users (anonymized)
127
+ confidence: float = 0.0 # 0.0 = no data, 1.0 = high confidence
128
+ last_updated: float = 0.0
129
+
130
+ # === Instance Tracking (for user_count) ===
131
+ # Hashed instance IDs of users who have contributed to this pattern
132
+ # Limited to avoid unbounded growth (for serialization)
133
+ _seen_instance_hashes: list[str] = field(default_factory=list)
134
+ # FIX: Separate set for ALL seen instances to prevent double-counting
135
+ # CRITICAL FIX #1: Capped at MAX_SEEN_INSTANCES to prevent OOM with millions of users.
136
+ # When cap is reached, we rely on user_count for accurate counting and
137
+ # accept some potential double-counting for new users (negligible at scale).
138
+ _all_seen_instances: set[str] = field(default_factory=set)
139
+
140
+ # CRITICAL FIX: Track whether instance tracking was truncated during serialization
141
+ # If True, we know some users were lost and should be conservative about user_count
142
+ _tracking_truncated: bool = False
143
+
144
+ # CRITICAL FIX #1: Maximum entries in _all_seen_instances to prevent OOM
145
+ # This is a class constant, not a field (not serialized)
146
+ MAX_SEEN_INSTANCES: ClassVar[int] = 10000
147
+
148
+ def to_dict(self) -> dict[str, Any]:
149
+ """Convert to dictionary for serialization."""
150
+ return {
151
+ "tool_signature_hash": self.tool_signature_hash,
152
+ "total_compressions": self.total_compressions,
153
+ "total_items_seen": self.total_items_seen,
154
+ "total_items_kept": self.total_items_kept,
155
+ "avg_compression_ratio": self.avg_compression_ratio,
156
+ "avg_token_reduction": self.avg_token_reduction,
157
+ "total_retrievals": self.total_retrievals,
158
+ "full_retrievals": self.full_retrievals,
159
+ "search_retrievals": self.search_retrievals,
160
+ "retrieval_rate": self.retrieval_rate,
161
+ "full_retrieval_rate": self.full_retrieval_rate,
162
+ "commonly_retrieved_fields": self.commonly_retrieved_fields,
163
+ "field_retrieval_frequency": self.field_retrieval_frequency,
164
+ "common_query_patterns": self.common_query_patterns,
165
+ "query_pattern_frequency": self.query_pattern_frequency,
166
+ "optimal_strategy": self.optimal_strategy,
167
+ "strategy_success_rates": self.strategy_success_rates,
168
+ "optimal_max_items": self.optimal_max_items,
169
+ "skip_compression_recommended": self.skip_compression_recommended,
170
+ "preserve_fields": self.preserve_fields,
171
+ # Field-level semantics (TOIN Evolution)
172
+ "field_semantics": {k: v.to_dict() for k, v in self.field_semantics.items()},
173
+ "sample_size": self.sample_size,
174
+ "user_count": self.user_count,
175
+ "confidence": self.confidence,
176
+ "last_updated": self.last_updated,
177
+ # Serialize instance hashes (limited to 100 for bounded storage)
178
+ "seen_instance_hashes": self._seen_instance_hashes[:100],
179
+ # CRITICAL FIX: Track if truncation occurred during serialization
180
+ # This tells from_dict() that some users were lost and prevents double-counting
181
+ "tracking_truncated": (
182
+ self._tracking_truncated
183
+ or self.user_count > len(self._seen_instance_hashes)
184
+ or len(self._all_seen_instances) > 100
185
+ ),
186
+ }
187
+
188
+ @classmethod
189
+ def from_dict(cls, data: dict[str, Any]) -> ToolPattern:
190
+ """Create from dictionary."""
191
+ # Filter to only valid fields
192
+ valid_fields = {
193
+ "tool_signature_hash",
194
+ "total_compressions",
195
+ "total_items_seen",
196
+ "total_items_kept",
197
+ "avg_compression_ratio",
198
+ "avg_token_reduction",
199
+ "total_retrievals",
200
+ "full_retrievals",
201
+ "search_retrievals",
202
+ "commonly_retrieved_fields",
203
+ "field_retrieval_frequency",
204
+ "common_query_patterns",
205
+ "query_pattern_frequency",
206
+ "optimal_strategy",
207
+ "strategy_success_rates",
208
+ "optimal_max_items",
209
+ "skip_compression_recommended",
210
+ "preserve_fields",
211
+ "sample_size",
212
+ "user_count",
213
+ "confidence",
214
+ "last_updated",
215
+ }
216
+ filtered = {k: v for k, v in data.items() if k in valid_fields}
217
+
218
+ # Handle seen_instance_hashes (serialized without underscore prefix)
219
+ seen_hashes = data.get("seen_instance_hashes", [])
220
+
221
+ pattern = cls(**filtered)
222
+ pattern._seen_instance_hashes = seen_hashes[:100] # Limit on load
223
+
224
+ # CRITICAL FIX: Populate _all_seen_instances from loaded hashes
225
+ # This prevents double-counting after restart - without this, the same
226
+ # instances would be counted again because the lookup set was empty
227
+ pattern._all_seen_instances = set(pattern._seen_instance_hashes)
228
+
229
+ # CRITICAL FIX: Restore truncation flag to prevent double-counting
230
+ # If truncated, we know some users were lost in serialization
231
+ pattern._tracking_truncated = data.get("tracking_truncated", False)
232
+ # Also detect truncation if user_count > loaded hashes (backward compat)
233
+ if pattern.user_count > len(pattern._seen_instance_hashes):
234
+ pattern._tracking_truncated = True
235
+
236
+ # Load field semantics (TOIN Evolution)
237
+ field_semantics_data = data.get("field_semantics", {})
238
+ if field_semantics_data:
239
+ pattern.field_semantics = {
240
+ k: FieldSemantics.from_dict(v) for k, v in field_semantics_data.items()
241
+ }
242
+
243
+ return pattern
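+ # Illustrative round-trip sketch (not shipped code): patterns serialize to plain dicts,
+ # which is how they are persisted and exchanged between instances. Field values invented.
+ _demo = ToolPattern(
+     tool_signature_hash="deadbeefcafe0123",
+     total_compressions=12,
+     total_retrievals=3,
+     full_retrievals=1,
+     search_retrievals=2,
+ )
+ _restored = ToolPattern.from_dict(_demo.to_dict())
+ assert _restored.tool_signature_hash == _demo.tool_signature_hash
+ assert _restored.retrieval_rate == 0.25      # 3 retrievals / 12 compressions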
244
+
245
+
246
+ @dataclass
247
+ class CompressionHint:
248
+ """Recommendation for how to compress a specific tool output.
249
+
250
+ This is what TOIN returns when asked for advice before compression.
251
+ """
252
+
253
+ # Should we compress at all?
254
+ skip_compression: bool = False
255
+
256
+ # How aggressively to compress
257
+ max_items: int = 20
258
+ compression_level: Literal["none", "conservative", "moderate", "aggressive"] = "moderate"
259
+
260
+ # Which fields to preserve (never remove)
261
+ preserve_fields: list[str] = field(default_factory=list)
262
+
263
+ # Which strategy to use
264
+ recommended_strategy: str = "default"
265
+
266
+ # Why this recommendation
267
+ reason: str = ""
268
+ confidence: float = 0.0
269
+
270
+ # Source of recommendation
271
+ source: Literal["network", "local", "default"] = "default"
272
+ based_on_samples: int = 0
273
+
274
+ # === TOIN Evolution: Learned Field Semantics ===
275
+ # These enable zero-latency signal detection in SmartCrusher.
276
+ # field_hash -> FieldSemantics (learned semantic type, important values, etc.)
277
+ field_semantics: dict[str, FieldSemantics] = field(default_factory=dict)
278
+
279
+
280
+ @dataclass
281
+ class TOINConfig:
282
+ """Configuration for the Tool Output Intelligence Network."""
283
+
284
+ # Enable/disable TOIN
285
+ enabled: bool = True
286
+
287
+ # Storage
288
+ storage_path: str | None = None # Path to store TOIN data
289
+ auto_save_interval: int = 600 # Auto-save every 10 minutes
290
+
291
+ # Network learning thresholds
292
+ min_samples_for_recommendation: int = 10
293
+ min_users_for_network_effect: int = 3
294
+
295
+ # Recommendation thresholds
296
+ high_retrieval_threshold: float = 0.5 # Above this = compress less
297
+ medium_retrieval_threshold: float = 0.2 # Between this and the high threshold = moderate compression
298
+
299
+ # Privacy
300
+ anonymize_queries: bool = True
301
+ max_query_patterns: int = 10
302
+
303
+ # LOW FIX #22: Metrics/monitoring hooks
304
+ # Callback for emitting metrics events. Signature: (event_name, event_data) -> None
305
+ # Event names: "toin.compression", "toin.retrieval", "toin.recommendation", "toin.save"
306
+ # This allows integration with Prometheus, StatsD, OpenTelemetry, etc.
307
+ metrics_callback: MetricsCallback | None = None
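+ # Illustrative configuration sketch (not shipped with the package): wiring TOIN with
+ # persistent storage and the metrics hook sketched above. The path and values below are
+ # examples, not defaults mandated by the library.
+ from headroom.telemetry.toin import TOINConfig, ToolIntelligenceNetwork
+
+ config = TOINConfig(
+     storage_path="/tmp/toin_patterns.json",      # hypothetical location for persisted patterns
+     auto_save_interval=600,                      # save at most every 10 minutes
+     min_samples_for_recommendation=10,
+     metrics_callback=count_toin_events,          # callback from the sketch above
+ )
+ toin = ToolIntelligenceNetwork(config)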
308
+
309
+
310
+ class ToolIntelligenceNetwork:
311
+ """Aggregates tool patterns across all Headroom users.
312
+
313
+ This is the brain of TOIN. It maintains a database of learned patterns
314
+ for different tool types and provides recommendations based on
315
+ cross-user intelligence.
316
+
317
+ Thread-safe for concurrent access.
318
+ """
319
+
320
+ def __init__(self, config: TOINConfig | None = None):
321
+ """Initialize TOIN.
322
+
323
+ Args:
324
+ config: Configuration options.
325
+ """
326
+ self._config = config or TOINConfig()
327
+ self._lock = threading.RLock() # RLock for reentrant locking (save calls export_patterns)
328
+
329
+ # Pattern database: structure_hash -> ToolPattern
330
+ self._patterns: dict[str, ToolPattern] = {}
331
+
332
+ # Instance ID for user counting (anonymized)
333
+ # IMPORTANT: Must be STABLE across restarts to avoid false user count inflation
334
+ # Derive from storage path if available, otherwise use machine-specific ID
335
+ self._instance_id = self._generate_stable_instance_id()
336
+
337
+ # Tracking
338
+ self._last_save_time = time.time()
339
+ self._dirty = False
340
+
341
+ # Load existing data
342
+ if self._config.storage_path:
343
+ self._load_from_disk()
344
+
345
+ def _generate_stable_instance_id(self) -> str:
346
+ """Generate a stable instance ID that doesn't change across restarts.
347
+
348
+ Uses storage path if available, otherwise uses machine-specific info.
349
+ This prevents false user count inflation when reloading from disk.
350
+
351
+ HIGH FIX: Instance ID collision risk
352
+ Previously used SHA256[:8] (32 bits) which has 50% collision probability
353
+ at sqrt(2^32) ≈ 65,536 users (birthday paradox). Increased to SHA256[:16]
354
+ (64 bits) for 50% collision at ~4 billion users, which is acceptable.
355
+ """
356
+ if self._config.storage_path:
357
+ # Derive from storage path - same path = same instance
358
+ return hashlib.sha256(self._config.storage_path.encode()).hexdigest()[
359
+ :16
360
+ ] # HIGH FIX: 64 bits instead of 32
361
+ else:
362
+ # No storage - use a combination of hostname and process info
363
+ # This is less stable but better than pure random
364
+ import os
365
+ import socket
366
+
367
+ machine_info = (
368
+ f"{socket.gethostname()}:{os.getuid() if hasattr(os, 'getuid') else 'unknown'}"
369
+ )
370
+ return hashlib.sha256(machine_info.encode()).hexdigest()[:16] # HIGH FIX: 64 bits
371
+
372
+ def _emit_metric(self, event_name: str, event_data: dict[str, Any]) -> None:
373
+ """Emit a metrics event via the configured callback.
374
+
375
+ LOW FIX #22: Provides monitoring integration for external metrics systems.
376
+
377
+ Args:
378
+ event_name: Name of the event (e.g., "toin.compression").
379
+ event_data: Dictionary of event data to emit.
380
+ """
381
+ if self._config.metrics_callback is not None:
382
+ try:
383
+ self._config.metrics_callback(event_name, event_data)
384
+ except Exception as e:
385
+ # Never let metrics callback failures break TOIN
386
+ logger.debug(f"Metrics callback failed for {event_name}: {e}")
387
+
388
+ def record_compression(
389
+ self,
390
+ tool_signature: ToolSignature,
391
+ original_count: int,
392
+ compressed_count: int,
393
+ original_tokens: int,
394
+ compressed_tokens: int,
395
+ strategy: str,
396
+ query_context: str | None = None,
397
+ items: list[dict[str, Any]] | None = None,
398
+ ) -> None:
399
+ """Record a compression event.
400
+
401
+ Called after SmartCrusher compresses data. Updates the pattern
402
+ for this tool type.
403
+
404
+ TOIN Evolution: When items are provided, we capture field statistics
405
+ for learning semantic types (uniqueness, default values, etc.).
406
+
407
+ Args:
408
+ tool_signature: Signature of the tool output structure.
409
+ original_count: Original number of items.
410
+ compressed_count: Number of items after compression.
411
+ original_tokens: Original token count.
412
+ compressed_tokens: Compressed token count.
413
+ strategy: Compression strategy used.
414
+ query_context: Optional user query that triggered this tool call.
415
+ items: Optional list of items being compressed for field-level learning.
416
+ """
417
+ # HIGH FIX: Check enabled FIRST to avoid computing structure_hash if disabled
418
+ # This saves CPU when TOIN is turned off
419
+ if not self._config.enabled:
420
+ return
421
+
422
+ # Computing structure_hash can be expensive for large structures
423
+ sig_hash = tool_signature.structure_hash
424
+
425
+ # LOW FIX #22: Emit compression metric
426
+ self._emit_metric(
427
+ "toin.compression",
428
+ {
429
+ "signature_hash": sig_hash,
430
+ "original_count": original_count,
431
+ "compressed_count": compressed_count,
432
+ "original_tokens": original_tokens,
433
+ "compressed_tokens": compressed_tokens,
434
+ "strategy": strategy,
435
+ "compression_ratio": compressed_count / original_count if original_count > 0 else 0,
436
+ },
437
+ )
438
+
439
+ with self._lock:
440
+ # Get or create pattern
441
+ if sig_hash not in self._patterns:
442
+ self._patterns[sig_hash] = ToolPattern(tool_signature_hash=sig_hash)
443
+
444
+ pattern = self._patterns[sig_hash]
445
+
446
+ # Update compression stats
447
+ pattern.total_compressions += 1
448
+ pattern.total_items_seen += original_count
449
+ pattern.total_items_kept += compressed_count
450
+ pattern.sample_size += 1
451
+
452
+ # Update rolling averages
453
+ n = pattern.total_compressions
454
+ compression_ratio = compressed_count / original_count if original_count > 0 else 0.0
455
+ token_reduction = (
456
+ 1 - (compressed_tokens / original_tokens) if original_tokens > 0 else 0.0
457
+ )
458
+
459
+ pattern.avg_compression_ratio = (
460
+ pattern.avg_compression_ratio * (n - 1) + compression_ratio
461
+ ) / n
462
+ pattern.avg_token_reduction = (
463
+ pattern.avg_token_reduction * (n - 1) + token_reduction
464
+ ) / n
465
+
466
+ # Update strategy stats
467
+ if strategy not in pattern.strategy_success_rates:
468
+ pattern.strategy_success_rates[strategy] = 1.0 # Start optimistic
469
+ else:
470
+ # Give a small boost on every compression event that used this strategy
471
+ # This counteracts the penalty from record_retrieval() and prevents
472
+ # all strategies from trending to 0.0 over time (one-way ratchet fix)
473
+ # The boost is small (0.02) because retrieval penalties are larger (0.05-0.15)
474
+ # This means strategies that cause retrievals will still trend down
475
+ current_rate = pattern.strategy_success_rates[strategy]
476
+ pattern.strategy_success_rates[strategy] = min(1.0, current_rate + 0.02)
477
+
478
+ # HIGH FIX: Bound strategy_success_rates to prevent unbounded growth
479
+ # Keep top 20 strategies by success rate
480
+ if len(pattern.strategy_success_rates) > 20:
481
+ sorted_strategies = sorted(
482
+ pattern.strategy_success_rates.items(),
483
+ key=lambda x: x[1],
484
+ reverse=True,
485
+ )[:20]
486
+ pattern.strategy_success_rates = dict(sorted_strategies)
487
+
488
+ # Track unique users via instance_id
489
+ # FIX: Use _all_seen_instances set for lookup to prevent double-counting
490
+ # after the storage list hits its cap
491
+ # CRITICAL FIX #1: Check cap before adding to prevent OOM
492
+ if self._instance_id not in pattern._all_seen_instances:
493
+ # CRITICAL FIX: Check if we can verify this is a new user
494
+ # If tracking was truncated (users lost after restart), we can only
495
+ # count new users if we can add them to _all_seen_instances for dedup
496
+ can_track = len(pattern._all_seen_instances) < ToolPattern.MAX_SEEN_INSTANCES
497
+
498
+ if can_track:
499
+ # Add to the lookup set - we can verify this is new
500
+ pattern._all_seen_instances.add(self._instance_id)
501
+ # Also add to storage list (capped at 100 for serialization)
502
+ if len(pattern._seen_instance_hashes) < 100:
503
+ pattern._seen_instance_hashes.append(self._instance_id)
504
+ # Safe to increment user_count - we verified it's new
505
+ pattern.user_count += 1
506
+ elif not pattern._tracking_truncated:
507
+ # Tracking set is full but we weren't truncated before
508
+ # This is a truly new user beyond our tracking capacity
509
+ pattern.user_count += 1
510
+ # else: Can't verify if new, skip incrementing to prevent double-count
511
+
512
+ # Track query context patterns for learning (privacy-preserving)
513
+ if query_context and len(query_context) >= 3:
514
+ # Normalize and anonymize: extract keywords, remove values
515
+ query_pattern = self._anonymize_query_pattern(query_context)
516
+ if query_pattern:
517
+ # MEDIUM FIX #10: Track frequency to keep most common patterns
518
+ pattern.query_pattern_frequency[query_pattern] = (
519
+ pattern.query_pattern_frequency.get(query_pattern, 0) + 1
520
+ )
521
+ # Update the list to contain top patterns by frequency
522
+ if query_pattern not in pattern.common_query_patterns:
523
+ pattern.common_query_patterns.append(query_pattern)
524
+ # Keep only the most common patterns (by frequency)
525
+ if len(pattern.common_query_patterns) > self._config.max_query_patterns:
526
+ pattern.common_query_patterns = sorted(
527
+ pattern.common_query_patterns,
528
+ key=lambda p: pattern.query_pattern_frequency.get(p, 0),
529
+ reverse=True,
530
+ )[: self._config.max_query_patterns]
531
+ # Also limit the frequency dict
532
+ if len(pattern.query_pattern_frequency) > self._config.max_query_patterns * 2:
533
+ top_patterns = sorted(
534
+ pattern.query_pattern_frequency.items(),
535
+ key=lambda x: x[1],
536
+ reverse=True,
537
+ )[: self._config.max_query_patterns * 2]
538
+ pattern.query_pattern_frequency = dict(top_patterns)
539
+
540
+ # Periodically update recommendations even without retrievals
541
+ # This ensures optimal_strategy is updated based on success rates
542
+ if pattern.total_compressions % 10 == 0:
543
+ self._update_recommendations(pattern)
544
+
545
+ # === TOIN Evolution: Field Statistics for Semantic Learning ===
546
+ # Capture field-level statistics to learn default values and uniqueness
547
+ if items:
548
+ self._update_field_statistics(pattern, items)
549
+
550
+ pattern.last_updated = time.time()
551
+ pattern.confidence = self._calculate_confidence(pattern)
552
+ self._dirty = True
553
+
554
+ # Auto-save if needed (outside lock)
555
+ self._maybe_auto_save()
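+ # Illustrative usage sketch, continuing the configuration above (not shipped code).
+ # `sig` is assumed to be a ToolSignature built from the tool output's structure
+ # (defined in headroom/telemetry/models.py, not shown here); all numbers are made up.
+ toin.record_compression(
+     tool_signature=sig,
+     original_count=120,            # items in the raw tool output
+     compressed_count=20,           # items kept after compression
+     original_tokens=9000,
+     compressed_tokens=1500,
+     strategy="top_k",              # illustrative strategy name
+     query_context="status:error in payments service",
+ )
+ # Effect on the ToolPattern for this structure: rolling averages update via
+ # avg_new = (avg_old * (n - 1) + x) / n, the strategy's success rate is nudged up
+ # by 0.02, and the anonymized query pattern ("status:*" ...) is counted.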
556
+
557
+ def _update_field_statistics(
558
+ self,
559
+ pattern: ToolPattern,
560
+ items: list[dict[str, Any]],
561
+ ) -> None:
562
+ """Update field statistics from compression items.
563
+
564
+ Captures uniqueness, default values, and value distribution for
565
+ learning field semantic types.
566
+
567
+ Args:
568
+ pattern: ToolPattern to update.
569
+ items: Items being compressed.
570
+ """
571
+ if not items:
572
+ return
573
+
574
+ # Analyze field statistics (sample up to 100 items to limit CPU)
575
+ sample_items = items[:100] if len(items) > 100 else items
576
+
577
+ # Collect values for each field
578
+ field_values: dict[str, list[str]] = {} # field_hash -> list of value_hashes
579
+
580
+ for item in sample_items:
581
+ if not isinstance(item, dict):
582
+ continue
583
+
584
+ for field_name, value in item.items():
585
+ field_hash = self._hash_field_name(field_name)
586
+ value_hash = self._hash_value(value)
587
+
588
+ if field_hash not in field_values:
589
+ field_values[field_hash] = []
590
+ field_values[field_hash].append(value_hash)
591
+
592
+ # Update FieldSemantics with statistics
593
+ for field_hash, values in field_values.items():
594
+ if not values:
595
+ continue
596
+
597
+ # Get or create FieldSemantics
598
+ if field_hash not in pattern.field_semantics:
599
+ pattern.field_semantics[field_hash] = FieldSemantics(field_hash=field_hash)
600
+
601
+ field_sem = pattern.field_semantics[field_hash]
602
+
603
+ # Calculate statistics
604
+ unique_values = len(set(values))
605
+ total_values = len(values)
606
+
607
+ # Find most common value
608
+ from collections import Counter
609
+
610
+ value_counts = Counter(values)
611
+ most_common_value, most_common_count = value_counts.most_common(1)[0]
612
+ most_common_frequency = most_common_count / total_values if total_values > 0 else 0.0
613
+
614
+ # Record compression stats
615
+ field_sem.record_compression_stats(
616
+ unique_values=unique_values,
617
+ total_values=total_values,
618
+ most_common_value_hash=most_common_value,
619
+ most_common_frequency=most_common_frequency,
620
+ )
621
+
622
+ # Bound field_semantics to prevent unbounded growth (max 100 fields)
623
+ if len(pattern.field_semantics) > 100:
624
+ # Keep fields with highest activity (retrieval + compression count)
625
+ sorted_fields = sorted(
626
+ pattern.field_semantics.items(),
627
+ key=lambda x: x[1].retrieval_count + x[1].compression_count,
628
+ reverse=True,
629
+ )[:100]
630
+ pattern.field_semantics = dict(sorted_fields)
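+ # Illustrative sketch (not shipped code) of the per-field statistics gathered above:
+ # given hashed values of one field across the sampled items, uniqueness and the dominant
+ # value's share are what feed FieldSemantics.record_compression_stats(). Values invented;
+ # real value hashes are 8-hex-char SHA-256 prefixes.
+ from collections import Counter
+
+ hashed_values = ["1f3a9c0d", "1f3a9c0d", "9bc24e71", "1f3a9c0d", "1f3a9c0d"]
+ counts = Counter(hashed_values)
+ unique_values = len(counts)                                        # 2 distinct values
+ top_hash, top_count = counts.most_common(1)[0]
+ most_common_frequency = top_count / len(hashed_values)             # 0.8 -> looks like a default value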
631
+
632
+ def record_retrieval(
633
+ self,
634
+ tool_signature_hash: str,
635
+ retrieval_type: str,
636
+ query: str | None = None,
637
+ query_fields: list[str] | None = None,
638
+ strategy: str | None = None,
639
+ retrieved_items: list[dict[str, Any]] | None = None,
640
+ ) -> None:
641
+ """Record a retrieval event.
642
+
643
+ Called when LLM retrieves compressed content. This is the key
644
+ feedback signal - it means compression was too aggressive.
645
+
646
+ TOIN Evolution: When retrieved_items are provided, we learn field
647
+ semantics from the values. This enables zero-latency signal detection.
648
+
649
+ Args:
650
+ tool_signature_hash: Hash of the tool signature.
651
+ retrieval_type: "full" or "search".
652
+ query: Optional search query (will be anonymized).
653
+ query_fields: Fields mentioned in query (will be hashed).
654
+ strategy: Compression strategy that was used (for success rate tracking).
655
+ retrieved_items: Optional list of retrieved items for field-level learning.
656
+ """
657
+ if not self._config.enabled:
658
+ return
659
+
660
+ # LOW FIX #22: Emit retrieval metric
661
+ self._emit_metric(
662
+ "toin.retrieval",
663
+ {
664
+ "signature_hash": tool_signature_hash,
665
+ "retrieval_type": retrieval_type,
666
+ "has_query": query is not None,
667
+ "query_fields_count": len(query_fields) if query_fields else 0,
668
+ "strategy": strategy,
669
+ },
670
+ )
671
+
672
+ with self._lock:
673
+ if tool_signature_hash not in self._patterns:
674
+ # First time seeing this tool via retrieval
675
+ self._patterns[tool_signature_hash] = ToolPattern(
676
+ tool_signature_hash=tool_signature_hash
677
+ )
678
+
679
+ pattern = self._patterns[tool_signature_hash]
680
+
681
+ # Update retrieval stats
682
+ pattern.total_retrievals += 1
683
+ if retrieval_type == "full":
684
+ pattern.full_retrievals += 1
685
+ else:
686
+ pattern.search_retrievals += 1
687
+
688
+ # Update strategy success rates - retrieval means the strategy was TOO aggressive
689
+ # Decrease success rate for this strategy
690
+ if strategy and strategy in pattern.strategy_success_rates:
691
+ # Exponential moving average: penalize strategies that trigger retrieval
692
+ # Full retrievals are worse than search retrievals
693
+ penalty = 0.15 if retrieval_type == "full" else 0.05
694
+ current_rate = pattern.strategy_success_rates[strategy]
695
+ pattern.strategy_success_rates[strategy] = max(0.0, current_rate - penalty)
696
+
697
+ # Track queried fields (anonymized)
698
+ if query_fields:
699
+ for field_name in query_fields:
700
+ field_hash = self._hash_field_name(field_name)
701
+ pattern.field_retrieval_frequency[field_hash] = (
702
+ pattern.field_retrieval_frequency.get(field_hash, 0) + 1
703
+ )
704
+
705
+ # Update commonly retrieved fields
706
+ if field_hash not in pattern.commonly_retrieved_fields:
707
+ # Add if frequently retrieved (check count from dict)
708
+ freq = pattern.field_retrieval_frequency.get(field_hash, 0)
709
+ if freq >= 3:
710
+ pattern.commonly_retrieved_fields.append(field_hash)
711
+ # HIGH: Limit commonly_retrieved_fields to prevent unbounded growth
712
+ if len(pattern.commonly_retrieved_fields) > 20:
713
+ # Keep only the most frequently retrieved fields
714
+ sorted_fields = sorted(
715
+ pattern.commonly_retrieved_fields,
716
+ key=lambda f: pattern.field_retrieval_frequency.get(f, 0),
717
+ reverse=True,
718
+ )
719
+ pattern.commonly_retrieved_fields = sorted_fields[:20]
720
+
721
+ # HIGH: Limit field_retrieval_frequency dict to prevent unbounded growth
722
+ if len(pattern.field_retrieval_frequency) > 100:
723
+ sorted_freq_items = sorted(
724
+ pattern.field_retrieval_frequency.items(),
725
+ key=lambda x: x[1],
726
+ reverse=True,
727
+ )[:100]
728
+ pattern.field_retrieval_frequency = dict(sorted_freq_items)
729
+
730
+ # Track query patterns (anonymized)
731
+ if query and self._config.anonymize_queries:
732
+ query_pattern = self._anonymize_query_pattern(query)
733
+ if query_pattern:
734
+ # MEDIUM FIX #10: Track frequency to keep most common patterns
735
+ pattern.query_pattern_frequency[query_pattern] = (
736
+ pattern.query_pattern_frequency.get(query_pattern, 0) + 1
737
+ )
738
+ if query_pattern not in pattern.common_query_patterns:
739
+ pattern.common_query_patterns.append(query_pattern)
740
+ # Keep only the most common patterns (by frequency)
741
+ if len(pattern.common_query_patterns) > self._config.max_query_patterns:
742
+ pattern.common_query_patterns = sorted(
743
+ pattern.common_query_patterns,
744
+ key=lambda p: pattern.query_pattern_frequency.get(p, 0),
745
+ reverse=True,
746
+ )[: self._config.max_query_patterns]
747
+
748
+ # === TOIN Evolution: Field-Level Semantic Learning ===
749
+ # Learn from retrieved items to build zero-latency signal detection
750
+ if retrieved_items:
751
+ # Extract query operator from query string (for learning)
752
+ query_operator = self._extract_query_operator(query) if query else "="
753
+
754
+ for item in retrieved_items:
755
+ if not isinstance(item, dict):
756
+ continue
757
+
758
+ for field_name, value in item.items():
759
+ field_hash = self._hash_field_name(field_name)
760
+
761
+ # Get or create FieldSemantics for this field
762
+ if field_hash not in pattern.field_semantics:
763
+ pattern.field_semantics[field_hash] = FieldSemantics(
764
+ field_hash=field_hash
765
+ )
766
+
767
+ field_sem = pattern.field_semantics[field_hash]
768
+
769
+ # Hash the value for privacy
770
+ value_hash = self._hash_value(value)
771
+
772
+ # Record this retrieval
773
+ field_sem.record_retrieval_value(value_hash, query_operator)
774
+
775
+ # Periodically infer types (every 5 retrievals to save CPU)
776
+ if pattern.total_retrievals % 5 == 0:
777
+ for field_sem in pattern.field_semantics.values():
778
+ if field_sem.retrieval_count >= 3: # Need minimum data
779
+ field_sem.infer_type()
780
+
781
+ # Bound field_semantics to prevent unbounded growth (max 100 fields)
782
+ if len(pattern.field_semantics) > 100:
783
+ # Keep fields with highest retrieval counts
784
+ sorted_semantics = sorted(
785
+ pattern.field_semantics.items(),
786
+ key=lambda x: x[1].retrieval_count,
787
+ reverse=True,
788
+ )[:100]
789
+ pattern.field_semantics = dict(sorted_semantics)
790
+
791
+ # Update recommendations based on new retrieval data
792
+ self._update_recommendations(pattern)
793
+
794
+ pattern.last_updated = time.time()
795
+ self._dirty = True
796
+
797
+ self._maybe_auto_save()
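+ # Illustrative usage sketch, continuing the example above (not shipped code). A retrieval
+ # of previously compressed output is the feedback signal that compression was too
+ # aggressive: "full" retrievals penalize the strategy's success rate by 0.15, "search"
+ # retrievals by 0.05. The hash below is a made-up example value.
+ toin.record_retrieval(
+     tool_signature_hash="a1b2c3d4e5f60718",   # structure hash recorded at compression time
+     retrieval_type="search",
+     query="status:error",
+     query_fields=["status", "message"],
+     strategy="top_k",
+ )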
798
+
799
+ def get_recommendation(
800
+ self,
801
+ tool_signature: ToolSignature,
802
+ query_context: str | None = None,
803
+ ) -> CompressionHint:
804
+ """Get compression recommendation for a tool output.
805
+
806
+ This is the main API for SmartCrusher to consult before compressing.
807
+
808
+ Args:
809
+ tool_signature: Signature of the tool output structure.
810
+ query_context: User query for context-aware recommendations.
811
+
812
+ Returns:
813
+ CompressionHint with recommendations.
814
+ """
815
+ if not self._config.enabled:
816
+ return CompressionHint(source="default", reason="TOIN disabled")
817
+
818
+ sig_hash = tool_signature.structure_hash
819
+
820
+ with self._lock:
821
+ pattern = self._patterns.get(sig_hash)
822
+
823
+ if pattern is None:
824
+ # No data for this tool type
825
+ return CompressionHint(
826
+ source="default",
827
+ reason="No pattern data for this tool type",
828
+ )
829
+
830
+ # Not enough samples for reliable recommendation
831
+ if pattern.sample_size < self._config.min_samples_for_recommendation:
832
+ hint = CompressionHint(
833
+ source="local",
834
+ reason=f"Only {pattern.sample_size} samples (need {self._config.min_samples_for_recommendation})",
835
+ confidence=pattern.confidence,
836
+ based_on_samples=pattern.sample_size,
837
+ )
838
+ # LOW FIX #22: Emit recommendation metric
839
+ self._emit_metric(
840
+ "toin.recommendation",
841
+ {
842
+ "signature_hash": sig_hash,
843
+ "source": hint.source,
844
+ "confidence": hint.confidence,
845
+ "skip_compression": hint.skip_compression,
846
+ "max_items": hint.max_items,
847
+ "compression_level": hint.compression_level,
848
+ "based_on_samples": hint.based_on_samples,
849
+ },
850
+ )
851
+ return hint
852
+
853
+ # Build recommendation based on learned patterns
854
+ hint = self._build_recommendation(pattern, query_context)
855
+
856
+ # LOW FIX #22: Emit recommendation metric
857
+ self._emit_metric(
858
+ "toin.recommendation",
859
+ {
860
+ "signature_hash": sig_hash,
861
+ "source": hint.source,
862
+ "confidence": hint.confidence,
863
+ "skip_compression": hint.skip_compression,
864
+ "max_items": hint.max_items,
865
+ "compression_level": hint.compression_level,
866
+ "based_on_samples": hint.based_on_samples,
867
+ },
868
+ )
869
+ return hint
870
+
871
+ def _build_recommendation(
872
+ self,
873
+ pattern: ToolPattern,
874
+ query_context: str | None,
875
+ ) -> CompressionHint:
876
+ """Build a recommendation based on pattern data and query context."""
877
+ hint = CompressionHint(
878
+ source="network"
879
+ if pattern.user_count >= self._config.min_users_for_network_effect
880
+ else "local",
881
+ confidence=pattern.confidence,
882
+ based_on_samples=pattern.sample_size,
883
+ )
884
+
885
+ retrieval_rate = pattern.retrieval_rate
886
+ full_retrieval_rate = pattern.full_retrieval_rate
887
+
888
+ # High retrieval rate = compression too aggressive
889
+ if retrieval_rate > self._config.high_retrieval_threshold:
890
+ if full_retrieval_rate > 0.8:
891
+ # Almost all retrievals are full = don't compress
892
+ hint.skip_compression = True
893
+ hint.compression_level = "none"
894
+ hint.reason = f"Very high full retrieval rate ({full_retrieval_rate:.1%})"
895
+ else:
896
+ # High retrieval but mostly search = compress conservatively
897
+ hint.max_items = pattern.optimal_max_items
898
+ hint.compression_level = "conservative"
899
+ hint.reason = f"High retrieval rate ({retrieval_rate:.1%})"
900
+
901
+ elif retrieval_rate > self._config.medium_retrieval_threshold:
902
+ # Moderate retrieval = moderate compression
903
+ hint.max_items = max(20, pattern.optimal_max_items)
904
+ hint.compression_level = "moderate"
905
+ hint.reason = f"Moderate retrieval rate ({retrieval_rate:.1%})"
906
+
907
+ else:
908
+ # Low retrieval = aggressive compression works
909
+ hint.max_items = min(15, pattern.optimal_max_items)
910
+ hint.compression_level = "aggressive"
911
+ hint.reason = f"Low retrieval rate ({retrieval_rate:.1%})"
912
+
913
+ # Build preserve_fields list weighted by retrieval frequency
914
+ # Start with pattern's preserve_fields, then enhance based on query
915
+ preserve_fields = pattern.preserve_fields.copy()
916
+ query_fields_count = 0
917
+
918
+ # If we have query context, extract field names and prioritize them
919
+ if query_context and pattern.field_retrieval_frequency:
920
+ # Extract field names from query context
921
+ import re
922
+
923
+ query_field_names = re.findall(r"(\w+)[=:]", query_context.lower())
924
+
925
+ # Hash them and check if they're in our frequency data
926
+ for field_name in query_field_names:
927
+ field_hash = self._hash_field_name(field_name)
928
+ if field_hash in pattern.field_retrieval_frequency:
929
+ # This field is known to be retrieved - prioritize it
930
+ if field_hash in preserve_fields:
931
+ # Move to front
932
+ preserve_fields.remove(field_hash)
933
+ preserve_fields.insert(0, field_hash)
934
+ query_fields_count += 1
935
+
936
+ # Sort remaining fields by retrieval frequency (most frequent first)
937
+ if pattern.field_retrieval_frequency and len(preserve_fields) > 1:
938
+ # Separate query-mentioned fields (already at front) from others
939
+ if query_fields_count < len(preserve_fields):
940
+ rest = preserve_fields[query_fields_count:]
941
+ rest.sort(
942
+ key=lambda f: pattern.field_retrieval_frequency.get(f, 0),
943
+ reverse=True,
944
+ )
945
+ preserve_fields = preserve_fields[:query_fields_count] + rest
946
+
947
+ hint.preserve_fields = preserve_fields[:10] # Limit to top 10
948
+
949
+ # Use optimal strategy if known AND it has good success rate
950
+ if pattern.optimal_strategy != "default":
951
+ success_rate = pattern.strategy_success_rates.get(pattern.optimal_strategy, 1.0)
952
+ # Only recommend strategy if success rate >= 0.5
953
+ # Lower success rates mean this strategy often causes retrievals
954
+ if success_rate >= 0.5:
955
+ hint.recommended_strategy = pattern.optimal_strategy
956
+ else:
957
+ # Strategy has poor success rate - reduce confidence
958
+ hint.confidence *= success_rate
959
+ hint.reason += (
960
+ f" (strategy {pattern.optimal_strategy} has low success: {success_rate:.1%})"
961
+ )
962
+ # Try to find a better strategy
963
+ best_strategy = self._find_best_strategy(pattern)
964
+ if best_strategy and best_strategy != pattern.optimal_strategy:
965
+ hint.recommended_strategy = best_strategy
966
+ hint.reason += f", using {best_strategy} instead"
967
+
968
+ # Boost max_items if query_context matches common retrieval patterns
969
+ # This prevents unnecessary retrieval when we can predict what's needed
970
+ if query_context:
971
+ query_lower = query_context.lower()
972
+
973
+ # Check for exhaustive query keywords that suggest user needs all data
974
+ exhaustive_keywords = ["all", "every", "complete", "full", "entire", "list all"]
975
+ if any(kw in query_lower for kw in exhaustive_keywords):
976
+ # User likely needs more data - be conservative
977
+ hint.max_items = max(hint.max_items, 40)
978
+ hint.compression_level = "conservative"
979
+ hint.reason += " (exhaustive query detected)"
980
+
981
+ # Check against common retrieval patterns
982
+ if pattern.common_query_patterns:
983
+ query_pattern = self._anonymize_query_pattern(query_context)
984
+ if query_pattern:
985
+ # Exact match
986
+ if query_pattern in pattern.common_query_patterns:
987
+ hint.max_items = max(hint.max_items, 30)
988
+ hint.reason += " (query matches retrieval pattern)"
989
+ else:
990
+ # Partial match: check if any stored pattern is contained in query
991
+ for stored_pattern in pattern.common_query_patterns:
992
+ # Check if key fields match (e.g., "status:*" in both)
993
+ stored_fields = {
994
+ f.split(":")[0] for f in stored_pattern.split() if ":" in f
995
+ }
996
+ query_fields = {
997
+ f.split(":")[0] for f in query_pattern.split() if ":" in f
998
+ }
999
+ # If query uses same fields as a problematic pattern, be conservative
1000
+ if stored_fields and stored_fields.issubset(query_fields):
1001
+ hint.max_items = max(hint.max_items, 25)
1002
+ hint.reason += " (query uses fields from retrieval pattern)"
1003
+ break
1004
+
1005
+ # === TOIN Evolution: Include learned field semantics ===
1006
+ # Copy field_semantics with sufficient confidence for SmartCrusher to use
1007
+ # Only include fields with confidence >= 0.3 to reduce noise
1008
+ if pattern.field_semantics:
1009
+ hint.field_semantics = {
1010
+ field_hash: field_sem
1011
+ for field_hash, field_sem in pattern.field_semantics.items()
1012
+ if field_sem.confidence >= 0.3 or field_sem.retrieval_count >= 3
1013
+ }
1014
+
1015
+ return hint
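+ # Worked example of the branches above with the default thresholds
+ # (high_retrieval_threshold=0.5, medium_retrieval_threshold=0.2); numbers illustrative:
+ #   retrieval_rate=0.65, full_retrieval_rate=0.90 -> skip_compression=True, level "none"
+ #   retrieval_rate=0.65, full_retrieval_rate=0.40 -> level "conservative", max_items=optimal_max_items
+ #   retrieval_rate=0.35                           -> level "moderate",     max_items=max(20, optimal_max_items)
+ #   retrieval_rate=0.05                           -> level "aggressive",   max_items=min(15, optimal_max_items)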
1016
+
1017
+ def _find_best_strategy(self, pattern: ToolPattern) -> str | None:
1018
+ """Find the strategy with the best success rate.
1019
+
1020
+ Returns None if no strategies have been tried or all have low success.
1021
+ """
1022
+ if not pattern.strategy_success_rates:
1023
+ return None
1024
+
1025
+ # Find strategy with highest success rate above threshold
1026
+ best_strategy = None
1027
+ best_rate = 0.5 # Minimum acceptable rate
1028
+
1029
+ for strategy, rate in pattern.strategy_success_rates.items():
1030
+ if rate > best_rate:
1031
+ best_rate = rate
1032
+ best_strategy = strategy
1033
+
1034
+ return best_strategy
1035
+
1036
+ def _update_recommendations(self, pattern: ToolPattern) -> None:
1037
+ """Update learned recommendations for a pattern."""
1038
+ # Calculate optimal max_items based on retrieval rate
1039
+ retrieval_rate = pattern.retrieval_rate
1040
+
1041
+ if retrieval_rate > self._config.high_retrieval_threshold:
1042
+ if pattern.full_retrieval_rate > 0.8:
1043
+ pattern.skip_compression_recommended = True
1044
+ pattern.optimal_max_items = pattern.total_items_seen // max(
1045
+ 1, pattern.total_compressions
1046
+ )
1047
+ else:
1048
+ pattern.optimal_max_items = 50
1049
+ elif retrieval_rate > self._config.medium_retrieval_threshold:
1050
+ pattern.optimal_max_items = 30
1051
+ else:
1052
+ pattern.optimal_max_items = 20
1053
+
1054
+ # Update preserve_fields from frequently retrieved fields
1055
+ if pattern.field_retrieval_frequency:
1056
+ # Get top 5 most retrieved fields
1057
+ sorted_fields = sorted(
1058
+ pattern.field_retrieval_frequency.items(),
1059
+ key=lambda x: x[1],
1060
+ reverse=True,
1061
+ )[:5]
1062
+ pattern.preserve_fields = [f for f, _ in sorted_fields]
1063
+
1064
+ # Update optimal strategy (pick most successful)
1065
+ if pattern.strategy_success_rates:
1066
+ best_strategy = max(
1067
+ pattern.strategy_success_rates.items(),
1068
+ key=lambda x: x[1],
1069
+ )[0]
1070
+ pattern.optimal_strategy = best_strategy
1071
+
1072
+ def _calculate_confidence(self, pattern: ToolPattern) -> float:
1073
+ """Calculate confidence level for a pattern."""
1074
+ # Base confidence on sample size
1075
+ sample_confidence = min(0.7, pattern.sample_size / 100)
1076
+
1077
+ # Boost if from multiple users
1078
+ # FIX: Changed from `user_count / 10 * 0.1` (= user_count * 0.01, too small)
1079
+ # to `user_count * 0.03` for meaningful boost at low user counts
1080
+ # - 3 users: 0.09 boost
1081
+ # - 10 users: 0.30 boost (capped)
1082
+ user_boost = 0.0
1083
+ if pattern.user_count >= self._config.min_users_for_network_effect:
1084
+ user_boost = min(0.3, pattern.user_count * 0.03)
1085
+
1086
+ return min(0.95, sample_confidence + user_boost)
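+ # Worked example (illustrative numbers): sample_size=50 gives sample_confidence=min(0.7, 50/100)=0.5;
+ # user_count=5 (>= min_users_for_network_effect of 3) gives user_boost=min(0.3, 5*0.03)=0.15;
+ # final confidence = min(0.95, 0.5 + 0.15) = 0.65.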
1087
+
1088
+ def _hash_field_name(self, field_name: str) -> str:
1089
+ """Hash a field name for anonymization."""
1090
+ return hashlib.sha256(field_name.encode()).hexdigest()[:8]
1091
+
1092
+ def _anonymize_query_pattern(self, query: str) -> str | None:
1093
+ """Extract anonymized pattern from a query.
1094
+
1095
+ Keeps structural patterns, removes specific values.
1096
+ E.g., "status:error AND user:john" -> "status:* AND user:*"
1097
+ """
1098
+ if not query:
1099
+ return None
1100
+
1101
+ # Simple pattern extraction: replace values after : or =
1102
+ import re
1103
+
1104
+ # Match field:value or field="value" patterns, but don't include spaces in unquoted values
1105
+ pattern = re.sub(r'(\w+)[=:](?:"[^"]*"|\'[^\']*\'|\w+)', r"\1:*", query)
1106
+
1107
+ # Remove if it's just generic
1108
+ if pattern in ("*", ""):
1109
+ return None
1110
+
1111
+ return pattern
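+ # Illustrative inputs/outputs for the substitution above (values invented):
+ #   "status:error AND user:john"      -> "status:* AND user:*"
+ #   'env="prod us-east" region:eu'    -> "env:* region:*"
+ # Only the structural field pattern survives; the concrete values are dropped.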
1112
+
1113
+ def _hash_value(self, value: Any) -> str:
1114
+ """Hash a value for privacy-preserving storage.
1115
+
1116
+ Handles all types by converting to a canonical string representation.
1117
+ """
1118
+ if value is None:
1119
+ canonical = "null"
1120
+ elif isinstance(value, bool):
1121
+ canonical = "true" if value else "false"
1122
+ elif isinstance(value, (int, float)):
1123
+ canonical = str(value)
1124
+ elif isinstance(value, str):
1125
+ canonical = value
1126
+ elif isinstance(value, (list, dict)):
1127
+ # For complex types, use JSON serialization
1128
+ try:
1129
+ canonical = json.dumps(value, sort_keys=True, default=str)
1130
+ except (TypeError, ValueError):
1131
+ canonical = str(value)
1132
+ else:
1133
+ canonical = str(value)
1134
+
1135
+ return hashlib.sha256(canonical.encode()).hexdigest()[:8]
1136
+
1137
+ def _extract_query_operator(self, query: str) -> str:
1138
+ """Extract the dominant query operator from a search query.
1139
+
1140
+ Used for learning field semantic types from query patterns.
1141
+
1142
+ Returns:
1143
+ Query operator: "=", "!=", ">", "<", ">=", "<=", "contains", or "="
1144
+ """
1145
+ if not query:
1146
+ return "="
1147
+
1148
+ query_lower = query.lower()
1149
+
1150
+ # Check for inequality operators
1151
+ if "!=" in query or " not " in query_lower or " ne " in query_lower:
1152
+ return "!="
1153
+ if ">=" in query or " gte " in query_lower:
1154
+ return ">="
1155
+ if "<=" in query or " lte " in query_lower:
1156
+ return "<="
1157
+ if ">" in query or " gt " in query_lower:
1158
+ return ">"
1159
+ if "<" in query or " lt " in query_lower:
1160
+ return "<"
1161
+
1162
+ # Check for text search operators
1163
+ if " like " in query_lower or " contains " in query_lower or "*" in query:
1164
+ return "contains"
1165
+
1166
+ # Default to equality
1167
+ return "="
1168
+
1169
+ def get_stats(self) -> dict[str, Any]:
1170
+ """Get overall TOIN statistics."""
1171
+ with self._lock:
1172
+ total_compressions = sum(p.total_compressions for p in self._patterns.values())
1173
+ total_retrievals = sum(p.total_retrievals for p in self._patterns.values())
1174
+
1175
+ return {
1176
+ "enabled": self._config.enabled,
1177
+ "patterns_tracked": len(self._patterns),
1178
+ "total_compressions": total_compressions,
1179
+ "total_retrievals": total_retrievals,
1180
+ "global_retrieval_rate": (
1181
+ total_retrievals / total_compressions if total_compressions > 0 else 0.0
1182
+ ),
1183
+ "patterns_with_recommendations": sum(
1184
+ 1
1185
+ for p in self._patterns.values()
1186
+ if p.sample_size >= self._config.min_samples_for_recommendation
1187
+ ),
1188
+ }
1189
+
1190
+ def get_pattern(self, signature_hash: str) -> ToolPattern | None:
1191
+ """Get pattern data for a specific tool signature.
1192
+
1193
+ HIGH FIX: Returns a deep copy to prevent external mutation of internal state.
1194
+ """
1195
+ import copy
1196
+
1197
+ with self._lock:
1198
+ pattern = self._patterns.get(signature_hash)
1199
+ if pattern is not None:
1200
+ return copy.deepcopy(pattern)
1201
+ return None
1202
+
1203
+ def export_patterns(self) -> dict[str, Any]:
1204
+ """Export all patterns for sharing/aggregation."""
1205
+ with self._lock:
1206
+ return {
1207
+ "version": "1.0",
1208
+ "export_timestamp": time.time(),
1209
+ "instance_id": self._instance_id,
1210
+ "patterns": {
1211
+ sig_hash: pattern.to_dict() for sig_hash, pattern in self._patterns.items()
1212
+ },
1213
+ }
1214
+
1215
+ def import_patterns(self, data: dict[str, Any]) -> None:
1216
+ """Import patterns from another source.
1217
+
1218
+ Used for federated learning: aggregate patterns from multiple
1219
+ Headroom instances without sharing actual data.
1220
+
1221
+ Args:
1222
+ data: Exported pattern data.
1223
+ """
1224
+ if not self._config.enabled:
1225
+ return
1226
+
1227
+ patterns_data = data.get("patterns", {})
1228
+ source_instance = data.get("instance_id", "unknown")
1229
+
1230
+ with self._lock:
1231
+ for sig_hash, pattern_dict in patterns_data.items():
1232
+ imported = ToolPattern.from_dict(pattern_dict)
1233
+
1234
+ if sig_hash in self._patterns:
1235
+ # Merge with existing
1236
+ self._merge_patterns(self._patterns[sig_hash], imported)
1237
+ else:
1238
+ # Add new pattern - need to track source instance
1239
+ self._patterns[sig_hash] = imported
1240
+
1241
+ # For NEW patterns from another instance, track the source in
1242
+ # _seen_instance_hashes so user_count reflects cross-user data
1243
+ if source_instance != self._instance_id:
1244
+ pattern = self._patterns[sig_hash]
1245
+ if source_instance not in pattern._seen_instance_hashes:
1246
+ # Limit storage to 100 unique instances to bound memory
1247
+ if len(pattern._seen_instance_hashes) < 100:
1248
+ pattern._seen_instance_hashes.append(source_instance)
1249
+ # CRITICAL: Always increment user_count (even after cap)
1250
+ pattern.user_count += 1
1251
+
1252
+ self._dirty = True
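+ # Illustrative federated-learning sketch (not shipped code): one instance exports its
+ # aggregated patterns and another merges them in; only hashes, counts, and rates are
+ # exchanged, never raw tool data. Continues the `toin` example above.
+ import json
+
+ payload = json.dumps(toin.export_patterns())
+ other_toin = ToolIntelligenceNetwork(TOINConfig())
+ other_toin.import_patterns(json.loads(payload))   # second instance gains the learned patterns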
1253
+
1254
+     def _merge_patterns(self, existing: ToolPattern, imported: ToolPattern) -> None:
+         """Merge imported pattern into existing."""
+         total = existing.sample_size + imported.sample_size
+         if total == 0:
+             return
+
+         w_existing = existing.sample_size / total
+         w_imported = imported.sample_size / total
+
+         # Merge counts
+         existing.total_compressions += imported.total_compressions
+         existing.total_retrievals += imported.total_retrievals
+         existing.full_retrievals += imported.full_retrievals
+         existing.search_retrievals += imported.search_retrievals
+         existing.total_items_seen += imported.total_items_seen
+         existing.total_items_kept += imported.total_items_kept
+
+         # Weighted averages
+         existing.avg_compression_ratio = (
+             existing.avg_compression_ratio * w_existing
+             + imported.avg_compression_ratio * w_imported
+         )
+         existing.avg_token_reduction = (
+             existing.avg_token_reduction * w_existing + imported.avg_token_reduction * w_imported
+         )
+
+         # Merge field frequencies
+         for field_hash, count in imported.field_retrieval_frequency.items():
+             existing.field_retrieval_frequency[field_hash] = (
+                 existing.field_retrieval_frequency.get(field_hash, 0) + count
+             )
+         # HIGH: Limit field_retrieval_frequency dict to prevent unbounded growth
+         if len(existing.field_retrieval_frequency) > 100:
+             # Keep only the most frequently retrieved fields
+             sorted_fields = sorted(
+                 existing.field_retrieval_frequency.items(),
+                 key=lambda x: x[1],
+                 reverse=True,
+             )[:100]
+             existing.field_retrieval_frequency = dict(sorted_fields)
+
+         # Merge commonly retrieved fields
+         for field_hash in imported.commonly_retrieved_fields:
+             if field_hash not in existing.commonly_retrieved_fields:
+                 existing.commonly_retrieved_fields.append(field_hash)
+         # HIGH: Limit commonly_retrieved_fields to prevent unbounded growth
+         if len(existing.commonly_retrieved_fields) > 20:
+             # Prioritize by retrieval frequency if available
+             if existing.field_retrieval_frequency:
+                 existing.commonly_retrieved_fields = sorted(
+                     existing.commonly_retrieved_fields,
+                     key=lambda f: existing.field_retrieval_frequency.get(f, 0),
+                     reverse=True,
+                 )[:20]
+             else:
+                 existing.commonly_retrieved_fields = existing.commonly_retrieved_fields[:20]
+
+         # Merge query patterns (for federated learning)
+         # MEDIUM FIX #10: Also merge query_pattern_frequency for proper ranking
+         for query_pattern, freq in imported.query_pattern_frequency.items():
+             existing.query_pattern_frequency[query_pattern] = (
+                 existing.query_pattern_frequency.get(query_pattern, 0) + freq
+             )
+         for query_pattern in imported.common_query_patterns:
+             if query_pattern not in existing.common_query_patterns:
+                 existing.common_query_patterns.append(query_pattern)
+         # Keep only the most common patterns (by frequency)
+         if len(existing.common_query_patterns) > self._config.max_query_patterns:
+             existing.common_query_patterns = sorted(
+                 existing.common_query_patterns,
+                 key=lambda p: existing.query_pattern_frequency.get(p, 0),
+                 reverse=True,
+             )[: self._config.max_query_patterns]
+         # Limit frequency dict
+         if len(existing.query_pattern_frequency) > self._config.max_query_patterns * 2:
+             top_patterns = sorted(
+                 existing.query_pattern_frequency.items(),
+                 key=lambda x: x[1],
+                 reverse=True,
+             )[: self._config.max_query_patterns * 2]
+             existing.query_pattern_frequency = dict(top_patterns)
+
+         # Merge strategy success rates (weighted average)
+         for strategy, rate in imported.strategy_success_rates.items():
+             if strategy in existing.strategy_success_rates:
+                 existing.strategy_success_rates[strategy] = (
+                     existing.strategy_success_rates[strategy] * w_existing + rate * w_imported
+                 )
+             else:
+                 existing.strategy_success_rates[strategy] = rate
+
+         # HIGH FIX: Bound strategy_success_rates after merge
+         if len(existing.strategy_success_rates) > 20:
+             sorted_strategies = sorted(
+                 existing.strategy_success_rates.items(),
+                 key=lambda x: x[1],
+                 reverse=True,
+             )[:20]
+             existing.strategy_success_rates = dict(sorted_strategies)
+
+         # Merge preserve_fields (union of both, deduplicated)
+         for preserve_field in imported.preserve_fields:
+             if preserve_field not in existing.preserve_fields:
+                 existing.preserve_fields.append(preserve_field)
+         # Keep only top 10 most important fields
+         if len(existing.preserve_fields) > 10:
+             # Prioritize by retrieval frequency if available
+             if existing.field_retrieval_frequency:
+                 existing.preserve_fields = sorted(
+                     existing.preserve_fields,
+                     key=lambda f: existing.field_retrieval_frequency.get(f, 0),
+                     reverse=True,
+                 )[:10]
+             else:
+                 existing.preserve_fields = existing.preserve_fields[:10]
+
+         # Merge skip_compression_recommended (true if either recommends skip)
+         if imported.skip_compression_recommended:
+             # Imported has more data suggesting skip - consider it
+             if imported.sample_size > existing.sample_size // 2:
+                 existing.skip_compression_recommended = True
+
+         # Merge optimal_strategy (prefer the one with better success rate)
+         if imported.optimal_strategy != "default":
+             imported_rate = imported.strategy_success_rates.get(imported.optimal_strategy, 0.5)
+             existing_rate = (
+                 existing.strategy_success_rates.get(existing.optimal_strategy, 0.5)
+                 if existing.optimal_strategy != "default"
+                 else 0.0
+             )
+
+             if imported_rate > existing_rate:
+                 existing.optimal_strategy = imported.optimal_strategy
+
+         # Merge optimal_max_items (weighted average with bounds)
+         if imported.optimal_max_items > 0:
+             merged_max_items = int(
+                 existing.optimal_max_items * w_existing + imported.optimal_max_items * w_imported
+             )
+             # Ensure valid bounds: min 3 items, max 1000 items
+             existing.optimal_max_items = max(3, min(1000, merged_max_items))
+
+         existing.sample_size = total
+
+         # Merge seen instance hashes (union of both, limited to 100 for storage)
+         # CRITICAL FIX #1 & #3: Simplified user count merge logic with cap enforcement.
+         # user_count is the authoritative count even when sets hit their caps.
+         new_users_found = 0
+         for instance_hash in imported._seen_instance_hashes:
+             # Use _all_seen_instances for deduplication (the authoritative set)
+             if instance_hash not in existing._all_seen_instances:
+                 # Add to lookup set (with cap to prevent OOM)
+                 if len(existing._all_seen_instances) < ToolPattern.MAX_SEEN_INSTANCES:
+                     existing._all_seen_instances.add(instance_hash)
+                 # Limit storage list to 100 unique instances to bound serialization
+                 if len(existing._seen_instance_hashes) < 100:
+                     existing._seen_instance_hashes.append(instance_hash)
+                 new_users_found += 1
+
+         # Also merge instances from imported._all_seen_instances that weren't in list
+         # (in case imported had more than 100 instances)
+         for instance_hash in imported._all_seen_instances:
+             if instance_hash not in existing._all_seen_instances:
+                 # Add with cap check
+                 if len(existing._all_seen_instances) < ToolPattern.MAX_SEEN_INSTANCES:
+                     existing._all_seen_instances.add(instance_hash)
+                 # Storage list already at limit, just track for dedup
+                 new_users_found += 1
+
+         # CRITICAL FIX #3: Simplified user count calculation.
+         # We count new users from both the list and set, then add any users
+         # that imported had beyond what we could deduplicate (when both hit caps).
+         # imported.user_count may be > len(imported._all_seen_instances) if they hit cap
+         users_beyond_imported_tracking = max(
+             0, imported.user_count - len(imported._all_seen_instances)
+         )
+         existing.user_count += new_users_found + users_beyond_imported_tracking
+
+         existing.last_updated = time.time()
+
+         # Recalculate recommendations based on merged data
+         self._update_recommendations(existing)
+
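The sample-size weights in _merge_patterns simply blend the two patterns in proportion to how much evidence each carries. A worked example with made-up numbers:

    # Hypothetical inputs, for illustration only.
    existing_size, imported_size = 30, 10
    w_existing = existing_size / (existing_size + imported_size)  # 0.75
    w_imported = imported_size / (existing_size + imported_size)  # 0.25

    # avg_compression_ratio of 0.40 (existing) and 0.60 (imported)
    merged_ratio = 0.40 * w_existing + 0.60 * w_imported  # 0.45

The merged pattern then carries sample_size 40, so later merges weight it accordingly.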
+     def save(self) -> None:
+         """Save TOIN data to disk with atomic write.
+
+         Uses a temporary file and rename to ensure atomicity.
+         If the write fails, the original file is preserved.
+
+         HIGH FIX: Serialize under lock but write outside lock to prevent
+         blocking other threads during slow file I/O.
+         """
+         if not self._config.storage_path:
+             return
+
+         import tempfile
+
+         # Step 1: Serialize under lock (fast in-memory operation)
+         with self._lock:
+             data = self.export_patterns()
+
+         # Step 2: Write outside lock (slow I/O operation)
+         path = Path(self._config.storage_path)
+
+         try:
+             # Create parent directories if needed
+             path.parent.mkdir(parents=True, exist_ok=True)
+
+             # Serialize to string (outside lock but before file ops)
+             json_data = json.dumps(data, indent=2)
+
+             # Write to temporary file first (atomic write pattern)
+             # Use same directory to ensure same filesystem for rename
+             fd, tmp_path = tempfile.mkstemp(dir=path.parent, prefix=".toin_", suffix=".tmp")
+             try:
+                 with open(fd, "w") as f:
+                     f.write(json_data)
+
+                 # Atomic rename (on POSIX systems)
+                 Path(tmp_path).replace(path)
+
+             except Exception:
+                 # Clean up temp file on failure
+                 try:
+                     Path(tmp_path).unlink()
+                 except OSError:
+                     pass
+                 raise
+
+             # Step 3: Update state under lock (fast)
+             with self._lock:
+                 self._dirty = False
+                 self._last_save_time = time.time()
+
+         except OSError as e:
+             # Log error but don't crash - TOIN should be resilient
+             logger.warning(f"Failed to save TOIN data: {e}")
+
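save() follows the standard write-to-temp-then-rename idiom: the JSON is written to a temporary file in the same directory, then moved over the target with Path.replace(), which is atomic on POSIX filesystems, so readers never observe a half-written file. The same idiom reduced to a standalone helper (atomic_write_json is a hypothetical name, not part of Headroom):

    import json
    import tempfile
    from pathlib import Path

    def atomic_write_json(path: Path, payload: dict) -> None:
        path.parent.mkdir(parents=True, exist_ok=True)
        # Create the temp file next to the target so the final rename
        # stays on one filesystem and remains atomic.
        fd, tmp = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
        try:
            with open(fd, "w") as f:
                f.write(json.dumps(payload, indent=2))
            Path(tmp).replace(path)  # atomic on POSIX
        except Exception:
            Path(tmp).unlink(missing_ok=True)  # best-effort cleanup
            raise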
+     def _load_from_disk(self) -> None:
+         """Load TOIN data from disk."""
+         if not self._config.storage_path:
+             return
+
+         path = Path(self._config.storage_path)
+         if not path.exists():
+             return
+
+         try:
+             with open(path) as f:
+                 data = json.load(f)
+             self.import_patterns(data)
+             self._dirty = False
+         except (json.JSONDecodeError, OSError):
+             pass  # Start fresh if corrupted
+
+     def _maybe_auto_save(self) -> None:
+         """Auto-save if enough time has passed.
+
+         HIGH FIX: Check conditions under lock to prevent race where another
+         thread modifies _dirty or _last_save_time between check and save.
+         The save() method already acquires the lock, and we use RLock so
+         it's safe to hold the lock when calling save().
+         """
+         if not self._config.storage_path or not self._config.auto_save_interval:
+             return
+
+         # Check under lock to prevent race conditions
+         with self._lock:
+             if not self._dirty:
+                 return
+
+             elapsed = time.time() - self._last_save_time
+             if elapsed >= self._config.auto_save_interval:
+                 # save() uses the same RLock, so this is safe
+                 self.save()
+
+     def clear(self) -> None:
+         """Clear all TOIN data. Mainly for testing."""
+         with self._lock:
+             self._patterns.clear()
+             self._dirty = False
+
+
+ # Global TOIN instance (lazy initialization)
+ _toin_instance: ToolIntelligenceNetwork | None = None
+ _toin_lock = threading.Lock()
+
+
+ def get_toin(config: TOINConfig | None = None) -> ToolIntelligenceNetwork:
+     """Get the global TOIN instance.
+
+     Thread-safe singleton pattern. Always acquires lock to avoid subtle
+     race conditions in double-checked locking on non-CPython implementations.
+
+     Args:
+         config: Configuration (only used on first call). If the instance
+             already exists, config is ignored and a warning is logged.
+
+     Returns:
+         Global ToolIntelligenceNetwork instance.
+     """
+     global _toin_instance
+
+     # CRITICAL FIX: Always acquire lock for thread safety across all Python
+     # implementations. The overhead is negligible since we only construct once.
+     with _toin_lock:
+         if _toin_instance is None:
+             _toin_instance = ToolIntelligenceNetwork(config)
+         elif config is not None:
+             # Warn when config is silently ignored
+             logger.warning(
+                 "TOIN config ignored: instance already exists. "
+                 "Call reset_toin() first if you need to change config."
+             )
+
+     return _toin_instance
+
+
+ def reset_toin() -> None:
+     """Reset the global TOIN instance. Mainly for testing."""
+     global _toin_instance
+
+     with _toin_lock:
+         if _toin_instance is not None:
+             _toin_instance.clear()
+             _toin_instance = None
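Taken together, get_toin() and reset_toin() give the module a process-wide singleton plus an escape hatch for tests. A hedged usage sketch; the storage_path keyword is an assumption about the TOINConfig constructor, and the path shown is a placeholder:

    # First call constructs the singleton; only this call honors the config.
    toin = get_toin(TOINConfig(storage_path="~/.headroom/toin.json"))

    # Later calls return the same object; passing a config now only logs a warning.
    assert get_toin() is toin

    # In tests, drop the singleton so the next get_toin() builds a fresh one.
    reset_toin()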