headroom-ai 0.2.13 (headroom_ai-0.2.13-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/telemetry/toin.py
@@ -0,0 +1,1579 @@
"""Tool Output Intelligence Network (TOIN) - Cross-user learning for compression.

TOIN aggregates anonymized compression patterns across all Headroom users to
create a network effect: every user's compression decisions improve the
recommendations for everyone.

Key concepts:
- ToolPattern: Aggregated intelligence about a tool type (by structure hash)
- CompressionHint: Recommendations for how to compress a specific tool output
- ToolIntelligenceNetwork: Central aggregator that learns from all users

How it works:
1. When SmartCrusher compresses data, it records the outcome via telemetry
2. When LLM retrieves compressed data, TOIN tracks what was needed
3. TOIN learns: "For tools with structure X, retrieval rate is high when
   compressing field Y - preserve it"
4. Next time: SmartCrusher asks TOIN for hints before compressing

Privacy:
- No actual data values are stored
- Tool names are structure hashes
- Field names are SHA256[:8] hashes
- No user identifiers

Network Effect:
- More users → more compression events → better recommendations
- Cross-user patterns reveal universal tool behaviors
- Federated learning: aggregate patterns, not data

Usage:
    from headroom.telemetry.toin import get_toin

    # Before compression, get recommendations
    hint = get_toin().get_recommendation(tool_signature, query_context)

    # Apply hint
    if hint.skip_compression:
        return original_data
    config.preserve_fields = hint.preserve_fields
    config.max_items = hint.max_items
"""

from __future__ import annotations

import hashlib
import json
import logging
import threading
import time
from collections.abc import Callable
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal

from .models import FieldSemantics, ToolSignature

logger = logging.getLogger(__name__)

# LOW FIX #22: Define callback types for metrics/monitoring hooks
# These allow users to plug in their own metrics collection (Prometheus, StatsD, etc.)
MetricsCallback = Callable[[str, dict[str, Any]], None]  # (event_name, event_data) -> None

@dataclass
class ToolPattern:
    """Aggregated intelligence about a tool type across all users.

    This is the core TOIN data structure. It represents everything we've
    learned about how to compress outputs from tools with a specific structure.
    """

    tool_signature_hash: str

    # === Compression Statistics ===
    total_compressions: int = 0
    total_items_seen: int = 0
    total_items_kept: int = 0
    avg_compression_ratio: float = 0.0
    avg_token_reduction: float = 0.0

    # === Retrieval Statistics ===
    total_retrievals: int = 0
    full_retrievals: int = 0  # Retrieved everything
    search_retrievals: int = 0  # Used search filter

    @property
    def retrieval_rate(self) -> float:
        """Fraction of compressions that triggered retrieval."""
        if self.total_compressions == 0:
            return 0.0
        return self.total_retrievals / self.total_compressions

    @property
    def full_retrieval_rate(self) -> float:
        """Fraction of retrievals that were full (not search)."""
        if self.total_retrievals == 0:
            return 0.0
        return self.full_retrievals / self.total_retrievals

    # === Learned Patterns ===
    # Fields that are frequently retrieved (should preserve)
    commonly_retrieved_fields: list[str] = field(default_factory=list)
    field_retrieval_frequency: dict[str, int] = field(default_factory=dict)

    # Query patterns that trigger retrieval
    common_query_patterns: list[str] = field(default_factory=list)
    # MEDIUM FIX #10: Track query pattern frequency to keep most common, not just recent
    query_pattern_frequency: dict[str, int] = field(default_factory=dict)

    # Best compression strategy for this tool type
    optimal_strategy: str = "default"
    strategy_success_rates: dict[str, float] = field(default_factory=dict)

    # === Learned Recommendations ===
    optimal_max_items: int = 20
    skip_compression_recommended: bool = False
    preserve_fields: list[str] = field(default_factory=list)

    # === Field-Level Semantics (TOIN Evolution) ===
    # Learned semantic types for each field based on retrieval patterns
    # This enables zero-latency signal detection without hardcoded patterns
    field_semantics: dict[str, FieldSemantics] = field(default_factory=dict)

    # === Confidence ===
    sample_size: int = 0
    user_count: int = 0  # Number of unique users (anonymized)
    confidence: float = 0.0  # 0.0 = no data, 1.0 = high confidence
    last_updated: float = 0.0

    # === Instance Tracking (for user_count) ===
    # Hashed instance IDs of users who have contributed to this pattern
    # Limited to avoid unbounded growth (for serialization)
    _seen_instance_hashes: list[str] = field(default_factory=list)
    # FIX: Separate set for ALL seen instances to prevent double-counting
    # CRITICAL FIX #1: Capped at MAX_SEEN_INSTANCES to prevent OOM with millions of users.
    # When cap is reached, we rely on user_count for accurate counting and
    # accept some potential double-counting for new users (negligible at scale).
    _all_seen_instances: set[str] = field(default_factory=set)

    # CRITICAL FIX: Track whether instance tracking was truncated during serialization
    # If True, we know some users were lost and should be conservative about user_count
    _tracking_truncated: bool = False

    # CRITICAL FIX #1: Maximum entries in _all_seen_instances to prevent OOM
    # This is a class constant, not a field (not serialized)
    MAX_SEEN_INSTANCES: int = 10000

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "tool_signature_hash": self.tool_signature_hash,
            "total_compressions": self.total_compressions,
            "total_items_seen": self.total_items_seen,
            "total_items_kept": self.total_items_kept,
            "avg_compression_ratio": self.avg_compression_ratio,
            "avg_token_reduction": self.avg_token_reduction,
            "total_retrievals": self.total_retrievals,
            "full_retrievals": self.full_retrievals,
            "search_retrievals": self.search_retrievals,
            "retrieval_rate": self.retrieval_rate,
            "full_retrieval_rate": self.full_retrieval_rate,
            "commonly_retrieved_fields": self.commonly_retrieved_fields,
            "field_retrieval_frequency": self.field_retrieval_frequency,
            "common_query_patterns": self.common_query_patterns,
            "query_pattern_frequency": self.query_pattern_frequency,
            "optimal_strategy": self.optimal_strategy,
            "strategy_success_rates": self.strategy_success_rates,
            "optimal_max_items": self.optimal_max_items,
            "skip_compression_recommended": self.skip_compression_recommended,
            "preserve_fields": self.preserve_fields,
            # Field-level semantics (TOIN Evolution)
            "field_semantics": {k: v.to_dict() for k, v in self.field_semantics.items()},
            "sample_size": self.sample_size,
            "user_count": self.user_count,
            "confidence": self.confidence,
            "last_updated": self.last_updated,
            # Serialize instance hashes (limited to 100 for bounded storage)
            "seen_instance_hashes": self._seen_instance_hashes[:100],
            # CRITICAL FIX: Track if truncation occurred during serialization
            # This tells from_dict() that some users were lost and prevents double-counting
            "tracking_truncated": (
                self._tracking_truncated
                or self.user_count > len(self._seen_instance_hashes)
                or len(self._all_seen_instances) > 100
            ),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ToolPattern:
        """Create from dictionary."""
        # Filter to only valid fields
        valid_fields = {
            "tool_signature_hash",
            "total_compressions",
            "total_items_seen",
            "total_items_kept",
            "avg_compression_ratio",
            "avg_token_reduction",
            "total_retrievals",
            "full_retrievals",
            "search_retrievals",
            "commonly_retrieved_fields",
            "field_retrieval_frequency",
            "common_query_patterns",
            "query_pattern_frequency",
            "optimal_strategy",
            "strategy_success_rates",
            "optimal_max_items",
            "skip_compression_recommended",
            "preserve_fields",
            "sample_size",
            "user_count",
            "confidence",
            "last_updated",
        }
        filtered = {k: v for k, v in data.items() if k in valid_fields}

        # Handle seen_instance_hashes (serialized without underscore prefix)
        seen_hashes = data.get("seen_instance_hashes", [])

        pattern = cls(**filtered)
        pattern._seen_instance_hashes = seen_hashes[:100]  # Limit on load

        # CRITICAL FIX: Populate _all_seen_instances from loaded hashes
        # This prevents double-counting after restart - without this, the same
        # instances would be counted again because the lookup set was empty
        pattern._all_seen_instances = set(pattern._seen_instance_hashes)

        # CRITICAL FIX: Restore truncation flag to prevent double-counting
        # If truncated, we know some users were lost in serialization
        pattern._tracking_truncated = data.get("tracking_truncated", False)
        # Also detect truncation if user_count > loaded hashes (backward compat)
        if pattern.user_count > len(pattern._seen_instance_hashes):
            pattern._tracking_truncated = True

        # Load field semantics (TOIN Evolution)
        field_semantics_data = data.get("field_semantics", {})
        if field_semantics_data:
            pattern.field_semantics = {
                k: FieldSemantics.from_dict(v) for k, v in field_semantics_data.items()
            }

        return pattern

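# --------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the published wheel): the
# to_dict()/from_dict() pair above round-trips a ToolPattern while rebuilding
# the private dedup state (_all_seen_instances, _tracking_truncated) that is
# never passed to the constructor. A minimal check of that behaviour, using
# only the ToolPattern API defined above (the hash strings are made up):
#
#     original = ToolPattern(tool_signature_hash="abc123", user_count=2)
#     original._seen_instance_hashes = ["deadbeefcafef00d", "0123456789abcdef"]
#     original._all_seen_instances = set(original._seen_instance_hashes)
#
#     restored = ToolPattern.from_dict(original.to_dict())
#     assert restored.user_count == 2
#     assert restored._all_seen_instances == original._all_seen_instances
#     assert restored._tracking_truncated is False
# --------------------------------------------------------------------------
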
@dataclass
class CompressionHint:
    """Recommendation for how to compress a specific tool output.

    This is what TOIN returns when asked for advice before compression.
    """

    # Should we compress at all?
    skip_compression: bool = False

    # How aggressively to compress
    max_items: int = 20
    compression_level: Literal["none", "conservative", "moderate", "aggressive"] = "moderate"

    # Which fields to preserve (never remove)
    preserve_fields: list[str] = field(default_factory=list)

    # Which strategy to use
    recommended_strategy: str = "default"

    # Why this recommendation
    reason: str = ""
    confidence: float = 0.0

    # Source of recommendation
    source: Literal["network", "local", "default"] = "default"
    based_on_samples: int = 0

    # === TOIN Evolution: Learned Field Semantics ===
    # These enable zero-latency signal detection in SmartCrusher.
    # field_hash -> FieldSemantics (learned semantic type, important values, etc.)
    field_semantics: dict[str, FieldSemantics] = field(default_factory=dict)

@dataclass
class TOINConfig:
    """Configuration for the Tool Output Intelligence Network."""

    # Enable/disable TOIN
    enabled: bool = True

    # Storage
    storage_path: str | None = None  # Path to store TOIN data
    auto_save_interval: int = 600  # Auto-save every 10 minutes

    # Network learning thresholds
    min_samples_for_recommendation: int = 10
    min_users_for_network_effect: int = 3

    # Recommendation thresholds
    high_retrieval_threshold: float = 0.5  # Above this = compress less
    medium_retrieval_threshold: float = 0.2  # Between medium and high = moderate

    # Privacy
    anonymize_queries: bool = True
    max_query_patterns: int = 10

    # LOW FIX #22: Metrics/monitoring hooks
    # Callback for emitting metrics events. Signature: (event_name, event_data) -> None
    # Event names: "toin.compression", "toin.retrieval", "toin.recommendation", "toin.save"
    # This allows integration with Prometheus, StatsD, OpenTelemetry, etc.
    metrics_callback: MetricsCallback | None = None

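# --------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the published wheel): how a
# caller might wire TOINConfig, a MetricsCallback, and the record/recommend
# loop together. Only names defined in this module are real; the emit_to_logs
# callback, the storage path, and the smart_crusher_signature variable are
# hypothetical placeholders.
#
#     def emit_to_logs(event_name: str, event_data: dict[str, Any]) -> None:
#         # Forward TOIN events ("toin.compression", "toin.retrieval", ...)
#         # to any metrics backend; here they are simply logged.
#         logger.info("%s %s", event_name, event_data)
#
#     toin = ToolIntelligenceNetwork(
#         TOINConfig(storage_path="/tmp/toin.json", metrics_callback=emit_to_logs)
#     )
#
#     # After SmartCrusher compresses a tool output:
#     toin.record_compression(
#         tool_signature=smart_crusher_signature,  # a ToolSignature
#         original_count=120, compressed_count=15,
#         original_tokens=4000, compressed_tokens=600,
#         strategy="default",
#     )
#
#     # Before the next compression of the same tool shape:
#     hint = toin.get_recommendation(smart_crusher_signature)
#     if not hint.skip_compression:
#         max_items = hint.max_items
# --------------------------------------------------------------------------
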
class ToolIntelligenceNetwork:
    """Aggregates tool patterns across all Headroom users.

    This is the brain of TOIN. It maintains a database of learned patterns
    for different tool types and provides recommendations based on
    cross-user intelligence.

    Thread-safe for concurrent access.
    """

    def __init__(self, config: TOINConfig | None = None):
        """Initialize TOIN.

        Args:
            config: Configuration options.
        """
        self._config = config or TOINConfig()
        self._lock = threading.RLock()  # RLock for reentrant locking (save calls export_patterns)

        # Pattern database: structure_hash -> ToolPattern
        self._patterns: dict[str, ToolPattern] = {}

        # Instance ID for user counting (anonymized)
        # IMPORTANT: Must be STABLE across restarts to avoid false user count inflation
        # Derive from storage path if available, otherwise use machine-specific ID
        self._instance_id = self._generate_stable_instance_id()

        # Tracking
        self._last_save_time = time.time()
        self._dirty = False

        # Load existing data
        if self._config.storage_path:
            self._load_from_disk()

    def _generate_stable_instance_id(self) -> str:
        """Generate a stable instance ID that doesn't change across restarts.

        Uses storage path if available, otherwise uses machine-specific info.
        This prevents false user count inflation when reloading from disk.

        HIGH FIX: Instance ID collision risk
        Previously used SHA256[:8] (32 bits) which has 50% collision probability
        at sqrt(2^32) ≈ 65,536 users (birthday paradox). Increased to SHA256[:16]
        (64 bits) for 50% collision at ~4 billion users, which is acceptable.
        """
        if self._config.storage_path:
            # Derive from storage path - same path = same instance
            return hashlib.sha256(self._config.storage_path.encode()).hexdigest()[
                :16
            ]  # HIGH FIX: 64 bits instead of 32
        else:
            # No storage - use a combination of hostname and process info
            # This is less stable but better than pure random
            import os
            import socket

            machine_info = (
                f"{socket.gethostname()}:{os.getuid() if hasattr(os, 'getuid') else 'unknown'}"
            )
            return hashlib.sha256(machine_info.encode()).hexdigest()[:16]  # HIGH FIX: 64 bits

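    # Editor's note (worked check of the docstring above, not package code):
    # with an n-bit ID, the birthday bound puts a ~50% collision chance near
    # sqrt(2**n) users: sqrt(2**32) = 65,536 for SHA256[:8] (8 hex chars =
    # 32 bits), versus sqrt(2**64) ≈ 4.29e9 for SHA256[:16] (16 hex chars =
    # 64 bits), which is why the hash prefix length was doubled.
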
    def _emit_metric(self, event_name: str, event_data: dict[str, Any]) -> None:
        """Emit a metrics event via the configured callback.

        LOW FIX #22: Provides monitoring integration for external metrics systems.

        Args:
            event_name: Name of the event (e.g., "toin.compression").
            event_data: Dictionary of event data to emit.
        """
        if self._config.metrics_callback is not None:
            try:
                self._config.metrics_callback(event_name, event_data)
            except Exception as e:
                # Never let metrics callback failures break TOIN
                logger.debug(f"Metrics callback failed for {event_name}: {e}")

    def record_compression(
        self,
        tool_signature: ToolSignature,
        original_count: int,
        compressed_count: int,
        original_tokens: int,
        compressed_tokens: int,
        strategy: str,
        query_context: str | None = None,
        items: list[dict[str, Any]] | None = None,
    ) -> None:
        """Record a compression event.

        Called after SmartCrusher compresses data. Updates the pattern
        for this tool type.

        TOIN Evolution: When items are provided, we capture field statistics
        for learning semantic types (uniqueness, default values, etc.).

        Args:
            tool_signature: Signature of the tool output structure.
            original_count: Original number of items.
            compressed_count: Number of items after compression.
            original_tokens: Original token count.
            compressed_tokens: Compressed token count.
            strategy: Compression strategy used.
            query_context: Optional user query that triggered this tool call.
            items: Optional list of items being compressed for field-level learning.
        """
        # HIGH FIX: Check enabled FIRST to avoid computing structure_hash if disabled
        # This saves CPU when TOIN is turned off
        if not self._config.enabled:
            return

        # Computing structure_hash can be expensive for large structures
        sig_hash = tool_signature.structure_hash

        # LOW FIX #22: Emit compression metric
        self._emit_metric(
            "toin.compression",
            {
                "signature_hash": sig_hash,
                "original_count": original_count,
                "compressed_count": compressed_count,
                "original_tokens": original_tokens,
                "compressed_tokens": compressed_tokens,
                "strategy": strategy,
                "compression_ratio": compressed_count / original_count if original_count > 0 else 0,
            },
        )

        with self._lock:
            # Get or create pattern
            if sig_hash not in self._patterns:
                self._patterns[sig_hash] = ToolPattern(tool_signature_hash=sig_hash)

            pattern = self._patterns[sig_hash]

            # Update compression stats
            pattern.total_compressions += 1
            pattern.total_items_seen += original_count
            pattern.total_items_kept += compressed_count
            pattern.sample_size += 1

            # Update rolling averages
            n = pattern.total_compressions
            compression_ratio = compressed_count / original_count if original_count > 0 else 0.0
            token_reduction = (
                1 - (compressed_tokens / original_tokens) if original_tokens > 0 else 0.0
            )

            pattern.avg_compression_ratio = (
                pattern.avg_compression_ratio * (n - 1) + compression_ratio
            ) / n
            pattern.avg_token_reduction = (
                pattern.avg_token_reduction * (n - 1) + token_reduction
            ) / n

            # Update strategy stats
            if strategy not in pattern.strategy_success_rates:
                pattern.strategy_success_rates[strategy] = 1.0  # Start optimistic
            else:
                # Give a small boost for each compression without retrieval
                # This counteracts the penalty from record_retrieval() and prevents
                # all strategies from trending to 0.0 over time (one-way ratchet fix)
                # The boost is small (0.02) because retrieval penalties are larger (0.05-0.15)
                # This means strategies that cause retrievals will still trend down
                current_rate = pattern.strategy_success_rates[strategy]
                pattern.strategy_success_rates[strategy] = min(1.0, current_rate + 0.02)

            # HIGH FIX: Bound strategy_success_rates to prevent unbounded growth
            # Keep top 20 strategies by success rate
            if len(pattern.strategy_success_rates) > 20:
                sorted_strategies = sorted(
                    pattern.strategy_success_rates.items(),
                    key=lambda x: x[1],
                    reverse=True,
                )[:20]
                pattern.strategy_success_rates = dict(sorted_strategies)

            # Track unique users via instance_id
            # FIX: Use _all_seen_instances set for lookup to prevent double-counting
            # after the storage list hits its cap
            # CRITICAL FIX #1: Check cap before adding to prevent OOM
            if self._instance_id not in pattern._all_seen_instances:
                # CRITICAL FIX: Check if we can verify this is a new user
                # If tracking was truncated (users lost after restart), we can only
                # count new users if we can add them to _all_seen_instances for dedup
                can_track = len(pattern._all_seen_instances) < ToolPattern.MAX_SEEN_INSTANCES

                if can_track:
                    # Add to the lookup set - we can verify this is new
                    pattern._all_seen_instances.add(self._instance_id)
                    # Also add to storage list (capped at 100 for serialization)
                    if len(pattern._seen_instance_hashes) < 100:
                        pattern._seen_instance_hashes.append(self._instance_id)
                    # Safe to increment user_count - we verified it's new
                    pattern.user_count += 1
                elif not pattern._tracking_truncated:
                    # Tracking set is full but we weren't truncated before
                    # This is a truly new user beyond our tracking capacity
                    pattern.user_count += 1
                # else: Can't verify if new, skip incrementing to prevent double-count

            # Track query context patterns for learning (privacy-preserving)
            if query_context and len(query_context) >= 3:
                # Normalize and anonymize: extract keywords, remove values
                query_pattern = self._anonymize_query_pattern(query_context)
                if query_pattern:
                    # MEDIUM FIX #10: Track frequency to keep most common patterns
                    pattern.query_pattern_frequency[query_pattern] = (
                        pattern.query_pattern_frequency.get(query_pattern, 0) + 1
                    )
                    # Update the list to contain top patterns by frequency
                    if query_pattern not in pattern.common_query_patterns:
                        pattern.common_query_patterns.append(query_pattern)
                    # Keep only the most common patterns (by frequency)
                    if len(pattern.common_query_patterns) > self._config.max_query_patterns:
                        pattern.common_query_patterns = sorted(
                            pattern.common_query_patterns,
                            key=lambda p: pattern.query_pattern_frequency.get(p, 0),
                            reverse=True,
                        )[: self._config.max_query_patterns]
                    # Also limit the frequency dict
                    if len(pattern.query_pattern_frequency) > self._config.max_query_patterns * 2:
                        top_patterns = sorted(
                            pattern.query_pattern_frequency.items(),
                            key=lambda x: x[1],
                            reverse=True,
                        )[: self._config.max_query_patterns * 2]
                        pattern.query_pattern_frequency = dict(top_patterns)

            # Periodically update recommendations even without retrievals
            # This ensures optimal_strategy is updated based on success rates
            if pattern.total_compressions % 10 == 0:
                self._update_recommendations(pattern)

            # === TOIN Evolution: Field Statistics for Semantic Learning ===
            # Capture field-level statistics to learn default values and uniqueness
            if items:
                self._update_field_statistics(pattern, items)

            pattern.last_updated = time.time()
            pattern.confidence = self._calculate_confidence(pattern)
            self._dirty = True

        # Auto-save if needed (outside lock)
        self._maybe_auto_save()

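    # Editor's note (illustrative sketch, not package code): the rolling
    # averages above use the standard incremental mean
    #     new_avg = (old_avg * (n - 1) + x) / n.
    # Example with assumed numbers: after two calls with compression ratios
    # 0.10 (12 of 120 items kept) and 0.30, avg_compression_ratio becomes
    # (0.10 * 1 + 0.30) / 2 = 0.20; a third call at 0.20 keeps it at
    # (0.20 * 2 + 0.20) / 3 = 0.20.
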
    def _update_field_statistics(
        self,
        pattern: ToolPattern,
        items: list[dict[str, Any]],
    ) -> None:
        """Update field statistics from compression items.

        Captures uniqueness, default values, and value distribution for
        learning field semantic types.

        Args:
            pattern: ToolPattern to update.
            items: Items being compressed.
        """
        if not items:
            return

        # Analyze field statistics (sample up to 100 items to limit CPU)
        sample_items = items[:100] if len(items) > 100 else items

        # Collect values for each field
        field_values: dict[str, list[str]] = {}  # field_hash -> list of value_hashes

        for item in sample_items:
            if not isinstance(item, dict):
                continue

            for field_name, value in item.items():
                field_hash = self._hash_field_name(field_name)
                value_hash = self._hash_value(value)

                if field_hash not in field_values:
                    field_values[field_hash] = []
                field_values[field_hash].append(value_hash)

        # Update FieldSemantics with statistics
        for field_hash, values in field_values.items():
            if not values:
                continue

            # Get or create FieldSemantics
            if field_hash not in pattern.field_semantics:
                pattern.field_semantics[field_hash] = FieldSemantics(field_hash=field_hash)

            field_sem = pattern.field_semantics[field_hash]

            # Calculate statistics
            unique_values = len(set(values))
            total_values = len(values)

            # Find most common value
            from collections import Counter

            value_counts = Counter(values)
            most_common_value, most_common_count = value_counts.most_common(1)[0]
            most_common_frequency = most_common_count / total_values if total_values > 0 else 0.0

            # Record compression stats
            field_sem.record_compression_stats(
                unique_values=unique_values,
                total_values=total_values,
                most_common_value_hash=most_common_value,
                most_common_frequency=most_common_frequency,
            )

        # Bound field_semantics to prevent unbounded growth (max 100 fields)
        if len(pattern.field_semantics) > 100:
            # Keep fields with highest activity (retrieval + compression count)
            sorted_fields = sorted(
                pattern.field_semantics.items(),
                key=lambda x: x[1].retrieval_count + x[1].compression_count,
                reverse=True,
            )[:100]
            pattern.field_semantics = dict(sorted_fields)

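    # Editor's note (illustrative sketch, not package code): for a sampled
    # batch of 100 items where a hashed "status" field takes the value "ok"
    # 90 times and "error" 10 times, the statistics recorded above are
    # unique_values=2, total_values=100, most_common_frequency=0.9 - the kind
    # of near-constant field that FieldSemantics can learn to treat as a
    # default value.
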
    def record_retrieval(
        self,
        tool_signature_hash: str,
        retrieval_type: str,
        query: str | None = None,
        query_fields: list[str] | None = None,
        strategy: str | None = None,
        retrieved_items: list[dict[str, Any]] | None = None,
    ) -> None:
        """Record a retrieval event.

        Called when LLM retrieves compressed content. This is the key
        feedback signal - it means compression was too aggressive.

        TOIN Evolution: When retrieved_items are provided, we learn field
        semantics from the values. This enables zero-latency signal detection.

        Args:
            tool_signature_hash: Hash of the tool signature.
            retrieval_type: "full" or "search".
            query: Optional search query (will be anonymized).
            query_fields: Fields mentioned in query (will be hashed).
            strategy: Compression strategy that was used (for success rate tracking).
            retrieved_items: Optional list of retrieved items for field-level learning.
        """
        if not self._config.enabled:
            return

        # LOW FIX #22: Emit retrieval metric
        self._emit_metric(
            "toin.retrieval",
            {
                "signature_hash": tool_signature_hash,
                "retrieval_type": retrieval_type,
                "has_query": query is not None,
                "query_fields_count": len(query_fields) if query_fields else 0,
                "strategy": strategy,
            },
        )

        with self._lock:
            if tool_signature_hash not in self._patterns:
                # First time seeing this tool via retrieval
                self._patterns[tool_signature_hash] = ToolPattern(
                    tool_signature_hash=tool_signature_hash
                )

            pattern = self._patterns[tool_signature_hash]

            # Update retrieval stats
            pattern.total_retrievals += 1
            if retrieval_type == "full":
                pattern.full_retrievals += 1
            else:
                pattern.search_retrievals += 1

            # Update strategy success rates - retrieval means the strategy was TOO aggressive
            # Decrease success rate for this strategy
            if strategy and strategy in pattern.strategy_success_rates:
                # Exponential moving average: penalize strategies that trigger retrieval
                # Full retrievals are worse than search retrievals
                penalty = 0.15 if retrieval_type == "full" else 0.05
                current_rate = pattern.strategy_success_rates[strategy]
                pattern.strategy_success_rates[strategy] = max(0.0, current_rate - penalty)

            # Track queried fields (anonymized)
            if query_fields:
                for field_name in query_fields:
                    field_hash = self._hash_field_name(field_name)
                    pattern.field_retrieval_frequency[field_hash] = (
                        pattern.field_retrieval_frequency.get(field_hash, 0) + 1
                    )

                    # Update commonly retrieved fields
                    if field_hash not in pattern.commonly_retrieved_fields:
                        # Add if frequently retrieved (check count from dict)
                        freq = pattern.field_retrieval_frequency.get(field_hash, 0)
                        if freq >= 3:
                            pattern.commonly_retrieved_fields.append(field_hash)
                            # HIGH: Limit commonly_retrieved_fields to prevent unbounded growth
                            if len(pattern.commonly_retrieved_fields) > 20:
                                # Keep only the most frequently retrieved fields
                                sorted_fields = sorted(
                                    pattern.commonly_retrieved_fields,
                                    key=lambda f: pattern.field_retrieval_frequency.get(f, 0),
                                    reverse=True,
                                )
                                pattern.commonly_retrieved_fields = sorted_fields[:20]

                # HIGH: Limit field_retrieval_frequency dict to prevent unbounded growth
                if len(pattern.field_retrieval_frequency) > 100:
                    sorted_freq_items = sorted(
                        pattern.field_retrieval_frequency.items(),
                        key=lambda x: x[1],
                        reverse=True,
                    )[:100]
                    pattern.field_retrieval_frequency = dict(sorted_freq_items)

            # Track query patterns (anonymized)
            if query and self._config.anonymize_queries:
                query_pattern = self._anonymize_query_pattern(query)
                if query_pattern:
                    # MEDIUM FIX #10: Track frequency to keep most common patterns
                    pattern.query_pattern_frequency[query_pattern] = (
                        pattern.query_pattern_frequency.get(query_pattern, 0) + 1
                    )
                    if query_pattern not in pattern.common_query_patterns:
                        pattern.common_query_patterns.append(query_pattern)
                    # Keep only the most common patterns (by frequency)
                    if len(pattern.common_query_patterns) > self._config.max_query_patterns:
                        pattern.common_query_patterns = sorted(
                            pattern.common_query_patterns,
                            key=lambda p: pattern.query_pattern_frequency.get(p, 0),
                            reverse=True,
                        )[: self._config.max_query_patterns]

            # === TOIN Evolution: Field-Level Semantic Learning ===
            # Learn from retrieved items to build zero-latency signal detection
            if retrieved_items:
                # Extract query operator from query string (for learning)
                query_operator = self._extract_query_operator(query) if query else "="

                for item in retrieved_items:
                    if not isinstance(item, dict):
                        continue

                    for field_name, value in item.items():
                        field_hash = self._hash_field_name(field_name)

                        # Get or create FieldSemantics for this field
                        if field_hash not in pattern.field_semantics:
                            pattern.field_semantics[field_hash] = FieldSemantics(
                                field_hash=field_hash
                            )

                        field_sem = pattern.field_semantics[field_hash]

                        # Hash the value for privacy
                        value_hash = self._hash_value(value)

                        # Record this retrieval
                        field_sem.record_retrieval_value(value_hash, query_operator)

                # Periodically infer types (every 5 retrievals to save CPU)
                if pattern.total_retrievals % 5 == 0:
                    for field_sem in pattern.field_semantics.values():
                        if field_sem.retrieval_count >= 3:  # Need minimum data
                            field_sem.infer_type()

                # Bound field_semantics to prevent unbounded growth (max 100 fields)
                if len(pattern.field_semantics) > 100:
                    # Keep fields with highest retrieval counts
                    sorted_semantics = sorted(
                        pattern.field_semantics.items(),
                        key=lambda x: x[1].retrieval_count,
                        reverse=True,
                    )[:100]
                    pattern.field_semantics = dict(sorted_semantics)

            # Update recommendations based on new retrieval data
            self._update_recommendations(pattern)

            pattern.last_updated = time.time()
            self._dirty = True

        self._maybe_auto_save()

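    # Editor's note (worked example, not package code): a retrieval is the
    # "compression was too aggressive" signal. A single full retrieval drops
    # the responsible strategy's success rate from 1.0 to 0.85 (-0.15), a
    # search retrieval to 0.95 (-0.05); it then takes roughly eight
    # retrieval-free compressions (+0.02 each in record_compression) to climb
    # back to the 1.0 cap.
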
    def get_recommendation(
        self,
        tool_signature: ToolSignature,
        query_context: str | None = None,
    ) -> CompressionHint:
        """Get compression recommendation for a tool output.

        This is the main API for SmartCrusher to consult before compressing.

        Args:
            tool_signature: Signature of the tool output structure.
            query_context: User query for context-aware recommendations.

        Returns:
            CompressionHint with recommendations.
        """
        if not self._config.enabled:
            return CompressionHint(source="default", reason="TOIN disabled")

        sig_hash = tool_signature.structure_hash

        with self._lock:
            pattern = self._patterns.get(sig_hash)

            if pattern is None:
                # No data for this tool type
                return CompressionHint(
                    source="default",
                    reason="No pattern data for this tool type",
                )

            # Not enough samples for reliable recommendation
            if pattern.sample_size < self._config.min_samples_for_recommendation:
                hint = CompressionHint(
                    source="local",
                    reason=f"Only {pattern.sample_size} samples (need {self._config.min_samples_for_recommendation})",
                    confidence=pattern.confidence,
                    based_on_samples=pattern.sample_size,
                )
                # LOW FIX #22: Emit recommendation metric
                self._emit_metric(
                    "toin.recommendation",
                    {
                        "signature_hash": sig_hash,
                        "source": hint.source,
                        "confidence": hint.confidence,
                        "skip_compression": hint.skip_compression,
                        "max_items": hint.max_items,
                        "compression_level": hint.compression_level,
                        "based_on_samples": hint.based_on_samples,
                    },
                )
                return hint

            # Build recommendation based on learned patterns
            hint = self._build_recommendation(pattern, query_context)

            # LOW FIX #22: Emit recommendation metric
            self._emit_metric(
                "toin.recommendation",
                {
                    "signature_hash": sig_hash,
                    "source": hint.source,
                    "confidence": hint.confidence,
                    "skip_compression": hint.skip_compression,
                    "max_items": hint.max_items,
                    "compression_level": hint.compression_level,
                    "based_on_samples": hint.based_on_samples,
                },
            )
            return hint

    def _build_recommendation(
        self,
        pattern: ToolPattern,
        query_context: str | None,
    ) -> CompressionHint:
        """Build a recommendation based on pattern data and query context."""
        hint = CompressionHint(
            source="network"
            if pattern.user_count >= self._config.min_users_for_network_effect
            else "local",
            confidence=pattern.confidence,
            based_on_samples=pattern.sample_size,
        )

        retrieval_rate = pattern.retrieval_rate
        full_retrieval_rate = pattern.full_retrieval_rate

        # High retrieval rate = compression too aggressive
        if retrieval_rate > self._config.high_retrieval_threshold:
            if full_retrieval_rate > 0.8:
                # Almost all retrievals are full = don't compress
                hint.skip_compression = True
                hint.compression_level = "none"
                hint.reason = f"Very high full retrieval rate ({full_retrieval_rate:.1%})"
            else:
                # High retrieval but mostly search = compress conservatively
                hint.max_items = pattern.optimal_max_items
                hint.compression_level = "conservative"
                hint.reason = f"High retrieval rate ({retrieval_rate:.1%})"

        elif retrieval_rate > self._config.medium_retrieval_threshold:
            # Moderate retrieval = moderate compression
            hint.max_items = max(20, pattern.optimal_max_items)
            hint.compression_level = "moderate"
            hint.reason = f"Moderate retrieval rate ({retrieval_rate:.1%})"

        else:
            # Low retrieval = aggressive compression works
            hint.max_items = min(15, pattern.optimal_max_items)
            hint.compression_level = "aggressive"
            hint.reason = f"Low retrieval rate ({retrieval_rate:.1%})"

        # Build preserve_fields list weighted by retrieval frequency
        # Start with pattern's preserve_fields, then enhance based on query
        preserve_fields = pattern.preserve_fields.copy()
        query_fields_count = 0

        # If we have query context, extract field names and prioritize them
        if query_context and pattern.field_retrieval_frequency:
            # Extract field names from query context
            import re

            query_field_names = re.findall(r"(\w+)[=:]", query_context.lower())

            # Hash them and check if they're in our frequency data
            for field_name in query_field_names:
                field_hash = self._hash_field_name(field_name)
                if field_hash in pattern.field_retrieval_frequency:
                    # This field is known to be retrieved - prioritize it
                    if field_hash in preserve_fields:
                        # Move to front
                        preserve_fields.remove(field_hash)
                        preserve_fields.insert(0, field_hash)
                        query_fields_count += 1

        # Sort remaining fields by retrieval frequency (most frequent first)
        if pattern.field_retrieval_frequency and len(preserve_fields) > 1:
            # Separate query-mentioned fields (already at front) from others
            if query_fields_count < len(preserve_fields):
                rest = preserve_fields[query_fields_count:]
                rest.sort(
                    key=lambda f: pattern.field_retrieval_frequency.get(f, 0),
                    reverse=True,
                )
                preserve_fields = preserve_fields[:query_fields_count] + rest

        hint.preserve_fields = preserve_fields[:10]  # Limit to top 10

        # Use optimal strategy if known AND it has good success rate
        if pattern.optimal_strategy != "default":
            success_rate = pattern.strategy_success_rates.get(pattern.optimal_strategy, 1.0)
            # Only recommend strategy if success rate >= 0.5
            # Lower success rates mean this strategy often causes retrievals
            if success_rate >= 0.5:
                hint.recommended_strategy = pattern.optimal_strategy
            else:
                # Strategy has poor success rate - reduce confidence
                hint.confidence *= success_rate
                hint.reason += (
                    f" (strategy {pattern.optimal_strategy} has low success: {success_rate:.1%})"
                )
                # Try to find a better strategy
                best_strategy = self._find_best_strategy(pattern)
                if best_strategy and best_strategy != pattern.optimal_strategy:
                    hint.recommended_strategy = best_strategy
                    hint.reason += f", using {best_strategy} instead"

        # Boost max_items if query_context matches common retrieval patterns
        # This prevents unnecessary retrieval when we can predict what's needed
        if query_context:
            query_lower = query_context.lower()

            # Check for exhaustive query keywords that suggest user needs all data
            exhaustive_keywords = ["all", "every", "complete", "full", "entire", "list all"]
            if any(kw in query_lower for kw in exhaustive_keywords):
                # User likely needs more data - be conservative
                hint.max_items = max(hint.max_items, 40)
                hint.compression_level = "conservative"
                hint.reason += " (exhaustive query detected)"

            # Check against common retrieval patterns
            if pattern.common_query_patterns:
                query_pattern = self._anonymize_query_pattern(query_context)
                if query_pattern:
                    # Exact match
                    if query_pattern in pattern.common_query_patterns:
                        hint.max_items = max(hint.max_items, 30)
                        hint.reason += " (query matches retrieval pattern)"
                    else:
                        # Partial match: check if any stored pattern is contained in query
                        for stored_pattern in pattern.common_query_patterns:
                            # Check if key fields match (e.g., "status:*" in both)
                            stored_fields = {
                                f.split(":")[0] for f in stored_pattern.split() if ":" in f
                            }
                            query_fields = {
                                f.split(":")[0] for f in query_pattern.split() if ":" in f
                            }
                            # If query uses same fields as a problematic pattern, be conservative
                            if stored_fields and stored_fields.issubset(query_fields):
                                hint.max_items = max(hint.max_items, 25)
                                hint.reason += " (query uses fields from retrieval pattern)"
                                break

        # === TOIN Evolution: Include learned field semantics ===
        # Copy field_semantics with sufficient confidence for SmartCrusher to use
        # Only include fields with confidence >= 0.3 to reduce noise
        if pattern.field_semantics:
            hint.field_semantics = {
                field_hash: field_sem
                for field_hash, field_sem in pattern.field_semantics.items()
                if field_sem.confidence >= 0.3 or field_sem.retrieval_count >= 3
            }

        return hint

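    # Editor's note (summary of the logic above, not package code): with the
    # default thresholds (high_retrieval_threshold=0.5,
    # medium_retrieval_threshold=0.2) a pattern's retrieval_rate maps to:
    #   > 0.5 and full_retrieval_rate > 0.8  -> skip compression entirely
    #   > 0.5 otherwise                      -> "conservative", optimal_max_items
    #   0.2 - 0.5                            -> "moderate", at least 20 items
    #   <= 0.2                               -> "aggressive", at most 15 items
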
    def _find_best_strategy(self, pattern: ToolPattern) -> str | None:
        """Find the strategy with the best success rate.

        Returns None if no strategies have been tried or all have low success.
        """
        if not pattern.strategy_success_rates:
            return None

        # Find strategy with highest success rate above threshold
        best_strategy = None
        best_rate = 0.5  # Minimum acceptable rate

        for strategy, rate in pattern.strategy_success_rates.items():
            if rate > best_rate:
                best_rate = rate
                best_strategy = strategy

        return best_strategy

    def _update_recommendations(self, pattern: ToolPattern) -> None:
        """Update learned recommendations for a pattern."""
        # Calculate optimal max_items based on retrieval rate
        retrieval_rate = pattern.retrieval_rate

        if retrieval_rate > self._config.high_retrieval_threshold:
            if pattern.full_retrieval_rate > 0.8:
                pattern.skip_compression_recommended = True
                pattern.optimal_max_items = pattern.total_items_seen // max(
                    1, pattern.total_compressions
                )
            else:
                pattern.optimal_max_items = 50
        elif retrieval_rate > self._config.medium_retrieval_threshold:
            pattern.optimal_max_items = 30
        else:
            pattern.optimal_max_items = 20

        # Update preserve_fields from frequently retrieved fields
        if pattern.field_retrieval_frequency:
            # Get top 5 most retrieved fields
            sorted_fields = sorted(
                pattern.field_retrieval_frequency.items(),
                key=lambda x: x[1],
                reverse=True,
            )[:5]
            pattern.preserve_fields = [f for f, _ in sorted_fields]

        # Update optimal strategy (pick most successful)
        if pattern.strategy_success_rates:
            best_strategy = max(
                pattern.strategy_success_rates.items(),
                key=lambda x: x[1],
            )[0]
            pattern.optimal_strategy = best_strategy

    def _calculate_confidence(self, pattern: ToolPattern) -> float:
        """Calculate confidence level for a pattern."""
        # Base confidence on sample size
        sample_confidence = min(0.7, pattern.sample_size / 100)

        # Boost if from multiple users
        # FIX: Changed from `user_count / 10 * 0.1` (= user_count * 0.01, too small)
        # to `user_count * 0.03` for meaningful boost at low user counts
        # - 3 users: 0.09 boost
        # - 10 users: 0.30 boost (capped)
        user_boost = 0.0
        if pattern.user_count >= self._config.min_users_for_network_effect:
            user_boost = min(0.3, pattern.user_count * 0.03)

        return min(0.95, sample_confidence + user_boost)

    def _hash_field_name(self, field_name: str) -> str:
        """Hash a field name for anonymization."""
        return hashlib.sha256(field_name.encode()).hexdigest()[:8]

    def _anonymize_query_pattern(self, query: str) -> str | None:
        """Extract anonymized pattern from a query.

        Keeps structural patterns, removes specific values.
        E.g., "status:error AND user:john" -> "status:* AND user:*"
        """
        if not query:
            return None

        # Simple pattern extraction: replace values after : or =
        import re

        # Match field:value or field="value" patterns, but don't include spaces in unquoted values
        pattern = re.sub(r'(\w+)[=:](?:"[^"]*"|\'[^\']*\'|\w+)', r"\1:*", query)

        # Remove if it's just generic
        if pattern in ("*", ""):
            return None

        return pattern

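    # Editor's note (worked example, not package code): the substitution above
    # keeps query structure and drops values, e.g.
    #     _anonymize_query_pattern('status:error AND user:john')
    # returns 'status:* AND user:*', and a quoted value such as
    # name="Jane Doe" collapses to name:* as a single token.
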
1113
|
+
def _hash_value(self, value: Any) -> str:
|
|
1114
|
+
"""Hash a value for privacy-preserving storage.
|
|
1115
|
+
|
|
1116
|
+
Handles all types by converting to a canonical string representation.
|
|
1117
|
+
"""
|
|
1118
|
+
if value is None:
|
|
1119
|
+
canonical = "null"
|
|
1120
|
+
elif isinstance(value, bool):
|
|
1121
|
+
canonical = "true" if value else "false"
|
|
1122
|
+
elif isinstance(value, (int, float)):
|
|
1123
|
+
canonical = str(value)
|
|
1124
|
+
elif isinstance(value, str):
|
|
1125
|
+
canonical = value
|
|
1126
|
+
elif isinstance(value, (list, dict)):
|
|
1127
|
+
# For complex types, use JSON serialization
|
|
1128
|
+
try:
|
|
1129
|
+
canonical = json.dumps(value, sort_keys=True, default=str)
|
|
1130
|
+
except (TypeError, ValueError):
|
|
1131
|
+
canonical = str(value)
|
|
1132
|
+
else:
|
|
1133
|
+
canonical = str(value)
|
|
1134
|
+
|
|
1135
|
+
return hashlib.sha256(canonical.encode()).hexdigest()[:8]

    def _extract_query_operator(self, query: str) -> str:
        """Extract the dominant query operator from a search query.

        Used for learning field semantic types from query patterns.

        Returns:
            One of "=", "!=", ">", "<", ">=", "<=", or "contains";
            defaults to "=" when no operator is detected.
        """
        if not query:
            return "="

        query_lower = query.lower()

        # Check for inequality operators
        if "!=" in query or " not " in query_lower or " ne " in query_lower:
            return "!="
        if ">=" in query or " gte " in query_lower:
            return ">="
        if "<=" in query or " lte " in query_lower:
            return "<="
        if ">" in query or " gt " in query_lower:
            return ">"
        if "<" in query or " lt " in query_lower:
            return "<"

        # Check for text search operators
        if " like " in query_lower or " contains " in query_lower or "*" in query:
            return "contains"

        # Default to equality
        return "="

    def get_stats(self) -> dict[str, Any]:
        """Get overall TOIN statistics."""
        with self._lock:
            total_compressions = sum(p.total_compressions for p in self._patterns.values())
            total_retrievals = sum(p.total_retrievals for p in self._patterns.values())

            return {
                "enabled": self._config.enabled,
                "patterns_tracked": len(self._patterns),
                "total_compressions": total_compressions,
                "total_retrievals": total_retrievals,
                "global_retrieval_rate": (
                    total_retrievals / total_compressions if total_compressions > 0 else 0.0
                ),
                "patterns_with_recommendations": sum(
                    1
                    for p in self._patterns.values()
                    if p.sample_size >= self._config.min_samples_for_recommendation
                ),
            }
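
    # --- Illustrative usage sketch (not part of the package source); the
    # --- numbers are invented for illustration.
    #
    #   stats = get_toin().get_stats()
    #   # {"enabled": True, "patterns_tracked": 12, "total_compressions": 340,
    #   #  "total_retrievals": 51, "global_retrieval_rate": 0.15,
    #   #  "patterns_with_recommendations": 4}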

    def get_pattern(self, signature_hash: str) -> ToolPattern | None:
        """Get pattern data for a specific tool signature.

        HIGH FIX: Returns a deep copy to prevent external mutation of internal state.
        """
        import copy

        with self._lock:
            pattern = self._patterns.get(signature_hash)
            if pattern is not None:
                return copy.deepcopy(pattern)
            return None
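
    # --- Illustrative sketch (not part of the package source): because a deep
    # --- copy is returned, callers get a snapshot they can mutate freely.
    #
    #   snap = get_toin().get_pattern("ab12cd34")   # any known signature hash
    #   if snap is not None:
    #       snap.preserve_fields.append("debug_only")   # mutates the copy only;
    #       # the pattern stored inside TOIN is unaffected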

    def export_patterns(self) -> dict[str, Any]:
        """Export all patterns for sharing/aggregation."""
        with self._lock:
            return {
                "version": "1.0",
                "export_timestamp": time.time(),
                "instance_id": self._instance_id,
                "patterns": {
                    sig_hash: pattern.to_dict() for sig_hash, pattern in self._patterns.items()
                },
            }

    def import_patterns(self, data: dict[str, Any]) -> None:
        """Import patterns from another source.

        Used for federated learning: aggregate patterns from multiple
        Headroom instances without sharing actual data.

        Args:
            data: Exported pattern data.
        """
        if not self._config.enabled:
            return

        patterns_data = data.get("patterns", {})
        source_instance = data.get("instance_id", "unknown")

        with self._lock:
            for sig_hash, pattern_dict in patterns_data.items():
                imported = ToolPattern.from_dict(pattern_dict)

                if sig_hash in self._patterns:
                    # Merge with existing
                    self._merge_patterns(self._patterns[sig_hash], imported)
                else:
                    # Add new pattern - need to track source instance
                    self._patterns[sig_hash] = imported

                    # For NEW patterns from another instance, track the source in
                    # _seen_instance_hashes so user_count reflects cross-user data
                    if source_instance != self._instance_id:
                        pattern = self._patterns[sig_hash]
                        if source_instance not in pattern._seen_instance_hashes:
                            # Limit storage to 100 unique instances to bound memory
                            if len(pattern._seen_instance_hashes) < 100:
                                pattern._seen_instance_hashes.append(source_instance)
                            # CRITICAL: Always increment user_count (even after cap)
                            pattern.user_count += 1

            self._dirty = True
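
    # --- Illustrative federated round trip (not part of the package source),
    # --- assuming two independently constructed ToolIntelligenceNetwork
    # --- instances; only hashed identifiers, counts, and ratios travel.
    #
    #   payload = json.dumps(instance_a.export_patterns())
    #   instance_b.import_patterns(json.loads(payload))   # merge or add patterns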

    def _merge_patterns(self, existing: ToolPattern, imported: ToolPattern) -> None:
        """Merge imported pattern into existing."""
        total = existing.sample_size + imported.sample_size
        if total == 0:
            return

        w_existing = existing.sample_size / total
        w_imported = imported.sample_size / total

        # Merge counts
        existing.total_compressions += imported.total_compressions
        existing.total_retrievals += imported.total_retrievals
        existing.full_retrievals += imported.full_retrievals
        existing.search_retrievals += imported.search_retrievals
        existing.total_items_seen += imported.total_items_seen
        existing.total_items_kept += imported.total_items_kept

        # Weighted averages
        existing.avg_compression_ratio = (
            existing.avg_compression_ratio * w_existing
            + imported.avg_compression_ratio * w_imported
        )
        existing.avg_token_reduction = (
            existing.avg_token_reduction * w_existing + imported.avg_token_reduction * w_imported
        )

        # Merge field frequencies
        for field_hash, count in imported.field_retrieval_frequency.items():
            existing.field_retrieval_frequency[field_hash] = (
                existing.field_retrieval_frequency.get(field_hash, 0) + count
            )
        # HIGH: Limit field_retrieval_frequency dict to prevent unbounded growth
        if len(existing.field_retrieval_frequency) > 100:
            # Keep only the most frequently retrieved fields
            sorted_fields = sorted(
                existing.field_retrieval_frequency.items(),
                key=lambda x: x[1],
                reverse=True,
            )[:100]
            existing.field_retrieval_frequency = dict(sorted_fields)

        # Merge commonly retrieved fields
        for field_hash in imported.commonly_retrieved_fields:
            if field_hash not in existing.commonly_retrieved_fields:
                existing.commonly_retrieved_fields.append(field_hash)
        # HIGH: Limit commonly_retrieved_fields to prevent unbounded growth
        if len(existing.commonly_retrieved_fields) > 20:
            # Prioritize by retrieval frequency if available
            if existing.field_retrieval_frequency:
                existing.commonly_retrieved_fields = sorted(
                    existing.commonly_retrieved_fields,
                    key=lambda f: existing.field_retrieval_frequency.get(f, 0),
                    reverse=True,
                )[:20]
            else:
                existing.commonly_retrieved_fields = existing.commonly_retrieved_fields[:20]

        # Merge query patterns (for federated learning)
        # MEDIUM FIX #10: Also merge query_pattern_frequency for proper ranking
        for query_pattern, freq in imported.query_pattern_frequency.items():
            existing.query_pattern_frequency[query_pattern] = (
                existing.query_pattern_frequency.get(query_pattern, 0) + freq
            )
        for query_pattern in imported.common_query_patterns:
            if query_pattern not in existing.common_query_patterns:
                existing.common_query_patterns.append(query_pattern)
        # Keep only the most common patterns (by frequency)
        if len(existing.common_query_patterns) > self._config.max_query_patterns:
            existing.common_query_patterns = sorted(
                existing.common_query_patterns,
                key=lambda p: existing.query_pattern_frequency.get(p, 0),
                reverse=True,
            )[: self._config.max_query_patterns]
        # Limit frequency dict
        if len(existing.query_pattern_frequency) > self._config.max_query_patterns * 2:
            top_patterns = sorted(
                existing.query_pattern_frequency.items(),
                key=lambda x: x[1],
                reverse=True,
            )[: self._config.max_query_patterns * 2]
            existing.query_pattern_frequency = dict(top_patterns)

        # Merge strategy success rates (weighted average)
        for strategy, rate in imported.strategy_success_rates.items():
            if strategy in existing.strategy_success_rates:
                existing.strategy_success_rates[strategy] = (
                    existing.strategy_success_rates[strategy] * w_existing + rate * w_imported
                )
            else:
                existing.strategy_success_rates[strategy] = rate

        # HIGH FIX: Bound strategy_success_rates after merge
        if len(existing.strategy_success_rates) > 20:
            sorted_strategies = sorted(
                existing.strategy_success_rates.items(),
                key=lambda x: x[1],
                reverse=True,
            )[:20]
            existing.strategy_success_rates = dict(sorted_strategies)

        # Merge preserve_fields (union of both, deduplicated)
        for preserve_field in imported.preserve_fields:
            if preserve_field not in existing.preserve_fields:
                existing.preserve_fields.append(preserve_field)
        # Keep only top 10 most important fields
        if len(existing.preserve_fields) > 10:
            # Prioritize by retrieval frequency if available
            if existing.field_retrieval_frequency:
                existing.preserve_fields = sorted(
                    existing.preserve_fields,
                    key=lambda f: existing.field_retrieval_frequency.get(f, 0),
                    reverse=True,
                )[:10]
            else:
                existing.preserve_fields = existing.preserve_fields[:10]

        # Merge skip_compression_recommended (true if either recommends skip)
        if imported.skip_compression_recommended:
            # Imported has more data suggesting skip - consider it
            if imported.sample_size > existing.sample_size // 2:
                existing.skip_compression_recommended = True

        # Merge optimal_strategy (prefer the one with better success rate)
        if imported.optimal_strategy != "default":
            imported_rate = imported.strategy_success_rates.get(imported.optimal_strategy, 0.5)
            existing_rate = (
                existing.strategy_success_rates.get(existing.optimal_strategy, 0.5)
                if existing.optimal_strategy != "default"
                else 0.0
            )

            if imported_rate > existing_rate:
                existing.optimal_strategy = imported.optimal_strategy

        # Merge optimal_max_items (weighted average with bounds)
        if imported.optimal_max_items > 0:
            merged_max_items = int(
                existing.optimal_max_items * w_existing + imported.optimal_max_items * w_imported
            )
            # Ensure valid bounds: min 3 items, max 1000 items
            existing.optimal_max_items = max(3, min(1000, merged_max_items))

        existing.sample_size = total

        # Merge seen instance hashes (union of both, limited to 100 for storage)
        # CRITICAL FIX #1 & #3: Simplified user count merge logic with cap enforcement.
        # user_count is the authoritative count even when sets hit their caps.
        new_users_found = 0
        for instance_hash in imported._seen_instance_hashes:
            # Use _all_seen_instances for deduplication (the authoritative set)
            if instance_hash not in existing._all_seen_instances:
                # Add to lookup set (with cap to prevent OOM)
                if len(existing._all_seen_instances) < ToolPattern.MAX_SEEN_INSTANCES:
                    existing._all_seen_instances.add(instance_hash)
                # Limit storage list to 100 unique instances to bound serialization
                if len(existing._seen_instance_hashes) < 100:
                    existing._seen_instance_hashes.append(instance_hash)
                new_users_found += 1

        # Also merge instances from imported._all_seen_instances that weren't in list
        # (in case imported had more than 100 instances)
        for instance_hash in imported._all_seen_instances:
            if instance_hash not in existing._all_seen_instances:
                # Add with cap check
                if len(existing._all_seen_instances) < ToolPattern.MAX_SEEN_INSTANCES:
                    existing._all_seen_instances.add(instance_hash)
                # Storage list already at limit, just track for dedup
                new_users_found += 1

        # CRITICAL FIX #3: Simplified user count calculation.
        # We count new users from both the list and set, then add any users
        # that imported had beyond what we could deduplicate (when both hit caps).
        # imported.user_count may be > len(imported._all_seen_instances) if they hit cap
        users_beyond_imported_tracking = max(
            0, imported.user_count - len(imported._all_seen_instances)
        )
        existing.user_count += new_users_found + users_beyond_imported_tracking

        existing.last_updated = time.time()

        # Recalculate recommendations based on merged data
        self._update_recommendations(existing)
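
    # --- Illustrative arithmetic (not part of the package source) for the
    # --- weighted merge above:
    #
    #   existing.sample_size = 30, imported.sample_size = 10  ->  total = 40
    #   w_existing = 0.75, w_imported = 0.25
    #   avg_compression_ratio: 0.40 * 0.75 + 0.60 * 0.25 = 0.45
    #   optimal_max_items:     int(20 * 0.75 + 60 * 0.25) = 30, clamped to [3, 1000]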

    def save(self) -> None:
        """Save TOIN data to disk with atomic write.

        Uses a temporary file and rename to ensure atomicity.
        If the write fails, the original file is preserved.

        HIGH FIX: Serialize under lock but write outside lock to prevent
        blocking other threads during slow file I/O.
        """
        if not self._config.storage_path:
            return

        import tempfile

        # Step 1: Serialize under lock (fast in-memory operation)
        with self._lock:
            data = self.export_patterns()

        # Step 2: Write outside lock (slow I/O operation)
        path = Path(self._config.storage_path)

        try:
            # Create parent directories if needed
            path.parent.mkdir(parents=True, exist_ok=True)

            # Serialize to string (outside lock but before file ops)
            json_data = json.dumps(data, indent=2)

            # Write to temporary file first (atomic write pattern)
            # Use same directory to ensure same filesystem for rename
            fd, tmp_path = tempfile.mkstemp(dir=path.parent, prefix=".toin_", suffix=".tmp")
            try:
                with open(fd, "w") as f:
                    f.write(json_data)

                # Atomic rename (on POSIX systems)
                Path(tmp_path).replace(path)

            except Exception:
                # Clean up temp file on failure
                try:
                    Path(tmp_path).unlink()
                except OSError:
                    pass
                raise

            # Step 3: Update state under lock (fast)
            with self._lock:
                self._dirty = False
                self._last_save_time = time.time()

        except OSError as e:
            # Log error but don't crash - TOIN should be resilient
            logger.warning(f"Failed to save TOIN data: {e}")
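
    # --- Illustrative sketch (not part of the package source): the same
    # --- write-temp-then-rename pattern in isolation. Path.replace() is atomic
    # --- on POSIX when source and target share a filesystem, which is why the
    # --- temp file is created next to the target.
    #
    #   import json, tempfile
    #   from pathlib import Path
    #
    #   def atomic_write_json(path: Path, data: dict) -> None:
    #       fd, tmp = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
    #       try:
    #           with open(fd, "w") as f:
    #               f.write(json.dumps(data, indent=2))
    #           Path(tmp).replace(path)
    #       except Exception:
    #           Path(tmp).unlink(missing_ok=True)
    #           raise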

    def _load_from_disk(self) -> None:
        """Load TOIN data from disk."""
        if not self._config.storage_path:
            return

        path = Path(self._config.storage_path)
        if not path.exists():
            return

        try:
            with open(path) as f:
                data = json.load(f)
            self.import_patterns(data)
            self._dirty = False
        except (json.JSONDecodeError, OSError):
            pass  # Start fresh if corrupted

    def _maybe_auto_save(self) -> None:
        """Auto-save if enough time has passed.

        HIGH FIX: Check conditions under lock to prevent race where another
        thread modifies _dirty or _last_save_time between check and save.
        The save() method already acquires the lock, and we use RLock so
        it's safe to hold the lock when calling save().
        """
        if not self._config.storage_path or not self._config.auto_save_interval:
            return

        # Check under lock to prevent race conditions
        with self._lock:
            if not self._dirty:
                return

            elapsed = time.time() - self._last_save_time
            if elapsed >= self._config.auto_save_interval:
                # save() uses the same RLock, so this is safe
                self.save()

    def clear(self) -> None:
        """Clear all TOIN data. Mainly for testing."""
        with self._lock:
            self._patterns.clear()
            self._dirty = False


# Global TOIN instance (lazy initialization)
_toin_instance: ToolIntelligenceNetwork | None = None
_toin_lock = threading.Lock()


def get_toin(config: TOINConfig | None = None) -> ToolIntelligenceNetwork:
    """Get the global TOIN instance.

    Thread-safe singleton pattern. Always acquires lock to avoid subtle
    race conditions in double-checked locking on non-CPython implementations.

    Args:
        config: Configuration (only used on first call). If the instance
            already exists, config is ignored and a warning is logged.

    Returns:
        Global ToolIntelligenceNetwork instance.
    """
    global _toin_instance

    # CRITICAL FIX: Always acquire lock for thread safety across all Python
    # implementations. The overhead is negligible since we only construct once.
    with _toin_lock:
        if _toin_instance is None:
            _toin_instance = ToolIntelligenceNetwork(config)
        elif config is not None:
            # Warn when config is silently ignored
            logger.warning(
                "TOIN config ignored: instance already exists. "
                "Call reset_toin() first if you need to change config."
            )

        return _toin_instance
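
# --- Illustrative sketch (not part of the package source): the singleton in use.
#
#   toin = get_toin()            # first call constructs the shared instance
#   assert get_toin() is toin    # later calls return the same object; passing a
#                                # config now only triggers the warning above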


def reset_toin() -> None:
    """Reset the global TOIN instance. Mainly for testing."""
    global _toin_instance

    with _toin_lock:
        if _toin_instance is not None:
            _toin_instance.clear()
        _toin_instance = None