headroom_ai-0.2.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/telemetry/models.py
@@ -0,0 +1,880 @@
"""Data models for privacy-preserving telemetry.

These models capture PATTERNS, not DATA. We never store actual values,
user queries, or identifiable information.
"""

from __future__ import annotations

import hashlib
import json
from dataclasses import dataclass, field
from typing import Any, Literal

# Type alias for field semantic types
FieldSemanticType = Literal[
    "unknown",
    "identifier",
    "error_indicator",
    "score",
    "status",
    "temporal",
    "content",
]


@dataclass
class FieldDistribution:
    """Statistics about a field's distribution (no actual values).

    This captures the SHAPE of the data, not the data itself.
    """

    field_name_hash: str  # SHA256[:8] of field name (anonymized)
    field_type: Literal["string", "numeric", "boolean", "array", "object", "null", "mixed"]

    # String field statistics
    avg_length: float | None = None
    unique_ratio: float | None = None  # 0.0 = constant, 1.0 = all unique
    entropy: float | None = None  # Shannon entropy normalized to [0, 1]
    looks_like_id: bool = False  # High entropy + consistent format

    # Numeric field statistics
    has_variance: bool = False
    variance_bucket: Literal["zero", "low", "medium", "high"] | None = None
    has_negative: bool = False
    is_integer: bool = True
    has_outliers: bool = False  # Values > 2σ from mean

    # Array field statistics
    avg_array_length: float | None = None

    # Derived insights
    is_likely_score: bool = False  # Monotonic, bounded, high variance
    is_likely_timestamp: bool = False  # Sequential, numeric, consistent intervals
    is_likely_status: bool = False  # Low cardinality categorical

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "field_name_hash": self.field_name_hash,
            "field_type": self.field_type,
            "avg_length": self.avg_length,
            "unique_ratio": self.unique_ratio,
            "entropy": self.entropy,
            "looks_like_id": self.looks_like_id,
            "has_variance": self.has_variance,
            "variance_bucket": self.variance_bucket,
            "has_negative": self.has_negative,
            "is_integer": self.is_integer,
            "has_outliers": self.has_outliers,
            "avg_array_length": self.avg_array_length,
            "is_likely_score": self.is_likely_score,
            "is_likely_timestamp": self.is_likely_timestamp,
            "is_likely_status": self.is_likely_status,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FieldDistribution:
        """Create from dictionary."""
        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
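The dataclass above only stores the shape of a field; computing those statistics is left to the caller. As a minimal sketch (not part of the packaged module, helper name hypothetical), the string statistics could be derived from one column of values, following the SHA256[:8] field-name hashing noted in the field comments:

import hashlib
import math
from collections import Counter


def summarize_string_field(field_name: str, values: list[str]) -> FieldDistribution:
    """Hypothetical helper: build a FieldDistribution for a string column."""
    if not values:
        raise ValueError("need at least one value")
    counts = Counter(values)
    total = len(values)
    # Shannon entropy of the value distribution, normalized to [0, 1]
    raw_entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())
    max_entropy = math.log2(len(counts)) if len(counts) > 1 else 1.0
    return FieldDistribution(
        field_name_hash=hashlib.sha256(field_name.encode()).hexdigest()[:8],
        field_type="string",
        avg_length=sum(len(v) for v in values) / total,
        unique_ratio=len(counts) / total,
        entropy=raw_entropy / max_entropy,
    )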
@dataclass
class ToolSignature:
    """Anonymized signature of a tool's output structure.

    This identifies SIMILAR tools across users without revealing tool names.
    Two tools with the same field structure will have the same signature.
    """

    # Structural hash (based on field types and names)
    # MEDIUM FIX #15: Uses SHA256[:24] (96 bits) for better collision resistance
    structure_hash: str  # SHA256[:24] of sorted field names + types

    # Schema characteristics
    field_count: int
    has_nested_objects: bool
    has_arrays: bool
    max_depth: int

    # Field type distribution
    string_field_count: int = 0
    numeric_field_count: int = 0
    boolean_field_count: int = 0
    array_field_count: int = 0
    object_field_count: int = 0

    # Pattern indicators (without revealing actual field names)
    has_id_like_field: bool = False
    has_score_like_field: bool = False
    has_timestamp_like_field: bool = False
    has_status_like_field: bool = False
    has_error_like_field: bool = False
    has_message_like_field: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "structure_hash": self.structure_hash,
            "field_count": self.field_count,
            "has_nested_objects": self.has_nested_objects,
            "has_arrays": self.has_arrays,
            "max_depth": self.max_depth,
            "string_field_count": self.string_field_count,
            "numeric_field_count": self.numeric_field_count,
            "boolean_field_count": self.boolean_field_count,
            "array_field_count": self.array_field_count,
            "object_field_count": self.object_field_count,
            "has_id_like_field": self.has_id_like_field,
            "has_score_like_field": self.has_score_like_field,
            "has_timestamp_like_field": self.has_timestamp_like_field,
            "has_status_like_field": self.has_status_like_field,
            "has_error_like_field": self.has_error_like_field,
            "has_message_like_field": self.has_message_like_field,
        }

    @staticmethod
    def _calculate_depth(value: Any, current_depth: int = 1, max_depth_limit: int = 10) -> int:
        """Recursively calculate the depth of a nested structure.

        MEDIUM FIX #12: Actually calculate max_depth instead of hardcoding 1.
        """
        if current_depth >= max_depth_limit:
            return current_depth  # Prevent infinite recursion

        if isinstance(value, dict):
            if not value:
                return current_depth
            return max(
                ToolSignature._calculate_depth(v, current_depth + 1, max_depth_limit)
                for v in value.values()
            )
        elif isinstance(value, list):
            if not value:
                return current_depth
            # Sample first few items in arrays to avoid O(n) traversal
            sample_items = value[:3]
            return max(
                ToolSignature._calculate_depth(item, current_depth + 1, max_depth_limit)
                for item in sample_items
            )
        else:
            return current_depth

    @staticmethod
    def _matches_pattern(
        key_lower: str, patterns: list[str], original_key: str | None = None
    ) -> bool:
        """Check if key matches patterns using word boundary matching.

        MEDIUM FIX #14: Prevent false positives like "hidden" matching "id".
        Uses word boundary logic: pattern must be at start/end or surrounded by
        non-alphanumeric characters (underscore, hyphen, or boundary).

        Args:
            key_lower: The field name in lowercase
            patterns: List of patterns to match against
            original_key: The original field name (for camelCase detection)
        """
        import re

        for pattern in patterns:
            # Exact match
            if key_lower == pattern:
                return True

            # Pattern at start with delimiter: "id_something" or "id-something"
            if key_lower.startswith(pattern + "_") or key_lower.startswith(pattern + "-"):
                return True

            # Pattern at end with delimiter: "user_id" or "user-id"
            if key_lower.endswith("_" + pattern) or key_lower.endswith("-" + pattern):
                return True

            # Pattern in middle with delimiters: "some_id_field"
            if f"_{pattern}_" in key_lower or f"-{pattern}-" in key_lower:
                return True
            if f"_{pattern}-" in key_lower or f"-{pattern}_" in key_lower:
                return True

            # camelCase detection: Look for capitalized pattern in original key
            # e.g., "userId" should match "id" (as "Id")
            if original_key:
                # Pattern capitalized (e.g., "Id" for "id")
                cap_pattern = pattern.capitalize()
                # Look for capital letter at start of pattern, preceded by lowercase
                camel_regex = rf"(?<=[a-z]){re.escape(cap_pattern)}(?=[A-Z]|$)"
                if re.search(camel_regex, original_key):
                    return True

        return False

    @classmethod
    def from_items(cls, items: list[dict[str, Any]]) -> ToolSignature:
        """Create signature from sample items."""
        if not items:
            # HIGH FIX: Generate unique hash for empty outputs to prevent
            # different tools' empty responses from colliding into one pattern.
            # Use a random component to ensure uniqueness across tool types.
            import uuid

            # MEDIUM FIX #15: Use 24 chars (96 bits) instead of 16 (64 bits) to reduce collision risk
            empty_hash = hashlib.sha256(f"empty:{uuid.uuid4()}".encode()).hexdigest()[:24]
            return cls(
                structure_hash=empty_hash,
                field_count=0,
                has_nested_objects=False,
                has_arrays=False,
                max_depth=0,
            )

        # MEDIUM FIX #13: Analyze multiple items (up to 5) to get representative structure
        # This catches cases where items have varying schemas
        sample_items = items[:5] if len(items) >= 5 else items

        # Merge field info from all sampled items
        all_fields: dict[str, set[str]] = {}  # field_name -> set of types seen
        for item in sample_items:
            if not isinstance(item, dict):
                continue
            for key, value in item.items():
                if key not in all_fields:
                    all_fields[key] = set()
                # Determine type
                if isinstance(value, str):
                    all_fields[key].add("string")
                elif isinstance(value, bool):
                    all_fields[key].add("boolean")
                elif isinstance(value, (int, float)):
                    all_fields[key].add("numeric")
                elif isinstance(value, list):
                    all_fields[key].add("array")
                elif isinstance(value, dict):
                    all_fields[key].add("object")
                else:
                    all_fields[key].add("null")

        # Build field_info with most common type per field
        field_info: list[tuple[str, str]] = []
        string_count = 0
        numeric_count = 0
        boolean_count = 0
        array_count = 0
        object_count = 0
        has_nested = False
        has_arrays = False

        # MEDIUM FIX #12: Calculate actual max_depth from sampled items
        max_depth = 1
        for item in sample_items:
            if isinstance(item, dict):
                item_depth = cls._calculate_depth(item)
                max_depth = max(max_depth, item_depth)

        # Pattern detection (heuristic field name matching)
        has_id = False
        has_score = False
        has_timestamp = False
        has_status = False
        has_error = False
        has_message = False

        for key, types in all_fields.items():
            key_lower = key.lower()

            # Use most specific type if multiple seen (prefer non-null)
            types_no_null = types - {"null"}
            if len(types_no_null) == 1:
                field_type = types_no_null.pop()
            elif len(types_no_null) > 1:
                # Multiple types seen - mark as mixed but pick one for counting
                # Priority: object > array > string > numeric > boolean
                for t in ["object", "array", "string", "numeric", "boolean"]:
                    if t in types_no_null:
                        field_type = t
                        break
                else:
                    field_type = "mixed"
            elif types:
                field_type = types.pop()  # Only null seen
            else:
                field_type = "null"

            # Count field types
            if field_type == "string":
                string_count += 1
            elif field_type == "boolean":
                boolean_count += 1
            elif field_type == "numeric":
                numeric_count += 1
            elif field_type == "array":
                array_count += 1
                has_arrays = True
            elif field_type == "object":
                object_count += 1
                has_nested = True

            field_info.append((key, field_type))

            # MEDIUM FIX #14: Pattern detection with word boundary matching
            # Prevents false positives like "hidden" matching "id"
            # Pass original key for camelCase detection
            if cls._matches_pattern(key_lower, ["id", "uuid", "guid"], key) or key_lower.endswith(
                "key"
            ):
                has_id = True
            if cls._matches_pattern(
                key_lower, ["score", "rank", "rating", "relevance", "priority"], key
            ):
                has_score = True
            if (
                cls._matches_pattern(key_lower, ["time", "date", "timestamp"], key)
                or key_lower.endswith("_at")
                or key_lower in ["created", "updated"]
            ):
                has_timestamp = True
            if cls._matches_pattern(key_lower, ["status", "state"], key) or key_lower in [
                "level",
                "type",
                "kind",
            ]:
                has_status = True
            if cls._matches_pattern(key_lower, ["error", "exception", "fail", "warning"], key):
                has_error = True
            if cls._matches_pattern(
                key_lower, ["message", "msg", "text", "content", "body", "description"], key
            ):
                has_message = True

        # Create structure hash
        # MEDIUM FIX #15: Use 24 chars (96 bits) instead of 16 (64 bits) for collision resistance
        sorted_fields = sorted(field_info)
        hash_input = json.dumps(sorted_fields, sort_keys=True)
        structure_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:24]

        return cls(
            structure_hash=structure_hash,
            field_count=len(field_info),
            has_nested_objects=has_nested,
            has_arrays=has_arrays,
            max_depth=max_depth,
            string_field_count=string_count,
            numeric_field_count=numeric_count,
            boolean_field_count=boolean_count,
            array_field_count=array_count,
            object_field_count=object_count,
            has_id_like_field=has_id,
            has_score_like_field=has_score,
            has_timestamp_like_field=has_timestamp,
            has_status_like_field=has_status,
            has_error_like_field=has_error,
            has_message_like_field=has_message,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ToolSignature:
        """Create from dictionary."""
        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
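For illustration, a short usage sketch of from_items (the sample items are made up): two tool responses with the same field names and types collapse to the same structure hash even though their values differ, which is what lets similar tools be grouped without revealing names or data.

items_a = [{"id": "a1", "score": 0.92, "text": "first match"}]
items_b = [{"id": "zz9", "score": 0.11, "text": "another match"}]

sig_a = ToolSignature.from_items(items_a)
sig_b = ToolSignature.from_items(items_b)

assert sig_a.structure_hash == sig_b.structure_hash  # same field names + types
assert sig_a.has_id_like_field and sig_a.has_score_like_field and sig_a.has_message_like_field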
@dataclass
class FieldSemantics:
    """Learned semantics for a field based on retrieval patterns.

    This is the evolution of TOIN - we learn WHAT fields mean
    from HOW users retrieve them. No hardcoded patterns, no assumptions.

    Learning process:
    1. User retrieves items where field X has value Y
    2. TOIN records: field_hash, value_hash, retrieval context
    3. After N retrievals, TOIN infers: "This field behaves like an error indicator"
    4. SmartCrusher uses this learned signal (O(1) lookup, zero latency)

    Privacy: All field names and values are hashed (SHA256[:8]).
    """

    field_hash: str  # SHA256[:8] of field name

    # Inferred semantic type (learned from retrieval patterns, NOT hardcoded)
    # These are behavioral categories, not syntactic patterns:
    # - "identifier": Users query by exact value (e.g., "show me item X")
    # - "error_indicator": Users retrieve when value != most common value
    # - "score": Users retrieve top-N by this field
    # - "status": Low cardinality, specific values trigger retrieval
    # - "temporal": Users query by time ranges
    # - "content": Users do text search on this field
    inferred_type: FieldSemanticType = "unknown"

    confidence: float = 0.0  # 0.0 = no data, 1.0 = high confidence

    # Value patterns (all hashed for privacy)
    # important_value_hashes: values that triggered retrieval
    # default_value_hash: most common value (probably NOT important)
    important_value_hashes: list[str] = field(default_factory=list)
    default_value_hash: str | None = None
    value_retrieval_frequency: dict[str, int] = field(default_factory=dict)  # value_hash -> count

    # Value statistics (for inferring type)
    total_unique_values_seen: int = 0
    total_values_seen: int = 0
    most_common_value_frequency: float = 0.0  # Fraction of items with most common value

    # Query patterns (anonymized)
    # Tracks HOW users query this field (equals, not-equals, greater-than, etc.)
    query_operator_frequency: dict[str, int] = field(default_factory=dict)  # operator -> count

    # Learning metadata
    retrieval_count: int = 0
    compression_count: int = 0  # How many times we've seen this field in compression
    last_updated: float = 0.0

    # Bounds for memory management
    MAX_IMPORTANT_VALUES: int = 50
    MAX_VALUE_FREQUENCY_ENTRIES: int = 100

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "field_hash": self.field_hash,
            "inferred_type": self.inferred_type,
            "confidence": self.confidence,
            "important_value_hashes": self.important_value_hashes[: self.MAX_IMPORTANT_VALUES],
            "default_value_hash": self.default_value_hash,
            "value_retrieval_frequency": dict(
                sorted(
                    self.value_retrieval_frequency.items(),
                    key=lambda x: x[1],
                    reverse=True,
                )[: self.MAX_VALUE_FREQUENCY_ENTRIES]
            ),
            "total_unique_values_seen": self.total_unique_values_seen,
            "total_values_seen": self.total_values_seen,
            "most_common_value_frequency": self.most_common_value_frequency,
            "query_operator_frequency": self.query_operator_frequency,
            "retrieval_count": self.retrieval_count,
            "compression_count": self.compression_count,
            "last_updated": self.last_updated,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FieldSemantics:
        """Create from dictionary."""
        # Filter to valid fields only
        valid_fields = {
            "field_hash",
            "inferred_type",
            "confidence",
            "important_value_hashes",
            "default_value_hash",
            "value_retrieval_frequency",
            "total_unique_values_seen",
            "total_values_seen",
            "most_common_value_frequency",
            "query_operator_frequency",
            "retrieval_count",
            "compression_count",
            "last_updated",
        }
        filtered = {k: v for k, v in data.items() if k in valid_fields}
        return cls(**filtered)

    def record_retrieval_value(self, value_hash: str, operator: str = "=") -> None:
        """Record that a value was retrieved for this field.

        Args:
            value_hash: SHA256[:8] hash of the retrieved value.
            operator: Query operator used ("=", "!=", ">", "<", "contains", etc.)
        """
        import time

        self.retrieval_count += 1
        self.last_updated = time.time()

        # Track value frequency
        self.value_retrieval_frequency[value_hash] = (
            self.value_retrieval_frequency.get(value_hash, 0) + 1
        )

        # Bound the frequency dict
        if len(self.value_retrieval_frequency) > self.MAX_VALUE_FREQUENCY_ENTRIES:
            sorted_items = sorted(
                self.value_retrieval_frequency.items(),
                key=lambda x: x[1],
                reverse=True,
            )[: self.MAX_VALUE_FREQUENCY_ENTRIES]
            self.value_retrieval_frequency = dict(sorted_items)

        # Track important values (values that get retrieved)
        if value_hash not in self.important_value_hashes:
            self.important_value_hashes.append(value_hash)
            if len(self.important_value_hashes) > self.MAX_IMPORTANT_VALUES:
                # Keep most frequently retrieved values
                self.important_value_hashes = sorted(
                    self.important_value_hashes,
                    key=lambda v: self.value_retrieval_frequency.get(v, 0),
                    reverse=True,
                )[: self.MAX_IMPORTANT_VALUES]

        # Track query operators
        self.query_operator_frequency[operator] = self.query_operator_frequency.get(operator, 0) + 1

    def record_compression_stats(
        self,
        unique_values: int,
        total_values: int,
        most_common_value_hash: str | None,
        most_common_frequency: float,
    ) -> None:
        """Record statistics from compression for type inference.

        Args:
            unique_values: Number of unique values seen for this field.
            total_values: Total number of items with this field.
            most_common_value_hash: Hash of the most common value.
            most_common_frequency: Fraction of items with the most common value.
        """
        import time

        self.compression_count += 1
        self.last_updated = time.time()

        # Update rolling statistics
        n = self.compression_count
        self.total_unique_values_seen = int(
            (self.total_unique_values_seen * (n - 1) + unique_values) / n
        )
        self.total_values_seen = int((self.total_values_seen * (n - 1) + total_values) / n)
        self.most_common_value_frequency = (
            self.most_common_value_frequency * (n - 1) + most_common_frequency
        ) / n

        # Track default value (most common)
        if most_common_value_hash and most_common_frequency > 0.5:
            self.default_value_hash = most_common_value_hash

    def infer_type(self) -> None:
        """Infer semantic type from accumulated statistics.

        This is the learning algorithm - purely data-driven, no hardcoded patterns.
        """
        # Need minimum data to infer
        min_retrievals = 3
        min_compressions = 2

        if self.retrieval_count < min_retrievals or self.compression_count < min_compressions:
            self.inferred_type = "unknown"
            self.confidence = 0.0
            return

        # Calculate metrics
        uniqueness_ratio = self.total_unique_values_seen / max(1, self.total_values_seen)
        has_dominant_default = self.most_common_value_frequency > 0.7
        retrieval_diversity = len(self.value_retrieval_frequency) / max(1, self.retrieval_count)

        # Check query operator patterns
        total_ops = sum(self.query_operator_frequency.values())
        equals_ratio = self.query_operator_frequency.get("=", 0) / max(1, total_ops)
        range_ratio = (
            self.query_operator_frequency.get(">", 0)
            + self.query_operator_frequency.get("<", 0)
            + self.query_operator_frequency.get(">=", 0)
            + self.query_operator_frequency.get("<=", 0)
        ) / max(1, total_ops)
        contains_ratio = self.query_operator_frequency.get("contains", 0) / max(1, total_ops)

        # Inference logic (data-driven, no field name patterns)
        inferred: FieldSemanticType = "unknown"
        confidence = 0.0

        # IDENTIFIER: High uniqueness + exact match queries
        if uniqueness_ratio > 0.8 and equals_ratio > 0.7:
            inferred = "identifier"
            confidence = min(0.9, uniqueness_ratio * equals_ratio)

        # ERROR_INDICATOR: Has dominant default + retrievals are for non-default values
        elif has_dominant_default and self.default_value_hash:
            # Check if retrieved values are different from default
            default_retrieval_count = self.value_retrieval_frequency.get(self.default_value_hash, 0)
            non_default_retrieval_ratio = 1 - (
                default_retrieval_count / max(1, self.retrieval_count)
            )
            if non_default_retrieval_ratio > 0.7:
                inferred = "error_indicator"
                confidence = min(
                    0.9, non_default_retrieval_ratio * self.most_common_value_frequency
                )

        # STATUS: Low uniqueness + specific values retrieved
        elif uniqueness_ratio < 0.2 and retrieval_diversity < 0.5:
            inferred = "status"
            confidence = min(0.85, (1 - uniqueness_ratio) * (1 - retrieval_diversity))

        # SCORE: Range queries or sorted access patterns
        elif range_ratio > 0.5:
            inferred = "score"
            confidence = min(0.85, range_ratio)

        # TEMPORAL: Range queries + high uniqueness (likely timestamps)
        elif range_ratio > 0.3 and uniqueness_ratio > 0.7:
            inferred = "temporal"
            confidence = min(0.8, range_ratio * uniqueness_ratio)

        # CONTENT: Contains/text search queries
        elif contains_ratio > 0.5:
            inferred = "content"
            confidence = min(0.85, contains_ratio)

        # Apply minimum confidence threshold
        if confidence < 0.3:
            inferred = "unknown"
            confidence = 0.0

        self.inferred_type = inferred
        self.confidence = confidence

    def is_value_important(self, value_hash: str) -> bool:
        """Check if a specific value is considered important.

        A value is important if:
        1. It's in the important_value_hashes list (has been retrieved)
        2. It's NOT the default value (for error_indicator type)

        Args:
            value_hash: SHA256[:8] hash of the value to check.

        Returns:
            True if this value should be preserved during compression.
        """
        # If we don't have enough data, be conservative
        if self.confidence < 0.3:
            return False

        # For error_indicator: non-default values are important
        if self.inferred_type == "error_indicator":
            if self.default_value_hash and value_hash != self.default_value_hash:
                return True

        # For any type: values that have been retrieved are important
        if value_hash in self.important_value_hashes:
            return True

        # For status: check if this value has been retrieved
        if self.inferred_type == "status":
            return value_hash in self.value_retrieval_frequency

        return False
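To make the learning process described in the docstring concrete, here is a small end-to-end sketch; the value-hashing helper is an assumption following the SHA256[:8] convention above. A low-cardinality field dominated by one value, where users only ever retrieve the non-default values, ends up inferred as an error indicator.

import hashlib


def _h(value: object) -> str:
    # Assumed value-hashing convention: SHA256 truncated to 8 hex chars
    return hashlib.sha256(str(value).encode()).hexdigest()[:8]


sem = FieldSemantics(field_hash=_h("status"))

# Compression passes keep seeing ~3 unique values, 90% of them "ok".
for _ in range(3):
    sem.record_compression_stats(
        unique_values=3,
        total_values=100,
        most_common_value_hash=_h("ok"),
        most_common_frequency=0.9,
    )

# Retrievals only ever target the non-default values.
for value in ("error", "timeout", "error", "error"):
    sem.record_retrieval_value(_h(value), operator="=")

sem.infer_type()
assert sem.inferred_type == "error_indicator"
assert sem.is_value_important(_h("error")) and not sem.is_value_important(_h("ok"))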
@dataclass
class CompressionEvent:
    """Record of a single compression decision (anonymized).

    This captures WHAT happened, not WHAT the data was.
    """

    # Tool identification (anonymized)
    tool_signature: ToolSignature

    # Compression metrics
    original_item_count: int
    compressed_item_count: int
    compression_ratio: float  # compressed / original
    original_tokens: int
    compressed_tokens: int
    token_reduction_ratio: float  # 1 - (compressed / original)

    # Strategy used
    strategy: str  # "top_n", "time_series", "smart_sample", "skip", etc.
    strategy_reason: str | None = None  # "high_variance", "has_score_field", etc.

    # Crushability analysis results
    crushability_score: float | None = None  # 0.0 = don't crush, 1.0 = safe to crush
    crushability_reason: str | None = None

    # Field distributions (anonymized)
    field_distributions: list[FieldDistribution] = field(default_factory=list)

    # What was preserved
    kept_first_n: int = 0
    kept_last_n: int = 0
    kept_errors: int = 0
    kept_anomalies: int = 0
    kept_by_relevance: int = 0
    kept_by_score: int = 0

    # Timing
    timestamp: float = 0.0
    processing_time_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "tool_signature": self.tool_signature.to_dict(),
            "original_item_count": self.original_item_count,
            "compressed_item_count": self.compressed_item_count,
            "compression_ratio": self.compression_ratio,
            "original_tokens": self.original_tokens,
            "compressed_tokens": self.compressed_tokens,
            "token_reduction_ratio": self.token_reduction_ratio,
            "strategy": self.strategy,
            "strategy_reason": self.strategy_reason,
            "crushability_score": self.crushability_score,
            "crushability_reason": self.crushability_reason,
            "field_distributions": [f.to_dict() for f in self.field_distributions],
            "kept_first_n": self.kept_first_n,
            "kept_last_n": self.kept_last_n,
            "kept_errors": self.kept_errors,
            "kept_anomalies": self.kept_anomalies,
            "kept_by_relevance": self.kept_by_relevance,
            "kept_by_score": self.kept_by_score,
            "timestamp": self.timestamp,
            "processing_time_ms": self.processing_time_ms,
        }
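A minimal sketch of how a compression pass might record one such event; the sample items, token counts, and chosen strategy here are made up for illustration and follow the strategy names listed in the field comments above.

import time

items = [{"id": f"row-{i}", "score": i / 10} for i in range(50)]
event = CompressionEvent(
    tool_signature=ToolSignature.from_items(items),
    original_item_count=50,
    compressed_item_count=10,
    compression_ratio=10 / 50,
    original_tokens=4000,
    compressed_tokens=900,
    token_reduction_ratio=1 - 900 / 4000,
    strategy="top_n",
    strategy_reason="has_score_field",
    kept_by_score=10,
    timestamp=time.time(),
)
payload = event.to_dict()  # hashes, counts, and ratios only: no raw values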
@dataclass
class RetrievalStats:
    """Aggregated retrieval statistics for a tool signature.

    This tracks how often compression decisions needed correction.
    """

    tool_signature_hash: str  # Reference to ToolSignature.structure_hash

    # Retrieval counts
    total_compressions: int = 0
    total_retrievals: int = 0
    full_retrievals: int = 0  # Retrieved everything
    search_retrievals: int = 0  # Used search filter

    # Derived metrics
    @property
    def retrieval_rate(self) -> float:
        """Fraction of compressions that triggered retrieval."""
        if self.total_compressions == 0:
            return 0.0
        return self.total_retrievals / self.total_compressions

    @property
    def full_retrieval_rate(self) -> float:
        """Fraction of retrievals that were full (not search)."""
        if self.total_retrievals == 0:
            return 0.0
        return self.full_retrievals / self.total_retrievals

    # Query pattern analysis (no actual queries, just patterns)
    query_field_frequency: dict[str, int] = field(default_factory=dict)  # field_hash -> count

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "tool_signature_hash": self.tool_signature_hash,
            "total_compressions": self.total_compressions,
            "total_retrievals": self.total_retrievals,
            "full_retrievals": self.full_retrievals,
            "search_retrievals": self.search_retrievals,
            "retrieval_rate": self.retrieval_rate,
            "full_retrieval_rate": self.full_retrieval_rate,
            "query_field_frequency": self.query_field_frequency,
        }
@dataclass
class AnonymizedToolStats:
    """Complete anonymized statistics for a tool type.

    This is what gets aggregated across users to build the data flywheel.
    """

    # Tool identification
    signature: ToolSignature

    # Compression statistics
    total_compressions: int = 0
    total_items_seen: int = 0
    total_items_kept: int = 0
    avg_compression_ratio: float = 0.0
    avg_token_reduction: float = 0.0

    # Strategy distribution
    strategy_counts: dict[str, int] = field(default_factory=dict)  # strategy -> count
    strategy_success_rate: dict[str, float] = field(
        default_factory=dict
    )  # strategy -> success rate

    # Retrieval statistics
    retrieval_stats: RetrievalStats | None = None

    # Learned optimal settings
    recommended_min_items: int | None = None
    recommended_preserve_fields: list[str] = field(default_factory=list)  # field hashes
    skip_compression_recommended: bool = False

    # Confidence in recommendations
    sample_size: int = 0
    confidence: float = 0.0  # 0.0 = no confidence, 1.0 = high confidence

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "signature": self.signature.to_dict(),
            "total_compressions": self.total_compressions,
            "total_items_seen": self.total_items_seen,
            "total_items_kept": self.total_items_kept,
            "avg_compression_ratio": self.avg_compression_ratio,
            "avg_token_reduction": self.avg_token_reduction,
            "strategy_counts": self.strategy_counts,
            "strategy_success_rate": self.strategy_success_rate,
            "retrieval_stats": self.retrieval_stats.to_dict() if self.retrieval_stats else None,
            "recommended_min_items": self.recommended_min_items,
            "recommended_preserve_fields": self.recommended_preserve_fields,
            "skip_compression_recommended": self.skip_compression_recommended,
            "sample_size": self.sample_size,
            "confidence": self.confidence,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> AnonymizedToolStats:
        """Create from dictionary.

        Note: This method does not mutate the input dictionary.
        """
        # Use .get() instead of .pop() to avoid mutating input
        signature_data = data.get("signature", {})
        signature = ToolSignature.from_dict(signature_data)

        retrieval_data = data.get("retrieval_stats")
        retrieval_stats = None
        if retrieval_data:
            # Copy query_field_frequency to avoid mutation issues
            query_freq = retrieval_data.get("query_field_frequency", {})
            retrieval_stats = RetrievalStats(
                tool_signature_hash=retrieval_data.get("tool_signature_hash", ""),
                total_compressions=retrieval_data.get("total_compressions", 0),
                total_retrievals=retrieval_data.get("total_retrievals", 0),
                full_retrievals=retrieval_data.get("full_retrievals", 0),
                search_retrievals=retrieval_data.get("search_retrievals", 0),
                query_field_frequency=dict(query_freq) if query_freq else {},
            )

        # Filter to only dataclass fields, excluding signature and retrieval_stats
        # which we've already handled
        excluded_keys = {"signature", "retrieval_stats"}
        filtered_data: dict[str, Any] = {}
        for k, v in data.items():
            if k not in cls.__dataclass_fields__ or k in excluded_keys:
                continue
            # Deep copy mutable values to avoid corruption if caller modifies input
            if isinstance(v, dict):
                filtered_data[k] = dict(v)
            elif isinstance(v, list):
                filtered_data[k] = list(v)  # type: ignore[assignment]
            else:
                filtered_data[k] = v

        return cls(
            signature=signature,
            retrieval_stats=retrieval_stats,
            **filtered_data,  # type: ignore[arg-type]
        )
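Finally, a serialization round-trip sketch for the aggregate record above (sample values are illustrative): from_dict rebuilds the nested ToolSignature and copies mutable fields rather than mutating or sharing its input.

stats = AnonymizedToolStats(
    signature=ToolSignature.from_items([{"id": "x", "status": "ok"}]),
    total_compressions=12,
    avg_token_reduction=0.63,
    strategy_counts={"top_n": 9, "skip": 3},
    sample_size=12,
    confidence=0.7,
)
data = stats.to_dict()
restored = AnonymizedToolStats.from_dict(data)
assert restored.signature.structure_hash == stats.signature.structure_hash
assert restored.strategy_counts == {"top_n": 9, "skip": 3}
assert restored.strategy_counts is not stats.strategy_counts  # copied, not shared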