headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/telemetry/models.py
@@ -0,0 +1,880 @@
+ """Data models for privacy-preserving telemetry.
+
+ These models capture PATTERNS, not DATA. We never store actual values,
+ user queries, or identifiable information.
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import json
+ from dataclasses import dataclass, field
+ from typing import Any, Literal
+
+ # Type alias for field semantic types
+ FieldSemanticType = Literal[
+     "unknown",
+     "identifier",
+     "error_indicator",
+     "score",
+     "status",
+     "temporal",
+     "content",
+ ]
+
+
+ @dataclass
+ class FieldDistribution:
+     """Statistics about a field's distribution (no actual values).
+
+     This captures the SHAPE of the data, not the data itself.
+     """
+
+     field_name_hash: str  # SHA256[:8] of field name (anonymized)
+     field_type: Literal["string", "numeric", "boolean", "array", "object", "null", "mixed"]
+
+     # String field statistics
+     avg_length: float | None = None
+     unique_ratio: float | None = None  # 0.0 = constant, 1.0 = all unique
+     entropy: float | None = None  # Shannon entropy normalized to [0, 1]
+     looks_like_id: bool = False  # High entropy + consistent format
+
+     # Numeric field statistics
+     has_variance: bool = False
+     variance_bucket: Literal["zero", "low", "medium", "high"] | None = None
+     has_negative: bool = False
+     is_integer: bool = True
+     has_outliers: bool = False  # Values > 2σ from mean
+
+     # Array field statistics
+     avg_array_length: float | None = None
+
+     # Derived insights
+     is_likely_score: bool = False  # Monotonic, bounded, high variance
+     is_likely_timestamp: bool = False  # Sequential, numeric, consistent intervals
+     is_likely_status: bool = False  # Low cardinality categorical
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "field_name_hash": self.field_name_hash,
+             "field_type": self.field_type,
+             "avg_length": self.avg_length,
+             "unique_ratio": self.unique_ratio,
+             "entropy": self.entropy,
+             "looks_like_id": self.looks_like_id,
+             "has_variance": self.has_variance,
+             "variance_bucket": self.variance_bucket,
+             "has_negative": self.has_negative,
+             "is_integer": self.is_integer,
+             "has_outliers": self.has_outliers,
+             "avg_array_length": self.avg_array_length,
+             "is_likely_score": self.is_likely_score,
+             "is_likely_timestamp": self.is_likely_timestamp,
+             "is_likely_status": self.is_likely_status,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> FieldDistribution:
+         """Create from dictionary."""
+         return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+
+
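A quick illustration of the conventions above (editor's sketch, not code shipped in the wheel): field names are hashed with SHA256 and truncated to 8 hex characters before anything is recorded, and to_dict()/from_dict() round-trip losslessly. The field name and statistics below are invented.

import hashlib

from headroom.telemetry.models import FieldDistribution

field_name = "latency_ms"  # hypothetical field name; only its hash is ever stored
dist = FieldDistribution(
    field_name_hash=hashlib.sha256(field_name.encode()).hexdigest()[:8],
    field_type="numeric",
    has_variance=True,
    variance_bucket="high",
    is_integer=False,
)
assert FieldDistribution.from_dict(dist.to_dict()) == dist  # lossless round trip
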
+ @dataclass
+ class ToolSignature:
+     """Anonymized signature of a tool's output structure.
+
+     This identifies SIMILAR tools across users without revealing tool names.
+     Two tools with the same field structure will have the same signature.
+     """
+
+     # Structural hash (based on field types and names)
+     # MEDIUM FIX #15: Uses SHA256[:24] (96 bits) for better collision resistance
+     structure_hash: str  # SHA256[:24] of sorted field names + types
+
+     # Schema characteristics
+     field_count: int
+     has_nested_objects: bool
+     has_arrays: bool
+     max_depth: int
+
+     # Field type distribution
+     string_field_count: int = 0
+     numeric_field_count: int = 0
+     boolean_field_count: int = 0
+     array_field_count: int = 0
+     object_field_count: int = 0
+
+     # Pattern indicators (without revealing actual field names)
+     has_id_like_field: bool = False
+     has_score_like_field: bool = False
+     has_timestamp_like_field: bool = False
+     has_status_like_field: bool = False
+     has_error_like_field: bool = False
+     has_message_like_field: bool = False
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "structure_hash": self.structure_hash,
+             "field_count": self.field_count,
+             "has_nested_objects": self.has_nested_objects,
+             "has_arrays": self.has_arrays,
+             "max_depth": self.max_depth,
+             "string_field_count": self.string_field_count,
+             "numeric_field_count": self.numeric_field_count,
+             "boolean_field_count": self.boolean_field_count,
+             "array_field_count": self.array_field_count,
+             "object_field_count": self.object_field_count,
+             "has_id_like_field": self.has_id_like_field,
+             "has_score_like_field": self.has_score_like_field,
+             "has_timestamp_like_field": self.has_timestamp_like_field,
+             "has_status_like_field": self.has_status_like_field,
+             "has_error_like_field": self.has_error_like_field,
+             "has_message_like_field": self.has_message_like_field,
+         }
+
+     @staticmethod
+     def _calculate_depth(value: Any, current_depth: int = 1, max_depth_limit: int = 10) -> int:
+         """Recursively calculate the depth of a nested structure.
+
+         MEDIUM FIX #12: Actually calculate max_depth instead of hardcoding 1.
+         """
+         if current_depth >= max_depth_limit:
+             return current_depth  # Prevent infinite recursion
+
+         if isinstance(value, dict):
+             if not value:
+                 return current_depth
+             return max(
+                 ToolSignature._calculate_depth(v, current_depth + 1, max_depth_limit)
+                 for v in value.values()
+             )
+         elif isinstance(value, list):
+             if not value:
+                 return current_depth
+             # Sample first few items in arrays to avoid O(n) traversal
+             sample_items = value[:3]
+             return max(
+                 ToolSignature._calculate_depth(item, current_depth + 1, max_depth_limit)
+                 for item in sample_items
+             )
+         else:
+             return current_depth
+
+     @staticmethod
+     def _matches_pattern(
+         key_lower: str, patterns: list[str], original_key: str | None = None
+     ) -> bool:
+         """Check if key matches patterns using word boundary matching.
+
+         MEDIUM FIX #14: Prevent false positives like "hidden" matching "id".
+         Uses word boundary logic: pattern must be at start/end or surrounded by
+         non-alphanumeric characters (underscore, hyphen, or boundary).
+
+         Args:
+             key_lower: The field name in lowercase
+             patterns: List of patterns to match against
+             original_key: The original field name (for camelCase detection)
+         """
+         import re
+
+         for pattern in patterns:
+             # Exact match
+             if key_lower == pattern:
+                 return True
+
+             # Pattern at start with delimiter: "id_something" or "id-something"
+             if key_lower.startswith(pattern + "_") or key_lower.startswith(pattern + "-"):
+                 return True
+
+             # Pattern at end with delimiter: "user_id" or "user-id"
+             if key_lower.endswith("_" + pattern) or key_lower.endswith("-" + pattern):
+                 return True
+
+             # Pattern in middle with delimiters: "some_id_field"
+             if f"_{pattern}_" in key_lower or f"-{pattern}-" in key_lower:
+                 return True
+             if f"_{pattern}-" in key_lower or f"-{pattern}_" in key_lower:
+                 return True
+
+             # camelCase detection: Look for capitalized pattern in original key
+             # e.g., "userId" should match "id" (as "Id")
+             if original_key:
+                 # Pattern capitalized (e.g., "Id" for "id")
+                 cap_pattern = pattern.capitalize()
+                 # Look for capital letter at start of pattern, preceded by lowercase
+                 camel_regex = rf"(?<=[a-z]){re.escape(cap_pattern)}(?=[A-Z]|$)"
+                 if re.search(camel_regex, original_key):
+                     return True
+
+         return False
+
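As a sketch of the word-boundary behaviour just described (illustrative only, not part of the wheel): delimited or camelCase occurrences of a pattern match, while a plain substring such as "id" inside "hidden" does not. The keys are invented.

from headroom.telemetry.models import ToolSignature

assert ToolSignature._matches_pattern("user_id", ["id"], "user_id") is True
assert ToolSignature._matches_pattern("userid", ["id"], "userId") is True   # camelCase "userId"
assert ToolSignature._matches_pattern("hidden", ["id"], "hidden") is False  # no "id" word boundary
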
+     @classmethod
+     def from_items(cls, items: list[dict[str, Any]]) -> ToolSignature:
+         """Create signature from sample items."""
+         if not items:
+             # HIGH FIX: Generate unique hash for empty outputs to prevent
+             # different tools' empty responses from colliding into one pattern.
+             # Use a random component to ensure uniqueness across tool types.
+             import uuid
+
+             # MEDIUM FIX #15: Use 24 chars (96 bits) instead of 16 (64 bits) to reduce collision risk
+             empty_hash = hashlib.sha256(f"empty:{uuid.uuid4()}".encode()).hexdigest()[:24]
+             return cls(
+                 structure_hash=empty_hash,
+                 field_count=0,
+                 has_nested_objects=False,
+                 has_arrays=False,
+                 max_depth=0,
+             )
+
+         # MEDIUM FIX #13: Analyze multiple items (up to 5) to get representative structure
+         # This catches cases where items have varying schemas
+         sample_items = items[:5] if len(items) >= 5 else items
+
+         # Merge field info from all sampled items
+         all_fields: dict[str, set[str]] = {}  # field_name -> set of types seen
+         for item in sample_items:
+             if not isinstance(item, dict):
+                 continue
+             for key, value in item.items():
+                 if key not in all_fields:
+                     all_fields[key] = set()
+                 # Determine type
+                 if isinstance(value, str):
+                     all_fields[key].add("string")
+                 elif isinstance(value, bool):
+                     all_fields[key].add("boolean")
+                 elif isinstance(value, (int, float)):
+                     all_fields[key].add("numeric")
+                 elif isinstance(value, list):
+                     all_fields[key].add("array")
+                 elif isinstance(value, dict):
+                     all_fields[key].add("object")
+                 else:
+                     all_fields[key].add("null")
+
+         # Build field_info with most common type per field
+         field_info: list[tuple[str, str]] = []
+         string_count = 0
+         numeric_count = 0
+         boolean_count = 0
+         array_count = 0
+         object_count = 0
+         has_nested = False
+         has_arrays = False
+
+         # MEDIUM FIX #12: Calculate actual max_depth from sampled items
+         max_depth = 1
+         for item in sample_items:
+             if isinstance(item, dict):
+                 item_depth = cls._calculate_depth(item)
+                 max_depth = max(max_depth, item_depth)
+
+         # Pattern detection (heuristic field name matching)
+         has_id = False
+         has_score = False
+         has_timestamp = False
+         has_status = False
+         has_error = False
+         has_message = False
+
+         for key, types in all_fields.items():
+             key_lower = key.lower()
+
+             # Use most specific type if multiple seen (prefer non-null)
+             types_no_null = types - {"null"}
+             if len(types_no_null) == 1:
+                 field_type = types_no_null.pop()
+             elif len(types_no_null) > 1:
+                 # Multiple types seen - mark as mixed but pick one for counting
+                 # Priority: object > array > string > numeric > boolean
+                 for t in ["object", "array", "string", "numeric", "boolean"]:
+                     if t in types_no_null:
+                         field_type = t
+                         break
+                 else:
+                     field_type = "mixed"
+             elif types:
+                 field_type = types.pop()  # Only null seen
+             else:
+                 field_type = "null"
+
+             # Count field types
+             if field_type == "string":
+                 string_count += 1
+             elif field_type == "boolean":
+                 boolean_count += 1
+             elif field_type == "numeric":
+                 numeric_count += 1
+             elif field_type == "array":
+                 array_count += 1
+                 has_arrays = True
+             elif field_type == "object":
+                 object_count += 1
+                 has_nested = True
+
+             field_info.append((key, field_type))
+
+             # MEDIUM FIX #14: Pattern detection with word boundary matching
+             # Prevents false positives like "hidden" matching "id"
+             # Pass original key for camelCase detection
+             if cls._matches_pattern(key_lower, ["id", "uuid", "guid"], key) or key_lower.endswith(
+                 "key"
+             ):
+                 has_id = True
+             if cls._matches_pattern(
+                 key_lower, ["score", "rank", "rating", "relevance", "priority"], key
+             ):
+                 has_score = True
+             if (
+                 cls._matches_pattern(key_lower, ["time", "date", "timestamp"], key)
+                 or key_lower.endswith("_at")
+                 or key_lower in ["created", "updated"]
+             ):
+                 has_timestamp = True
+             if cls._matches_pattern(key_lower, ["status", "state"], key) or key_lower in [
+                 "level",
+                 "type",
+                 "kind",
+             ]:
+                 has_status = True
+             if cls._matches_pattern(key_lower, ["error", "exception", "fail", "warning"], key):
+                 has_error = True
+             if cls._matches_pattern(
+                 key_lower, ["message", "msg", "text", "content", "body", "description"], key
+             ):
+                 has_message = True
+
+         # Create structure hash
+         # MEDIUM FIX #15: Use 24 chars (96 bits) instead of 16 (64 bits) for collision resistance
+         sorted_fields = sorted(field_info)
+         hash_input = json.dumps(sorted_fields, sort_keys=True)
+         structure_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:24]
+
+         return cls(
+             structure_hash=structure_hash,
+             field_count=len(field_info),
+             has_nested_objects=has_nested,
+             has_arrays=has_arrays,
+             max_depth=max_depth,
+             string_field_count=string_count,
+             numeric_field_count=numeric_count,
+             boolean_field_count=boolean_count,
+             array_field_count=array_count,
+             object_field_count=object_count,
+             has_id_like_field=has_id,
+             has_score_like_field=has_score,
+             has_timestamp_like_field=has_timestamp,
+             has_status_like_field=has_status,
+             has_error_like_field=has_error,
+             has_message_like_field=has_message,
+         )
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> ToolSignature:
+         """Create from dictionary."""
+         return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+
+
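Putting the class together (an illustrative sketch, not code from the wheel): from_items() reduces sample tool output to counts, pattern flags, and a 24-hex-character structure hash, so two outputs with the same field names and types map to the same signature regardless of their values. The records below are invented.

from headroom.telemetry.models import ToolSignature

items = [
    {"id": "a1b2", "score": 0.93, "status": "ok", "message": "found 3 matches"},
    {"id": "c3d4", "score": 0.41, "status": "error", "message": "timeout"},
]
sig = ToolSignature.from_items(items)
assert sig.field_count == 4 and len(sig.structure_hash) == 24
assert sig.has_id_like_field and sig.has_score_like_field
assert sig.has_status_like_field and sig.has_message_like_field

# Same field names and types, different values: identical signature.
other = ToolSignature.from_items([{"id": "x", "score": 1.0, "status": "ok", "message": ""}])
assert other.structure_hash == sig.structure_hash
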
+ @dataclass
+ class FieldSemantics:
+     """Learned semantics for a field based on retrieval patterns.
+
+     This is the evolution of TOIN - we learn WHAT fields mean
+     from HOW users retrieve them. No hardcoded patterns, no assumptions.
+
+     Learning process:
+     1. User retrieves items where field X has value Y
+     2. TOIN records: field_hash, value_hash, retrieval context
+     3. After N retrievals, TOIN infers: "This field behaves like an error indicator"
+     4. SmartCrusher uses this learned signal (O(1) lookup, zero latency)
+
+     Privacy: All field names and values are hashed (SHA256[:8]).
+     """
+
+     field_hash: str  # SHA256[:8] of field name
+
+     # Inferred semantic type (learned from retrieval patterns, NOT hardcoded)
+     # These are behavioral categories, not syntactic patterns:
+     # - "identifier": Users query by exact value (e.g., "show me item X")
+     # - "error_indicator": Users retrieve when value != most common value
+     # - "score": Users retrieve top-N by this field
+     # - "status": Low cardinality, specific values trigger retrieval
+     # - "temporal": Users query by time ranges
+     # - "content": Users do text search on this field
+     inferred_type: FieldSemanticType = "unknown"
+
+     confidence: float = 0.0  # 0.0 = no data, 1.0 = high confidence
+
+     # Value patterns (all hashed for privacy)
+     # important_value_hashes: values that triggered retrieval
+     # default_value_hash: most common value (probably NOT important)
+     important_value_hashes: list[str] = field(default_factory=list)
+     default_value_hash: str | None = None
+     value_retrieval_frequency: dict[str, int] = field(default_factory=dict)  # value_hash -> count
+
+     # Value statistics (for inferring type)
+     total_unique_values_seen: int = 0
+     total_values_seen: int = 0
+     most_common_value_frequency: float = 0.0  # Fraction of items with most common value
+
+     # Query patterns (anonymized)
+     # Tracks HOW users query this field (equals, not-equals, greater-than, etc.)
+     query_operator_frequency: dict[str, int] = field(default_factory=dict)  # operator -> count
+
+     # Learning metadata
+     retrieval_count: int = 0
+     compression_count: int = 0  # How many times we've seen this field in compression
+     last_updated: float = 0.0
+
+     # Bounds for memory management
+     MAX_IMPORTANT_VALUES: int = 50
+     MAX_VALUE_FREQUENCY_ENTRIES: int = 100
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "field_hash": self.field_hash,
+             "inferred_type": self.inferred_type,
+             "confidence": self.confidence,
+             "important_value_hashes": self.important_value_hashes[: self.MAX_IMPORTANT_VALUES],
+             "default_value_hash": self.default_value_hash,
+             "value_retrieval_frequency": dict(
+                 sorted(
+                     self.value_retrieval_frequency.items(),
+                     key=lambda x: x[1],
+                     reverse=True,
+                 )[: self.MAX_VALUE_FREQUENCY_ENTRIES]
+             ),
+             "total_unique_values_seen": self.total_unique_values_seen,
+             "total_values_seen": self.total_values_seen,
+             "most_common_value_frequency": self.most_common_value_frequency,
+             "query_operator_frequency": self.query_operator_frequency,
+             "retrieval_count": self.retrieval_count,
+             "compression_count": self.compression_count,
+             "last_updated": self.last_updated,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> FieldSemantics:
+         """Create from dictionary."""
+         # Filter to valid fields only
+         valid_fields = {
+             "field_hash",
+             "inferred_type",
+             "confidence",
+             "important_value_hashes",
+             "default_value_hash",
+             "value_retrieval_frequency",
+             "total_unique_values_seen",
+             "total_values_seen",
+             "most_common_value_frequency",
+             "query_operator_frequency",
+             "retrieval_count",
+             "compression_count",
+             "last_updated",
+         }
+         filtered = {k: v for k, v in data.items() if k in valid_fields}
+         return cls(**filtered)
+
+     def record_retrieval_value(self, value_hash: str, operator: str = "=") -> None:
+         """Record that a value was retrieved for this field.
+
+         Args:
+             value_hash: SHA256[:8] hash of the retrieved value.
+             operator: Query operator used ("=", "!=", ">", "<", "contains", etc.)
+         """
+         import time
+
+         self.retrieval_count += 1
+         self.last_updated = time.time()
+
+         # Track value frequency
+         self.value_retrieval_frequency[value_hash] = (
+             self.value_retrieval_frequency.get(value_hash, 0) + 1
+         )
+
+         # Bound the frequency dict
+         if len(self.value_retrieval_frequency) > self.MAX_VALUE_FREQUENCY_ENTRIES:
+             sorted_items = sorted(
+                 self.value_retrieval_frequency.items(),
+                 key=lambda x: x[1],
+                 reverse=True,
+             )[: self.MAX_VALUE_FREQUENCY_ENTRIES]
+             self.value_retrieval_frequency = dict(sorted_items)
+
+         # Track important values (values that get retrieved)
+         if value_hash not in self.important_value_hashes:
+             self.important_value_hashes.append(value_hash)
+             if len(self.important_value_hashes) > self.MAX_IMPORTANT_VALUES:
+                 # Keep most frequently retrieved values
+                 self.important_value_hashes = sorted(
+                     self.important_value_hashes,
+                     key=lambda v: self.value_retrieval_frequency.get(v, 0),
+                     reverse=True,
+                 )[: self.MAX_IMPORTANT_VALUES]
+
+         # Track query operators
+         self.query_operator_frequency[operator] = self.query_operator_frequency.get(operator, 0) + 1
+
+     def record_compression_stats(
+         self,
+         unique_values: int,
+         total_values: int,
+         most_common_value_hash: str | None,
+         most_common_frequency: float,
+     ) -> None:
+         """Record statistics from compression for type inference.
+
+         Args:
+             unique_values: Number of unique values seen for this field.
+             total_values: Total number of items with this field.
+             most_common_value_hash: Hash of the most common value.
+             most_common_frequency: Fraction of items with the most common value.
+         """
+         import time
+
+         self.compression_count += 1
+         self.last_updated = time.time()
+
+         # Update rolling statistics
+         n = self.compression_count
+         self.total_unique_values_seen = int(
+             (self.total_unique_values_seen * (n - 1) + unique_values) / n
+         )
+         self.total_values_seen = int((self.total_values_seen * (n - 1) + total_values) / n)
+         self.most_common_value_frequency = (
+             self.most_common_value_frequency * (n - 1) + most_common_frequency
+         ) / n
+
+         # Track default value (most common)
+         if most_common_value_hash and most_common_frequency > 0.5:
+             self.default_value_hash = most_common_value_hash
+
+     def infer_type(self) -> None:
+         """Infer semantic type from accumulated statistics.
+
+         This is the learning algorithm - purely data-driven, no hardcoded patterns.
+         """
+         # Need minimum data to infer
+         min_retrievals = 3
+         min_compressions = 2
+
+         if self.retrieval_count < min_retrievals or self.compression_count < min_compressions:
+             self.inferred_type = "unknown"
+             self.confidence = 0.0
+             return
+
+         # Calculate metrics
+         uniqueness_ratio = self.total_unique_values_seen / max(1, self.total_values_seen)
+         has_dominant_default = self.most_common_value_frequency > 0.7
+         retrieval_diversity = len(self.value_retrieval_frequency) / max(1, self.retrieval_count)
+
+         # Check query operator patterns
+         total_ops = sum(self.query_operator_frequency.values())
+         equals_ratio = self.query_operator_frequency.get("=", 0) / max(1, total_ops)
+         range_ratio = (
+             self.query_operator_frequency.get(">", 0)
+             + self.query_operator_frequency.get("<", 0)
+             + self.query_operator_frequency.get(">=", 0)
+             + self.query_operator_frequency.get("<=", 0)
+         ) / max(1, total_ops)
+         contains_ratio = self.query_operator_frequency.get("contains", 0) / max(1, total_ops)
+
+         # Inference logic (data-driven, no field name patterns)
+         inferred: FieldSemanticType = "unknown"
+         confidence = 0.0
+
+         # IDENTIFIER: High uniqueness + exact match queries
+         if uniqueness_ratio > 0.8 and equals_ratio > 0.7:
+             inferred = "identifier"
+             confidence = min(0.9, uniqueness_ratio * equals_ratio)
+
+         # ERROR_INDICATOR: Has dominant default + retrievals are for non-default values
+         elif has_dominant_default and self.default_value_hash:
+             # Check if retrieved values are different from default
+             default_retrieval_count = self.value_retrieval_frequency.get(self.default_value_hash, 0)
+             non_default_retrieval_ratio = 1 - (
+                 default_retrieval_count / max(1, self.retrieval_count)
+             )
+             if non_default_retrieval_ratio > 0.7:
+                 inferred = "error_indicator"
+                 confidence = min(
+                     0.9, non_default_retrieval_ratio * self.most_common_value_frequency
+                 )
+
+         # STATUS: Low uniqueness + specific values retrieved
+         elif uniqueness_ratio < 0.2 and retrieval_diversity < 0.5:
+             inferred = "status"
+             confidence = min(0.85, (1 - uniqueness_ratio) * (1 - retrieval_diversity))
+
+         # SCORE: Range queries or sorted access patterns
+         elif range_ratio > 0.5:
+             inferred = "score"
+             confidence = min(0.85, range_ratio)
+
+         # TEMPORAL: Range queries + high uniqueness (likely timestamps)
+         elif range_ratio > 0.3 and uniqueness_ratio > 0.7:
+             inferred = "temporal"
+             confidence = min(0.8, range_ratio * uniqueness_ratio)
+
+         # CONTENT: Contains/text search queries
+         elif contains_ratio > 0.5:
+             inferred = "content"
+             confidence = min(0.85, contains_ratio)
+
+         # Apply minimum confidence threshold
+         if confidence < 0.3:
+             inferred = "unknown"
+             confidence = 0.0
+
+         self.inferred_type = inferred
+         self.confidence = confidence
+
+     def is_value_important(self, value_hash: str) -> bool:
+         """Check if a specific value is considered important.
+
+         A value is important if:
+         1. It's in the important_value_hashes list (has been retrieved)
+         2. It's NOT the default value (for error_indicator type)
+
+         Args:
+             value_hash: SHA256[:8] hash of the value to check.
+
+         Returns:
+             True if this value should be preserved during compression.
+         """
+         # If we don't have enough data, be conservative
+         if self.confidence < 0.3:
+             return False
+
+         # For error_indicator: non-default values are important
+         if self.inferred_type == "error_indicator":
+             if self.default_value_hash and value_hash != self.default_value_hash:
+                 return True
+
+         # For any type: values that have been retrieved are important
+         if value_hash in self.important_value_hashes:
+             return True
+
+         # For status: check if this value has been retrieved
+         if self.inferred_type == "status":
+             return value_hash in self.value_retrieval_frequency
+
+         return False
+
+
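A worked sketch of the learning loop the docstring outlines (illustrative only, not shipped in the wheel): a hypothetical "status" field whose items are overwhelmingly "ok" but whose retrievals always target the rare "error" value ends up classified as an error indicator, and is_value_important() then protects the non-default value during compression.

import hashlib

from headroom.telemetry.models import FieldSemantics


def h(value: str) -> str:
    """SHA256[:8] hashing convention used throughout this module."""
    return hashlib.sha256(value.encode()).hexdigest()[:8]


sem = FieldSemantics(field_hash=h("status"))  # "status" is an invented field name
for _ in range(2):  # compression repeatedly sees 100 items, 95% of them "ok"
    sem.record_compression_stats(
        unique_values=2,
        total_values=100,
        most_common_value_hash=h("ok"),
        most_common_frequency=0.95,
    )
for _ in range(3):  # users only ever retrieve the rare "error" rows
    sem.record_retrieval_value(h("error"), operator="=")

sem.infer_type()
assert sem.inferred_type == "error_indicator"
assert sem.is_value_important(h("error")) is True   # non-default value is preserved
assert sem.is_value_important(h("ok")) is False     # the dominant default can be dropped
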
+ @dataclass
+ class CompressionEvent:
+     """Record of a single compression decision (anonymized).
+
+     This captures WHAT happened, not WHAT the data was.
+     """
+
+     # Tool identification (anonymized)
+     tool_signature: ToolSignature
+
+     # Compression metrics
+     original_item_count: int
+     compressed_item_count: int
+     compression_ratio: float  # compressed / original
+     original_tokens: int
+     compressed_tokens: int
+     token_reduction_ratio: float  # 1 - (compressed / original)
+
+     # Strategy used
+     strategy: str  # "top_n", "time_series", "smart_sample", "skip", etc.
+     strategy_reason: str | None = None  # "high_variance", "has_score_field", etc.
+
+     # Crushability analysis results
+     crushability_score: float | None = None  # 0.0 = don't crush, 1.0 = safe to crush
+     crushability_reason: str | None = None
+
+     # Field distributions (anonymized)
+     field_distributions: list[FieldDistribution] = field(default_factory=list)
+
+     # What was preserved
+     kept_first_n: int = 0
+     kept_last_n: int = 0
+     kept_errors: int = 0
+     kept_anomalies: int = 0
+     kept_by_relevance: int = 0
+     kept_by_score: int = 0
+
+     # Timing
+     timestamp: float = 0.0
+     processing_time_ms: float = 0.0
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "tool_signature": self.tool_signature.to_dict(),
+             "original_item_count": self.original_item_count,
+             "compressed_item_count": self.compressed_item_count,
+             "compression_ratio": self.compression_ratio,
+             "original_tokens": self.original_tokens,
+             "compressed_tokens": self.compressed_tokens,
+             "token_reduction_ratio": self.token_reduction_ratio,
+             "strategy": self.strategy,
+             "strategy_reason": self.strategy_reason,
+             "crushability_score": self.crushability_score,
+             "crushability_reason": self.crushability_reason,
+             "field_distributions": [f.to_dict() for f in self.field_distributions],
+             "kept_first_n": self.kept_first_n,
+             "kept_last_n": self.kept_last_n,
+             "kept_errors": self.kept_errors,
+             "kept_anomalies": self.kept_anomalies,
+             "kept_by_relevance": self.kept_by_relevance,
+             "kept_by_score": self.kept_by_score,
+             "timestamp": self.timestamp,
+             "processing_time_ms": self.processing_time_ms,
+         }
+
+
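To make the ratio conventions concrete (an invented example, not part of the wheel): compressing 150 items / 12,000 tokens down to 10 items / 900 tokens gives compression_ratio = 10/150 and token_reduction_ratio = 1 - 900/12,000 = 0.925.

from headroom.telemetry.models import CompressionEvent, ToolSignature

event = CompressionEvent(
    tool_signature=ToolSignature.from_items([{"id": "x", "score": 0.5}]),
    original_item_count=150,
    compressed_item_count=10,
    compression_ratio=10 / 150,              # compressed / original
    original_tokens=12_000,
    compressed_tokens=900,
    token_reduction_ratio=1 - 900 / 12_000,  # fraction of tokens removed
    strategy="top_n",
    strategy_reason="has_score_field",
    kept_first_n=3,
    kept_last_n=2,
    kept_by_score=5,
)
assert round(event.token_reduction_ratio, 3) == 0.925
assert "compression_ratio" in event.to_dict()
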
+ @dataclass
+ class RetrievalStats:
+     """Aggregated retrieval statistics for a tool signature.
+
+     This tracks how often compression decisions needed correction.
+     """
+
+     tool_signature_hash: str  # Reference to ToolSignature.structure_hash
+
+     # Retrieval counts
+     total_compressions: int = 0
+     total_retrievals: int = 0
+     full_retrievals: int = 0  # Retrieved everything
+     search_retrievals: int = 0  # Used search filter
+
+     # Derived metrics
+     @property
+     def retrieval_rate(self) -> float:
+         """Fraction of compressions that triggered retrieval."""
+         if self.total_compressions == 0:
+             return 0.0
+         return self.total_retrievals / self.total_compressions
+
+     @property
+     def full_retrieval_rate(self) -> float:
+         """Fraction of retrievals that were full (not search)."""
+         if self.total_retrievals == 0:
+             return 0.0
+         return self.full_retrievals / self.total_retrievals
+
+     # Query pattern analysis (no actual queries, just patterns)
+     query_field_frequency: dict[str, int] = field(default_factory=dict)  # field_hash -> count
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "tool_signature_hash": self.tool_signature_hash,
+             "total_compressions": self.total_compressions,
+             "total_retrievals": self.total_retrievals,
+             "full_retrievals": self.full_retrievals,
+             "search_retrievals": self.search_retrievals,
+             "retrieval_rate": self.retrieval_rate,
+             "full_retrieval_rate": self.full_retrieval_rate,
+             "query_field_frequency": self.query_field_frequency,
+         }
+
+
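The derived rates read as follows (illustrative numbers, not from the wheel): out of 40 compressions, 8 triggered a retrieval and 2 of those pulled the full payload back.

from headroom.telemetry.models import RetrievalStats

stats = RetrievalStats(
    tool_signature_hash="0" * 24,  # placeholder; normally a ToolSignature.structure_hash
    total_compressions=40,
    total_retrievals=8,
    full_retrievals=2,
    search_retrievals=6,
)
assert stats.retrieval_rate == 0.2        # 8 / 40
assert stats.full_retrieval_rate == 0.25  # 2 / 8
assert "retrieval_rate" in stats.to_dict()  # the derived rates are serialized too
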
+ @dataclass
+ class AnonymizedToolStats:
+     """Complete anonymized statistics for a tool type.
+
+     This is what gets aggregated across users to build the data flywheel.
+     """
+
+     # Tool identification
+     signature: ToolSignature
+
+     # Compression statistics
+     total_compressions: int = 0
+     total_items_seen: int = 0
+     total_items_kept: int = 0
+     avg_compression_ratio: float = 0.0
+     avg_token_reduction: float = 0.0
+
+     # Strategy distribution
+     strategy_counts: dict[str, int] = field(default_factory=dict)  # strategy -> count
+     strategy_success_rate: dict[str, float] = field(
+         default_factory=dict
+     )  # strategy -> success rate
+
+     # Retrieval statistics
+     retrieval_stats: RetrievalStats | None = None
+
+     # Learned optimal settings
+     recommended_min_items: int | None = None
+     recommended_preserve_fields: list[str] = field(default_factory=list)  # field hashes
+     skip_compression_recommended: bool = False
+
+     # Confidence in recommendations
+     sample_size: int = 0
+     confidence: float = 0.0  # 0.0 = no confidence, 1.0 = high confidence
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "signature": self.signature.to_dict(),
+             "total_compressions": self.total_compressions,
+             "total_items_seen": self.total_items_seen,
+             "total_items_kept": self.total_items_kept,
+             "avg_compression_ratio": self.avg_compression_ratio,
+             "avg_token_reduction": self.avg_token_reduction,
+             "strategy_counts": self.strategy_counts,
+             "strategy_success_rate": self.strategy_success_rate,
+             "retrieval_stats": self.retrieval_stats.to_dict() if self.retrieval_stats else None,
+             "recommended_min_items": self.recommended_min_items,
+             "recommended_preserve_fields": self.recommended_preserve_fields,
+             "skip_compression_recommended": self.skip_compression_recommended,
+             "sample_size": self.sample_size,
+             "confidence": self.confidence,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> AnonymizedToolStats:
+         """Create from dictionary.
+
+         Note: This method does not mutate the input dictionary.
+         """
+         # Use .get() instead of .pop() to avoid mutating input
+         signature_data = data.get("signature", {})
+         signature = ToolSignature.from_dict(signature_data)
+
+         retrieval_data = data.get("retrieval_stats")
+         retrieval_stats = None
+         if retrieval_data:
+             # Copy query_field_frequency to avoid mutation issues
+             query_freq = retrieval_data.get("query_field_frequency", {})
+             retrieval_stats = RetrievalStats(
+                 tool_signature_hash=retrieval_data.get("tool_signature_hash", ""),
+                 total_compressions=retrieval_data.get("total_compressions", 0),
+                 total_retrievals=retrieval_data.get("total_retrievals", 0),
+                 full_retrievals=retrieval_data.get("full_retrievals", 0),
+                 search_retrievals=retrieval_data.get("search_retrievals", 0),
+                 query_field_frequency=dict(query_freq) if query_freq else {},
+             )
+
+         # Filter to only dataclass fields, excluding signature and retrieval_stats
+         # which we've already handled
+         excluded_keys = {"signature", "retrieval_stats"}
+         filtered_data: dict[str, Any] = {}
+         for k, v in data.items():
+             if k not in cls.__dataclass_fields__ or k in excluded_keys:
+                 continue
+             # Deep copy mutable values to avoid corruption if caller modifies input
+             if isinstance(v, dict):
+                 filtered_data[k] = dict(v)
+             elif isinstance(v, list):
+                 filtered_data[k] = list(v)  # type: ignore[assignment]
+             else:
+                 filtered_data[k] = v
+
+         return cls(
+             signature=signature,
+             retrieval_stats=retrieval_stats,
+             **filtered_data,  # type: ignore[arg-type]
+         )
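
Finally, a sketch of the aggregate round trip (illustrative, not code from the wheel): to_dict() produces a JSON-serializable dict of patterns only, and from_dict() rebuilds the nested signature and retrieval stats without mutating its input.

from headroom.telemetry.models import AnonymizedToolStats, RetrievalStats, ToolSignature

sig = ToolSignature.from_items([{"id": "a", "status": "ok"}])
stats = AnonymizedToolStats(
    signature=sig,
    total_compressions=12,
    avg_token_reduction=0.8,
    strategy_counts={"top_n": 9, "skip": 3},
    retrieval_stats=RetrievalStats(
        tool_signature_hash=sig.structure_hash, total_compressions=12
    ),
)
payload = stats.to_dict()  # nested dicts of counts, flags, and hashes only
restored = AnonymizedToolStats.from_dict(payload)
assert restored.signature.structure_hash == sig.structure_hash
assert restored.strategy_counts == {"top_n": 9, "skip": 3}
assert "signature" in payload and "retrieval_stats" in payload  # from_dict() does not pop them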