headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
@@ -0,0 +1,2682 @@
1
+ """Smart statistical tool output compression for Headroom SDK.
2
+
3
+ This module provides intelligent JSON compression based on statistical analysis
4
+ rather than fixed rules. It analyzes data patterns and applies optimal compression
5
+ strategies to maximize token reduction while preserving important information.
6
+
7
+ SCOPE: SmartCrusher handles JSON arrays only. Non-JSON content (plain text,
8
+ search results, logs, code, diffs) passes through UNCHANGED.
9
+
10
+ TEXT COMPRESSION IS OPT-IN: For text-based content, Headroom provides standalone
11
+ utilities that applications can use explicitly:
12
+ - SearchCompressor: For grep/ripgrep output (file:line:content format)
13
+ - LogCompressor: For build/test logs (pytest, npm, cargo output)
14
+ - TextCompressor: For generic plain text with anchor preservation
15
+
16
+ Applications should decide when and how to use text compression based on their
17
+ specific needs. This design prevents lossy text compression from being applied
18
+ automatically, which could lose important context in coding tasks.
19
+
20
+ SCHEMA-PRESERVING: Output contains only items from the original array.
21
+ No wrappers, no generated text, no metadata keys. This ensures downstream
22
+ tools and parsers work unchanged.
23
+
24
+ Safe V1 Compression Recipe - Always keeps:
25
+ - First K items (default 3)
26
+ - Last K items (default 2)
27
+ - Error items (containing 'error', 'exception', 'failed', 'critical')
28
+ - Anomalous numeric items (> 2 std from mean)
29
+ - Items around detected change points
30
+ - Top-K by score if score field present
31
+ - Items with high relevance score to user query (via RelevanceScorer)
32
+
33
+ Key Features:
34
+ - RelevanceScorer: ML-powered or BM25-based relevance matching (replaces regex)
35
+ - Variance-based change point detection (preserve anomalies)
36
+ - Error item detection (never lose error messages)
37
+ - Pattern detection (time series, logs, search results)
38
+ - Strategy selection based on data characteristics
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import hashlib
44
+ import json
45
+ import logging
46
+ import math
47
+ import re
48
+ import statistics
49
+ import threading
50
+ from collections import Counter
51
+ from dataclasses import dataclass, field
52
+ from enum import Enum
53
+ from typing import Any
54
+
55
+ from ..cache.compression_feedback import CompressionFeedback, get_compression_feedback
56
+ from ..cache.compression_store import CompressionStore, get_compression_store
57
+ from ..config import CCRConfig, RelevanceScorerConfig, TransformResult
58
+ from ..relevance import RelevanceScorer, create_scorer
59
+ from ..telemetry import TelemetryCollector, ToolSignature, get_telemetry_collector
60
+ from ..telemetry.models import FieldSemantics
61
+ from ..telemetry.toin import ToolIntelligenceNetwork, get_toin
62
+ from ..tokenizer import Tokenizer
63
+ from ..utils import (
64
+ compute_short_hash,
65
+ create_tool_digest_marker,
66
+ deep_copy_messages,
67
+ safe_json_dumps,
68
+ safe_json_loads,
69
+ )
70
+ from .base import Transform
71
+
72
+ logger = logging.getLogger(__name__)
73
+
74
+ # Legacy patterns for backwards compatibility (extract_query_anchors)
75
+ _UUID_PATTERN = re.compile(
76
+ r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
77
+ )
78
+ _NUMERIC_ID_PATTERN = re.compile(r"\b\d{4,}\b") # 4+ digit numbers (likely IDs)
79
+ _HOSTNAME_PATTERN = re.compile(
80
+ r"\b[a-zA-Z0-9][-a-zA-Z0-9]*\.[a-zA-Z0-9][-a-zA-Z0-9]*(?:\.[a-zA-Z]{2,})?\b"
81
+ )
82
+ _QUOTED_STRING_PATTERN = re.compile(r"['\"]([^'\"]{1,50})['\"]") # Short quoted strings
83
+ _EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b")
84
+
85
+
86
+ def extract_query_anchors(text: str) -> set[str]:
87
+ """Extract query anchors from user text (legacy regex-based method).
88
+
89
+ DEPRECATED: Use RelevanceScorer.score_batch() for better semantic matching.
90
+
91
+ Query anchors are identifiers or values that the user is likely searching for.
92
+ When crushing tool outputs, items matching these anchors should be preserved.
93
+
94
+ Extracts:
95
+ - UUIDs (e.g., "550e8400-e29b-41d4-a716-446655440000")
96
+ - Numeric IDs (4+ digits, e.g., "12345", "1001234")
97
+ - Hostnames (e.g., "api.example.com", "server-01.prod")
98
+ - Quoted strings (e.g., 'Alice', "error_code")
99
+ - Email addresses (e.g., "user@example.com")
100
+
101
+ Args:
102
+ text: User message text to extract anchors from.
103
+
104
+ Returns:
105
+ Set of anchor strings (lowercased for case-insensitive matching).
106
+ """
107
+ anchors: set[str] = set()
108
+
109
+ if not text:
110
+ return anchors
111
+
112
+ # UUIDs
113
+ for match in _UUID_PATTERN.findall(text):
114
+ anchors.add(match.lower())
115
+
116
+ # Numeric IDs
117
+ for match in _NUMERIC_ID_PATTERN.findall(text):
118
+ anchors.add(match)
119
+
120
+ # Hostnames
121
+ for match in _HOSTNAME_PATTERN.findall(text):
122
+ # Filter out common false positives
123
+ if match.lower() not in ("e.g", "i.e", "etc."):
124
+ anchors.add(match.lower())
125
+
126
+ # Quoted strings
127
+ for match in _QUOTED_STRING_PATTERN.findall(text):
128
+ if len(match.strip()) >= 2: # Skip very short matches
129
+ anchors.add(match.lower())
130
+
131
+ # Email addresses
132
+ for match in _EMAIL_PATTERN.findall(text):
133
+ anchors.add(match.lower())
134
+
135
+ return anchors
136
+
137
+
138
+ def item_matches_anchors(item: dict, anchors: set[str]) -> bool:
139
+ """Check if an item matches any query anchors (legacy method).
140
+
141
+ DEPRECATED: Use RelevanceScorer for better matching.
142
+
143
+ Args:
144
+ item: Dictionary item from tool output.
145
+ anchors: Set of anchor strings to match.
146
+
147
+ Returns:
148
+ True if any anchor is found in the item's string representation.
149
+ """
150
+ if not anchors:
151
+ return False
152
+
153
+ item_str = str(item).lower()
154
+ return any(anchor in item_str for anchor in anchors)
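A short sketch of the legacy flow (inputs invented for illustration):

anchors = extract_query_anchors('Why did job 123456 on api.example.com fail for "alice@example.com"?')
# anchors now holds e.g. "123456", "api.example.com", "alice@example.com" (lowercased)
item = {"job_id": 123456, "host": "api.example.com", "state": "failed"}
item_matches_anchors(item, anchors)  # -> True, so this item would be preserved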
155
+
156
+
157
+ def _hash_field_name(field_name: str) -> str:
158
+ """Hash a field name to match TOIN's anonymized preserve_fields.
159
+
160
+ TOIN stores field names as SHA256[:8] hashes for privacy.
161
+ This function produces the same hash format.
162
+ """
163
+ return hashlib.sha256(field_name.encode()).hexdigest()[:8]
164
+
165
+
166
+ def _get_preserve_field_values(
167
+ item: dict,
168
+ preserve_field_hashes: list[str],
169
+ ) -> list[tuple[str, Any]]:
170
+ """Get values from item fields that match TOIN's preserve_field hashes.
171
+
172
+ TOIN stores preserve_fields as hashed field names (SHA256[:8]).
173
+ This function iterates over item fields, hashes each, and returns
174
+ matching field names and values.
175
+
176
+ Args:
177
+ item: Dictionary item from tool output.
178
+ preserve_field_hashes: List of SHA256[:8] hashed field names from TOIN.
179
+
180
+ Returns:
181
+ List of (field_name, value) tuples for fields that match.
182
+ """
183
+ if not preserve_field_hashes or not item:
184
+ return []
185
+
186
+ # Convert preserve_fields to set for O(1) lookup
187
+ hash_set = set(preserve_field_hashes)
188
+
189
+ matches = []
190
+ for field_name, value in item.items():
191
+ field_hash = _hash_field_name(field_name)
192
+ if field_hash in hash_set:
193
+ matches.append((field_name, value))
194
+
195
+ return matches
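For example (field name and value invented; the hash is the same SHA256[:8] form described above):

preserve_hashes = [_hash_field_name("status")]      # same 8-char hex prefix TOIN stores
item = {"id": 7, "status": "degraded"}
_get_preserve_field_values(item, preserve_hashes)   # -> [("status", "degraded")]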
196
+
197
+
198
+ def _item_has_preserve_field_match(
199
+ item: dict,
200
+ preserve_field_hashes: list[str],
201
+ query_context: str,
202
+ ) -> bool:
203
+ """Check if item has a preserve_field value that matches query context.
204
+
205
+ Args:
206
+ item: Dictionary item from tool output.
207
+ preserve_field_hashes: List of SHA256[:8] hashed field names from TOIN.
208
+ query_context: User's query to match against field values.
209
+
210
+ Returns:
211
+ True if any preserve_field value matches the query context.
212
+ """
213
+ if not query_context:
214
+ return False
215
+
216
+ query_lower = query_context.lower()
217
+
218
+ for _field_name, value in _get_preserve_field_values(item, preserve_field_hashes):
219
+ if value is not None:
220
+ value_str = str(value).lower()
221
+ if value_str in query_lower or query_lower in value_str:
222
+ return True
223
+
224
+ return False
225
+
226
+
227
+ class CompressionStrategy(Enum):
228
+ """Compression strategies based on data patterns."""
229
+
230
+ NONE = "none" # No compression needed
231
+ SKIP = "skip" # Explicitly skip - not safe to crush
232
+ TIME_SERIES = "time_series" # Keep change points, summarize stable
233
+ CLUSTER_SAMPLE = "cluster" # Dedupe similar items
234
+ TOP_N = "top_n" # Keep highest scored items
235
+ SMART_SAMPLE = "smart_sample" # Statistical sampling with constants
236
+
237
+
238
+ # =====================================================================
239
+ # STATISTICAL FIELD DETECTION (replaces hardcoded string patterns)
240
+ # =====================================================================
241
+ # Instead of matching field names like "id", "score", "error", we use
242
+ # statistical and structural properties of the data to detect field types.
243
+
244
+
245
+ def _is_uuid_format(value: str) -> bool:
246
+ """Check if a string looks like a UUID (structural pattern)."""
247
+ if not isinstance(value, str) or len(value) != 36:
248
+ return False
249
+ # UUID format: 8-4-4-4-12 hex chars
250
+ parts = value.split("-")
251
+ if len(parts) != 5:
252
+ return False
253
+ expected_lens = [8, 4, 4, 4, 12]
254
+ for part, expected_len in zip(parts, expected_lens):
255
+ if len(part) != expected_len:
256
+ return False
257
+ if not all(c in "0123456789abcdefABCDEF" for c in part):
258
+ return False
259
+ return True
260
+
261
+
262
+ def _calculate_string_entropy(s: str) -> float:
263
+ """Calculate Shannon entropy of a string, normalized to [0, 1].
264
+
265
+ High entropy (>0.7) suggests random/ID-like content.
266
+ Low entropy (<0.3) suggests repetitive/predictable content.
267
+ """
268
+ if not s or len(s) < 2:
269
+ return 0.0
270
+
271
+ # Count character frequencies
272
+ freq: dict[str, int] = {}
273
+ for c in s:
274
+ freq[c] = freq.get(c, 0) + 1
275
+
276
+ # Calculate entropy
+ entropy = 0.0
280
+ length = len(s)
281
+ for count in freq.values():
282
+ p = count / length
283
+ if p > 0:
284
+ entropy -= p * math.log2(p)
285
+
286
+ # Normalize by max possible entropy for this length
287
+ max_entropy = math.log2(min(len(freq), length))
288
+ if max_entropy > 0:
289
+ return entropy / max_entropy
290
+ return 0.0
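Two quick data points for intuition:

_calculate_string_entropy("aaaaaaaa")   # -> 0.0: a single repeated character
_calculate_string_entropy("a1B9x7Qz")   # -> 1.0: every character distinct, ID-like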
291
+
292
+
293
+ def _detect_sequential_pattern(values: list[Any], check_order: bool = True) -> bool:
294
+ """Detect if numeric values form a sequential pattern (like IDs: 1,2,3,...).
295
+
296
+ Returns True if values appear to be auto-incrementing or sequential.
297
+
298
+ Args:
299
+ values: List of values to check.
300
+ check_order: If True, also check if values are in ascending order in the array.
301
+ Score fields are often sorted descending, while IDs are ascending.
302
+ """
303
+ if len(values) < 5:
304
+ return False
305
+
306
+ # Get numeric values
307
+ nums = []
308
+ for v in values:
309
+ if isinstance(v, (int, float)) and not isinstance(v, bool):
310
+ nums.append(v)
311
+ elif isinstance(v, str):
312
+ try:
313
+ nums.append(int(v))
314
+ except ValueError:
315
+ pass
316
+
317
+ if len(nums) < 5:
318
+ return False
319
+
320
+ # Check if sorted values form a near-sequence
321
+ sorted_nums = sorted(nums)
322
+ diffs = [sorted_nums[i + 1] - sorted_nums[i] for i in range(len(sorted_nums) - 1)]
323
+
324
+ if not diffs:
325
+ return False
326
+
327
+ # If most differences are 1 (or small constant), it's sequential
328
+ avg_diff = sum(diffs) / len(diffs)
329
+ if 0.5 <= avg_diff <= 2.0:
330
+ # Check consistency - sequential IDs have consistent spacing
331
+ consistent_count = sum(1 for d in diffs if 0.5 <= d <= 2.0)
332
+ is_sequential = consistent_count / len(diffs) > 0.8
333
+
334
+ # Additional check: IDs are typically in ASCENDING order in the array
335
+ # Scores sorted by relevance are typically in DESCENDING order
336
+ if check_order and is_sequential:
337
+ # Check if original order is ascending (like IDs)
338
+ ascending_count = sum(1 for i in range(len(nums) - 1) if nums[i] <= nums[i + 1])
339
+ is_ascending = ascending_count / (len(nums) - 1) > 0.7
340
+ return is_ascending # Only flag as sequential if ascending (ID-like)
341
+
342
+ return is_sequential
343
+
344
+ return False
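For example (synthetic values):

_detect_sequential_pattern([101, 102, 103, 104, 105, 106])  # -> True: ascending with step ~1, ID-like
_detect_sequential_pattern([6, 5, 4, 3, 2, 1])              # -> False: consistent step but descending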
345
+
346
+
347
+ def _detect_id_field_statistically(stats: FieldStats, values: list[Any]) -> tuple[bool, float]:
348
+ """Detect if a field is an ID field using statistical properties.
349
+
350
+ Returns (is_id_field, confidence).
351
+
352
+ ID fields have:
353
+ - Very high uniqueness (>0.95)
354
+ - Sequential numeric pattern OR UUID format OR high entropy strings
355
+ """
356
+ # Must have high uniqueness
357
+ if stats.unique_ratio < 0.9:
358
+ return False, 0.0
359
+
360
+ confidence = 0.0
361
+
362
+ # Check for UUID format (structural detection)
363
+ if stats.field_type == "string":
364
+ sample_values = [v for v in values[:20] if isinstance(v, str)]
365
+ uuid_count = sum(1 for v in sample_values if _is_uuid_format(v))
366
+ if sample_values and uuid_count / len(sample_values) > 0.8:
367
+ return True, 0.95
368
+
369
+ # Check for high entropy (random string IDs)
370
+ if sample_values:
371
+ avg_entropy = sum(_calculate_string_entropy(v) for v in sample_values) / len(
372
+ sample_values
373
+ )
374
+ if avg_entropy > 0.7 and stats.unique_ratio > 0.95:
375
+ confidence = 0.8
376
+ return True, confidence
377
+
378
+ # Check for sequential numeric pattern
379
+ if stats.field_type == "numeric":
380
+ if _detect_sequential_pattern(values) and stats.unique_ratio > 0.95:
381
+ return True, 0.9
382
+
383
+ # High uniqueness numeric with high range suggests ID
384
+ if stats.min_val is not None and stats.max_val is not None:
385
+ value_range = stats.max_val - stats.min_val
386
+ if value_range > 0 and stats.unique_ratio > 0.95:
387
+ return True, 0.85
388
+
389
+ # Very high uniqueness alone is a signal (even without other patterns)
390
+ if stats.unique_ratio > 0.98:
391
+ return True, 0.7
392
+
393
+ return False, 0.0
394
+
395
+
396
+ def _detect_score_field_statistically(stats: FieldStats, items: list[dict]) -> tuple[bool, float]:
397
+ """Detect if a field is a score/ranking field using statistical properties.
398
+
399
+ Returns (is_score_field, confidence).
400
+
401
+ Score fields have:
402
+ - Numeric type
403
+ - Bounded range (0-1, 0-10, 0-100, or similar)
404
+ - NOT sequential (unlike IDs)
405
+ - Often the data appears sorted by this field (descending)
406
+ """
407
+ if stats.field_type != "numeric":
408
+ return False, 0.0
409
+
410
+ if stats.min_val is None or stats.max_val is None:
411
+ return False, 0.0
412
+
413
+ confidence = 0.0
414
+
415
+ # Check for bounded range typical of scores
+ min_val, max_val = stats.min_val, stats.max_val
418
+
419
+ # Common score ranges: [0,1], [0,10], [0,100], [-1,1], [0,5]
420
+ is_bounded = False
421
+ if 0 <= min_val <= 1 and 0 <= max_val <= 1: # [0,1] range
422
+ is_bounded = True
423
+ confidence += 0.4
424
+ elif 0 <= min_val <= 10 and 0 <= max_val <= 10: # [0,10] range
425
+ is_bounded = True
426
+ confidence += 0.3
427
+ elif 0 <= min_val <= 100 and 0 <= max_val <= 100: # [0,100] range
428
+ is_bounded = True
429
+ confidence += 0.25
430
+ elif -1 <= min_val and max_val <= 1: # [-1,1] range
431
+ is_bounded = True
432
+ confidence += 0.35
433
+
434
+ if not is_bounded:
435
+ return False, 0.0
436
+
437
+ # Should NOT be sequential (IDs are sequential, scores are not)
438
+ sample_values = [item.get(stats.name) for item in items[:50] if stats.name in item]
439
+ if _detect_sequential_pattern(sample_values):
440
+ return False, 0.0
441
+
442
+ # Check if data appears sorted by this field (descending = relevance sorted)
443
+ # Filter out NaN/Inf which break comparisons
444
+ values_in_order: list[float] = []
445
+ for item in items:
446
+ if stats.name in item:
447
+ val = item.get(stats.name)
448
+ if isinstance(val, (int, float)) and math.isfinite(val):
449
+ values_in_order.append(float(val))
450
+ if len(values_in_order) >= 5:
451
+ # Check for descending sort
452
+ descending_count = sum(
453
+ 1
454
+ for i in range(len(values_in_order) - 1)
455
+ if values_in_order[i] >= values_in_order[i + 1]
456
+ )
457
+ if descending_count / (len(values_in_order) - 1) > 0.7:
458
+ confidence += 0.3
459
+
460
+ # Score fields often have floating point values
461
+ # Filter out NaN/Inf which can't be converted to int
462
+ float_count = sum(
463
+ 1 for v in values_in_order[:20] if isinstance(v, float) and math.isfinite(v) and v != int(v)
464
+ )
465
+ if float_count > len(values_in_order[:20]) * 0.3:
466
+ confidence += 0.1
467
+
468
+ return confidence >= 0.4, min(confidence, 0.95)
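A sketch of what this heuristic accepts, using fabricated search-result rows (FieldStats is defined further down in this module):

items = [{"doc": f"d{i}", "score": round(0.98 - 0.04 * i, 2)} for i in range(20)]
stats = FieldStats(
    name="score", field_type="numeric", count=20, unique_count=20,
    unique_ratio=1.0, is_constant=False, min_val=0.22, max_val=0.98,
)
_detect_score_field_statistically(stats, items)
# -> (True, ~0.8): bounded [0, 1], sorted descending, not sequential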
469
+
470
+
471
+ def _detect_structural_outliers(items: list[dict]) -> list[int]:
472
+ """Detect items that are structural outliers (error-like items).
473
+
474
+ Instead of looking for "error" keywords, we detect:
475
+ 1. Items with extra fields that others don't have
476
+ 2. Items with rare status/state values
477
+ 3. Items with significantly different structure
478
+
479
+ Returns indices of outlier items.
480
+ """
481
+ if len(items) < 5:
482
+ return []
483
+
484
+ outlier_indices: list[int] = []
485
+
486
+ # 1. Detect items with extra fields
487
+ # Find the "common" field set (fields present in >80% of items)
488
+ field_counts: dict[str, int] = {}
489
+ for item in items:
490
+ if isinstance(item, dict):
491
+ for key in item.keys():
492
+ field_counts[key] = field_counts.get(key, 0) + 1
493
+
494
+ n = len(items)
495
+ common_fields = {k for k, v in field_counts.items() if v >= n * 0.8}
496
+ rare_fields = {k for k, v in field_counts.items() if v < n * 0.2}
497
+
498
+ for i, item in enumerate(items):
499
+ if not isinstance(item, dict):
500
+ continue
501
+
502
+ item_fields = set(item.keys())
503
+
504
+ # Has rare fields that most items don't have
505
+ has_rare = bool(item_fields & rare_fields)
506
+ if has_rare:
507
+ outlier_indices.append(i)
508
+ continue
509
+
510
+ # 2. Detect rare status/state values
511
+ # Find fields that look like status fields (low cardinality, categorical)
512
+ status_outliers = _detect_rare_status_values(items, common_fields)
513
+ outlier_indices.extend(status_outliers)
514
+
515
+ return list(set(outlier_indices))
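For example (synthetic rows; the last one carries a rare extra field and a rare status value):

rows = [{"id": i, "status": "ok"} for i in range(19)]
rows.append({"id": 19, "status": "failed", "error": "connection reset"})
_detect_structural_outliers(rows)  # -> [19]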
516
+
517
+
518
+ def _detect_rare_status_values(items: list[dict], common_fields: set[str]) -> list[int]:
519
+ """Detect items with rare values in status-like fields.
520
+
521
+ A status field has low cardinality (few distinct values).
522
+ If 95%+ have the same value, items with different values are interesting.
523
+ """
524
+ outlier_indices: list[int] = []
525
+
526
+ # Find potential status fields (low cardinality)
527
+ for field_name in common_fields:
528
+ values = [
529
+ item.get(field_name) for item in items if isinstance(item, dict) and field_name in item
530
+ ]
531
+
532
+ # Skip if too few values or non-hashable
533
+ try:
534
+ unique_values = {str(v) for v in values if v is not None}
535
+ except Exception:
536
+ continue
537
+
538
+ # Status field = low cardinality (2-10 distinct values)
539
+ if not (2 <= len(unique_values) <= 10):
540
+ continue
541
+
542
+ # Count value frequencies
543
+ value_counts: dict[str, int] = {}
544
+ for v in values:
545
+ key = str(v) if v is not None else "__none__"
546
+ value_counts[key] = value_counts.get(key, 0) + 1
547
+
548
+ # Find the dominant value
549
+ if not value_counts:
550
+ continue
551
+
552
+ max_count = max(value_counts.values())
553
+ total = len(values)
554
+
555
+ # If one value dominates (>90%), others are interesting
556
+ if max_count >= total * 0.9:
557
+ dominant_value = max(value_counts.keys(), key=lambda k: value_counts[k])
558
+
559
+ for i, item in enumerate(items):
560
+ if not isinstance(item, dict) or field_name not in item:
561
+ continue
562
+ item_value = str(item[field_name]) if item[field_name] is not None else "__none__"
563
+ if item_value != dominant_value:
564
+ outlier_indices.append(i)
565
+
566
+ return outlier_indices
567
+
568
+
569
+ # Error keywords for PRESERVATION guarantee (not crushability detection)
570
+ # This is for the quality guarantee: "ALL error items are ALWAYS preserved"
571
+ # regardless of how common they are. Used in _prioritize_indices().
572
+ _ERROR_KEYWORDS_FOR_PRESERVATION = frozenset(
573
+ {
574
+ "error",
575
+ "exception",
576
+ "failed",
577
+ "failure",
578
+ "critical",
579
+ "fatal",
580
+ "crash",
581
+ "panic",
582
+ "abort",
583
+ "timeout",
584
+ "denied",
585
+ "rejected",
586
+ }
587
+ )
588
+
589
+
590
+ def _detect_error_items_for_preservation(items: list[dict]) -> list[int]:
591
+ """Detect items containing error keywords for PRESERVATION guarantee.
592
+
593
+ This is NOT for crushability analysis - it's for ensuring ALL error items
594
+ are retained during compression. The quality guarantee is that error items
595
+ are NEVER dropped, even if errors are common in the dataset.
596
+
597
+ Uses keywords because error semantics are well-defined across domains.
598
+ """
599
+ error_indices: list[int] = []
600
+
601
+ for i, item in enumerate(items):
602
+ if not isinstance(item, dict):
603
+ continue
604
+
605
+ # Serialize item to check all content
606
+ try:
607
+ item_str = json.dumps(item).lower()
608
+ except Exception:
609
+ continue
610
+
611
+ # Check if any error keyword is present
612
+ for keyword in _ERROR_KEYWORDS_FOR_PRESERVATION:
613
+ if keyword in item_str:
614
+ error_indices.append(i)
615
+ break
616
+
617
+ return error_indices
618
+
619
+
620
+ def _detect_items_by_learned_semantics(
621
+ items: list[dict],
622
+ field_semantics: dict[str, FieldSemantics],
623
+ ) -> list[int]:
624
+ """Detect items with important values based on learned field semantics.
625
+
626
+ This is the TOIN Evolution integration - uses learned field semantic types
627
+ to identify items that should be preserved during compression.
628
+
629
+ Key insight: Instead of hardcoded patterns, we learn from user behavior
630
+ which field values are actually important (e.g., error indicators, rare
631
+ status values, identifiers that get queried).
632
+
633
+ Args:
634
+ items: List of items to analyze.
635
+ field_semantics: Learned field semantics from TOIN (field_hash -> FieldSemantics).
636
+
637
+ Returns:
638
+ List of indices for items containing important values.
639
+ """
640
+ if not field_semantics or not items:
641
+ return []
642
+
643
+ important_indices: list[int] = []
644
+
645
+ # Build a quick lookup for field_hash -> FieldSemantics
646
+ # Pre-filter to fields with sufficient confidence
647
+ confident_semantics = {
648
+ fh: fs
649
+ for fh, fs in field_semantics.items()
650
+ if fs.confidence >= 0.3 and fs.inferred_type != "unknown"
651
+ }
652
+
653
+ if not confident_semantics:
654
+ return []
655
+
656
+ for i, item in enumerate(items):
657
+ if not isinstance(item, dict):
658
+ continue
659
+
660
+ for field_name, value in item.items():
661
+ # Hash the field name to match TOIN's format
662
+ field_hash = hashlib.sha256(field_name.encode()).hexdigest()[:8]
663
+
664
+ if field_hash not in confident_semantics:
665
+ continue
666
+
667
+ field_sem = confident_semantics[field_hash]
668
+
669
+ # Hash the value to check importance
670
+ if value is None:
671
+ value_canonical = "null"
672
+ elif isinstance(value, bool):
673
+ value_canonical = "true" if value else "false"
674
+ elif isinstance(value, (int, float)):
675
+ value_canonical = str(value)
676
+ elif isinstance(value, str):
677
+ value_canonical = value
678
+ elif isinstance(value, (list, dict)):
679
+ try:
680
+ value_canonical = json.dumps(value, sort_keys=True, default=str)
681
+ except (TypeError, ValueError):
682
+ value_canonical = str(value)
683
+ else:
684
+ value_canonical = str(value)
685
+
686
+ value_hash = hashlib.sha256(value_canonical.encode()).hexdigest()[:8]
687
+
688
+ # Check if this value is important based on learned semantics
689
+ if field_sem.is_value_important(value_hash):
690
+ important_indices.append(i)
691
+ break # Only need to mark item once
692
+
693
+ return important_indices
694
+
695
+
696
+ @dataclass
697
+ class CrushabilityAnalysis:
698
+ """Analysis of whether an array is safe to crush.
699
+
700
+ The key insight: if we don't have a reliable SIGNAL to determine
701
+ which items are important, we should NOT crush at all.
702
+
703
+ Signals include:
704
+ - Score/rank fields (search results)
705
+ - Error keywords (logs)
706
+ - Numeric anomalies (metrics)
707
+ - Low uniqueness (repetitive data where sampling is representative)
708
+
709
+ High variability + No signal = DON'T CRUSH
710
+ """
711
+
712
+ crushable: bool
713
+ confidence: float # 0.0 to 1.0
714
+ reason: str
715
+ signals_present: list[str] = field(default_factory=list)
716
+ signals_absent: list[str] = field(default_factory=list)
717
+
718
+ # Detailed metrics
719
+ has_id_field: bool = False
720
+ id_uniqueness: float = 0.0
721
+ avg_string_uniqueness: float = 0.0
722
+ has_score_field: bool = False
723
+ error_item_count: int = 0
724
+ anomaly_count: int = 0
725
+
726
+
727
+ @dataclass
728
+ class FieldStats:
729
+ """Statistics for a single field across array items."""
730
+
731
+ name: str
732
+ field_type: str # "numeric", "string", "boolean", "object", "array", "null"
733
+ count: int
734
+ unique_count: int
735
+ unique_ratio: float
736
+ is_constant: bool
737
+ constant_value: Any = None
738
+
739
+ # Numeric-specific stats
740
+ min_val: float | None = None
741
+ max_val: float | None = None
742
+ mean_val: float | None = None
743
+ variance: float | None = None
744
+ change_points: list[int] = field(default_factory=list)
745
+
746
+ # String-specific stats
747
+ avg_length: float | None = None
748
+ top_values: list[tuple[str, int]] = field(default_factory=list)
749
+
750
+
751
+ @dataclass
752
+ class ArrayAnalysis:
753
+ """Complete analysis of an array."""
754
+
755
+ item_count: int
756
+ field_stats: dict[str, FieldStats]
757
+ detected_pattern: str # "time_series", "logs", "search_results", "generic"
758
+ recommended_strategy: CompressionStrategy
759
+ constant_fields: dict[str, Any]
760
+ estimated_reduction: float
761
+ crushability: CrushabilityAnalysis | None = None # Whether it's safe to crush
762
+
763
+
764
+ @dataclass
765
+ class CompressionPlan:
766
+ """Plan for how to compress an array."""
767
+
768
+ strategy: CompressionStrategy
769
+ keep_indices: list[int] = field(default_factory=list)
770
+ constant_fields: dict[str, Any] = field(default_factory=dict)
771
+ summary_ranges: list[tuple[int, int, dict]] = field(default_factory=list)
772
+ cluster_field: str | None = None
773
+ sort_field: str | None = None
774
+ keep_count: int = 10
775
+
776
+
777
+ @dataclass
778
+ class CrushResult:
779
+ """Result from SmartCrusher.crush() method.
780
+
781
+ Used by ContentRouter when routing JSON arrays to SmartCrusher.
782
+ """
783
+
784
+ compressed: str
785
+ original: str
786
+ was_modified: bool
787
+ strategy: str = "passthrough"
788
+
789
+
790
+ @dataclass
791
+ class SmartCrusherConfig:
792
+ """Configuration for smart crusher.
793
+
794
+ SCHEMA-PRESERVING: Output contains only items from the original array.
795
+ No wrappers, no generated text, no metadata keys.
796
+ """
797
+
798
+ enabled: bool = True
799
+ min_items_to_analyze: int = 5 # Don't analyze tiny arrays
800
+ min_tokens_to_crush: int = 200 # Only crush if > N tokens
801
+ variance_threshold: float = 2.0 # Std devs for change point detection
802
+ uniqueness_threshold: float = 0.1 # Below this = nearly constant
803
+ similarity_threshold: float = 0.8 # For clustering similar strings
804
+ max_items_after_crush: int = 15 # Target max items in output
805
+ preserve_change_points: bool = True
806
+ factor_out_constants: bool = False # Disabled - preserves original schema
807
+ include_summaries: bool = False # Disabled - no generated text
808
+
809
+ # Feedback loop integration
810
+ use_feedback_hints: bool = True # Use learned patterns to adjust compression
811
+
812
+ # LOW FIX #21: Make TOIN confidence threshold configurable
813
+ # Minimum confidence required to apply TOIN recommendations
814
+ toin_confidence_threshold: float = 0.5
815
+
816
+
817
+ class SmartAnalyzer:
818
+ """Analyzes JSON arrays to determine optimal compression strategy."""
819
+
820
+ def __init__(self, config: SmartCrusherConfig | None = None):
821
+ self.config = config or SmartCrusherConfig()
822
+
823
+ def analyze_array(self, items: list[dict]) -> ArrayAnalysis:
824
+ """Perform complete statistical analysis of an array."""
825
+ if not items or not isinstance(items[0], dict):
826
+ return ArrayAnalysis(
827
+ item_count=len(items) if items else 0,
828
+ field_stats={},
829
+ detected_pattern="generic",
830
+ recommended_strategy=CompressionStrategy.NONE,
831
+ constant_fields={},
832
+ estimated_reduction=0.0,
833
+ )
834
+
835
+ # Analyze each field
836
+ field_stats = {}
837
+ all_keys: set[str] = set()
838
+ for item in items:
839
+ if isinstance(item, dict):
840
+ all_keys.update(item.keys())
841
+
842
+ for key in all_keys:
843
+ field_stats[key] = self._analyze_field(key, items)
844
+
845
+ # Detect pattern
846
+ pattern = self._detect_pattern(field_stats, items)
847
+
848
+ # Extract constants
849
+ constant_fields = {k: v.constant_value for k, v in field_stats.items() if v.is_constant}
850
+
851
+ # CRITICAL: Analyze crushability BEFORE selecting strategy
852
+ crushability = self.analyze_crushability(items, field_stats)
853
+
854
+ # Select strategy (respects crushability)
855
+ strategy = self._select_strategy(field_stats, pattern, len(items), crushability)
856
+
857
+ # Estimate reduction (0 if not crushable)
858
+ if strategy == CompressionStrategy.SKIP:
859
+ reduction = 0.0
860
+ else:
861
+ reduction = self._estimate_reduction(field_stats, strategy, len(items))
862
+
863
+ return ArrayAnalysis(
864
+ item_count=len(items),
865
+ field_stats=field_stats,
866
+ detected_pattern=pattern,
867
+ recommended_strategy=strategy,
868
+ constant_fields=constant_fields,
869
+ estimated_reduction=reduction,
870
+ crushability=crushability,
871
+ )
872
+
873
+ def _analyze_field(self, key: str, items: list[dict]) -> FieldStats:
874
+ """Analyze a single field across all items."""
875
+ values = [item.get(key) for item in items if isinstance(item, dict)]
876
+ non_null_values = [v for v in values if v is not None]
877
+
878
+ if not non_null_values:
879
+ return FieldStats(
880
+ name=key,
881
+ field_type="null",
882
+ count=len(values),
883
+ unique_count=0,
884
+ unique_ratio=0.0,
885
+ is_constant=True,
886
+ constant_value=None,
887
+ )
888
+
889
+ # Determine type from first non-null value
890
+ first_val = non_null_values[0]
891
+ if isinstance(first_val, bool):
892
+ field_type = "boolean"
893
+ elif isinstance(first_val, (int, float)):
894
+ field_type = "numeric"
895
+ elif isinstance(first_val, str):
896
+ field_type = "string"
897
+ elif isinstance(first_val, dict):
898
+ field_type = "object"
899
+ elif isinstance(first_val, list):
900
+ field_type = "array"
901
+ else:
902
+ field_type = "unknown"
903
+
904
+ # Compute uniqueness
905
+ str_values = [str(v) for v in values]
906
+ unique_values = set(str_values)
907
+ unique_count = len(unique_values)
908
+ unique_ratio = unique_count / len(values) if values else 0
909
+
910
+ # Check if constant
911
+ is_constant = unique_count == 1
912
+ constant_value = non_null_values[0] if is_constant else None
913
+
914
+ stats = FieldStats(
915
+ name=key,
916
+ field_type=field_type,
917
+ count=len(values),
918
+ unique_count=unique_count,
919
+ unique_ratio=unique_ratio,
920
+ is_constant=is_constant,
921
+ constant_value=constant_value,
922
+ )
923
+
924
+ # Numeric-specific analysis
925
+ if field_type == "numeric":
926
+ # Filter out NaN and Infinity which break statistics functions
927
+ nums = [v for v in non_null_values if isinstance(v, (int, float)) and math.isfinite(v)]
928
+ if nums:
929
+ try:
930
+ stats.min_val = min(nums)
931
+ stats.max_val = max(nums)
932
+ stats.mean_val = statistics.mean(nums)
933
+ stats.variance = statistics.variance(nums) if len(nums) > 1 else 0
934
+ stats.change_points = self._detect_change_points(nums)
935
+ except (OverflowError, ValueError):
936
+ # Extreme values that overflow - skip detailed statistics
937
+ stats.min_val = None
938
+ stats.max_val = None
939
+ stats.mean_val = None
940
+ stats.variance = 0
941
+ stats.change_points = []
942
+
943
+ # String-specific analysis
944
+ elif field_type == "string":
945
+ strs = [v for v in non_null_values if isinstance(v, str)]
946
+ if strs:
947
+ stats.avg_length = statistics.mean(len(s) for s in strs)
948
+ stats.top_values = Counter(strs).most_common(5)
949
+
950
+ return stats
951
+
952
+ def _detect_change_points(self, values: list[float], window: int = 5) -> list[int]:
953
+ """Detect indices where values change significantly."""
954
+ if len(values) < window * 2:
955
+ return []
956
+
957
+ change_points = []
958
+
959
+ # Calculate overall statistics
960
+ overall_std = statistics.stdev(values) if len(values) > 1 else 0
961
+ if overall_std == 0:
962
+ return []
963
+
964
+ threshold = self.config.variance_threshold * overall_std
965
+
966
+ # Sliding window comparison
967
+ for i in range(window, len(values) - window):
968
+ before_mean = statistics.mean(values[i - window : i])
969
+ after_mean = statistics.mean(values[i : i + window])
970
+
971
+ if abs(after_mean - before_mean) > threshold:
972
+ change_points.append(i)
973
+
974
+ # Deduplicate nearby change points
975
+ if change_points:
976
+ deduped = [change_points[0]]
977
+ for cp in change_points[1:]:
978
+ if cp - deduped[-1] > window:
979
+ deduped.append(cp)
980
+ return deduped
981
+
982
+ return []
983
+
984
+ def _detect_pattern(self, field_stats: dict[str, FieldStats], items: list[dict]) -> str:
985
+ """Detect the data pattern using STATISTICAL analysis (no hardcoded field names).
986
+
987
+ Pattern detection:
988
+ - TIME_SERIES: Has a temporal field (detected by value format) + numeric variance
989
+ - LOGS: Has a high-cardinality string field + low-cardinality categorical field
990
+ - SEARCH_RESULTS: Has a score-like field (bounded numeric, possibly sorted)
991
+ - GENERIC: Default
992
+ """
993
+ # Check for time series pattern using STRUCTURAL detection
994
+ has_timestamp = self._detect_temporal_field(field_stats, items)
995
+
996
+ numeric_fields = [k for k, v in field_stats.items() if v.field_type == "numeric"]
997
+ has_numeric_with_variance = any(
998
+ (field_stats[k].variance is not None and (field_stats[k].variance or 0) > 0)
999
+ for k in numeric_fields
1000
+ )
1001
+
1002
+ if has_timestamp and has_numeric_with_variance:
1003
+ return "time_series"
1004
+
1005
+ # Check for logs pattern using STATISTICAL detection
1006
+ # Logs have: high-cardinality string (message) + low-cardinality categorical (level)
1007
+ has_message_like = False
1008
+ has_level_like = False
1009
+
1010
+ for _name, stats in field_stats.items():
1011
+ if stats.field_type == "string":
1012
+ # High-cardinality string = likely message field
1013
+ if stats.unique_ratio > 0.5 and stats.avg_length and stats.avg_length > 20:
1014
+ has_message_like = True
1015
+ # Low-cardinality string = likely level/status field
1016
+ elif stats.unique_ratio < 0.1 and 2 <= stats.unique_count <= 10:
1017
+ has_level_like = True
1018
+
1019
+ if has_message_like and has_level_like:
1020
+ return "logs"
1021
+
1022
+ # Check for search results pattern using STATISTICAL score detection
1023
+ for _name, stats in field_stats.items():
1024
+ is_score, confidence = _detect_score_field_statistically(stats, items)
1025
+ if is_score and confidence >= 0.5:
1026
+ return "search_results"
1027
+
1028
+ return "generic"
1029
+
1030
+ def _detect_temporal_field(self, field_stats: dict[str, FieldStats], items: list[dict]) -> bool:
1031
+ """Detect if any field contains temporal values (dates/timestamps).
1032
+
1033
+ Uses STRUCTURAL detection based on value format, not field names.
1034
+ """
1035
+ # Check string fields for ISO 8601 patterns
1036
+ iso_datetime_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}")
1037
+ iso_date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}$")
1038
+
1039
+ for name, stats in field_stats.items():
1040
+ if stats.field_type == "string":
1041
+ # Sample some values
1042
+ sample_values = [
1043
+ item.get(name) for item in items[:10] if isinstance(item.get(name), str)
1044
+ ]
1045
+ if sample_values:
1046
+ # Check if values look like dates/datetimes
1047
+ iso_count = sum(
1048
+ 1
1049
+ for v in sample_values
1050
+ if v is not None
1051
+ and (iso_datetime_pattern.match(v) or iso_date_pattern.match(v))
1052
+ )
1053
+ if iso_count / len(sample_values) > 0.5:
1054
+ return True
1055
+
1056
+ # Check numeric fields for Unix timestamp range
1057
+ elif stats.field_type == "numeric":
1058
+ if stats.min_val and stats.max_val:
1059
+ # Unix timestamps (seconds): 1000000000 to 2000000000 (roughly 2001-2033)
1060
+ # Unix timestamps (milliseconds): 1000000000000 to 2000000000000
1061
+ is_unix_seconds = 1000000000 <= stats.min_val <= 2000000000
1062
+ is_unix_millis = 1000000000000 <= stats.min_val <= 2000000000000
1063
+ if is_unix_seconds or is_unix_millis:
1064
+ return True
1065
+
1066
+ return False
1067
+
1068
+ def analyze_crushability(
1069
+ self,
1070
+ items: list[dict],
1071
+ field_stats: dict[str, FieldStats],
1072
+ ) -> CrushabilityAnalysis:
1073
+ """Analyze whether it's SAFE to crush this array.
1074
+
1075
+ The key insight: High variability + No importance signal = DON'T CRUSH.
1076
+
1077
+ We use STATISTICAL detection (no hardcoded field names):
1078
+ 1. ID fields detected by uniqueness + sequential/UUID/entropy patterns
1079
+ 2. Score fields detected by bounded range + sorted order
1080
+ 3. Error items detected by structural outliers (rare fields, rare status values)
1081
+ 4. Numeric anomalies (importance signal)
1082
+ 5. Low uniqueness (safe to sample)
1083
+
1084
+ Returns:
1085
+ CrushabilityAnalysis with decision and reasoning.
1086
+ """
1087
+ signals_present: list[str] = []
1088
+ signals_absent: list[str] = []
1089
+
1090
+ # 1. Detect ID field STATISTICALLY (no hardcoded field names)
1091
+ id_field_name = None
1092
+ id_uniqueness = 0.0
1093
+ id_confidence = 0.0
1094
+ for name, stats in field_stats.items():
1095
+ values = [item.get(name) for item in items if isinstance(item, dict)]
1096
+ is_id, confidence = _detect_id_field_statistically(stats, values)
1097
+ if is_id and confidence > id_confidence:
1098
+ id_field_name = name
1099
+ id_uniqueness = stats.unique_ratio
1100
+ id_confidence = confidence
1101
+
1102
+ has_id_field = id_field_name is not None and id_confidence >= 0.7
1103
+
1104
+ # 2. Detect score/rank field STATISTICALLY (no hardcoded field names)
1105
+ has_score_field = False
1106
+ for name, stats in field_stats.items():
1107
+ is_score, confidence = _detect_score_field_statistically(stats, items)
1108
+ if is_score:
1109
+ has_score_field = True
1110
+ signals_present.append(f"score_field:{name}(conf={confidence:.2f})")
1111
+ break
1112
+ if not has_score_field:
1113
+ signals_absent.append("score_field")
1114
+
1115
+ # 3. Detect error items via STRUCTURAL OUTLIERS (no hardcoded keywords)
1116
+ outlier_indices = _detect_structural_outliers(items)
1117
+ structural_outlier_count = len(outlier_indices)
1118
+
1119
+ if structural_outlier_count > 0:
1120
+ signals_present.append(f"structural_outliers:{structural_outlier_count}")
1121
+ else:
1122
+ signals_absent.append("structural_outliers")
1123
+
1124
+ # 3b. Also detect errors via keywords in content (for log/message-style data)
1125
+ # This catches errors that are in the content but not structural outliers
1126
+ # (e.g., Slack messages where error is in the text field)
1127
+ error_keyword_indices = _detect_error_items_for_preservation(items)
1128
+ keyword_error_count = len(error_keyword_indices)
1129
+
1130
+ if keyword_error_count > 0 and structural_outlier_count == 0:
1131
+ signals_present.append(f"error_keywords:{keyword_error_count}")
1132
+
1133
+ # Combined error count for crushability analysis
1134
+ error_count = max(structural_outlier_count, keyword_error_count)
1135
+
1136
+ # 4. Count numeric anomalies (importance signal)
1137
+ anomaly_count = 0
1138
+ anomaly_indices: set[int] = set()
1139
+ for stats in field_stats.values():
1140
+ if stats.field_type == "numeric" and stats.mean_val is not None and stats.variance:
1141
+ std = stats.variance**0.5
1142
+ if std > 0:
1143
+ threshold = self.config.variance_threshold * std
1144
+ for i, item in enumerate(items):
1145
+ val = item.get(stats.name)
1146
+ if isinstance(val, (int, float)):
1147
+ if abs(val - stats.mean_val) > threshold:
1148
+ anomaly_indices.add(i)
1149
+
1150
+ anomaly_count = len(anomaly_indices)
1151
+ if anomaly_count > 0:
1152
+ signals_present.append(f"anomalies:{anomaly_count}")
1153
+ else:
1154
+ signals_absent.append("anomalies")
1155
+
1156
+ # 5. Compute average string uniqueness (EXCLUDING statistically-detected ID fields)
1157
+ string_stats = [
1158
+ s for s in field_stats.values() if s.field_type == "string" and s.name != id_field_name
1159
+ ]
1160
+ avg_string_uniqueness = (
1161
+ statistics.mean(s.unique_ratio for s in string_stats) if string_stats else 0.0
1162
+ )
1163
+
1164
+ # Compute uniqueness of non-ID numeric fields
1165
+ non_id_numeric_stats = [
1166
+ s for s in field_stats.values() if s.field_type == "numeric" and s.name != id_field_name
1167
+ ]
1168
+ avg_non_id_numeric_uniqueness = (
1169
+ statistics.mean(s.unique_ratio for s in non_id_numeric_stats)
1170
+ if non_id_numeric_stats
1171
+ else 0.0
1172
+ )
1173
+
1174
+ # Combined uniqueness metric (including ID fields)
1175
+ max_uniqueness = max(avg_string_uniqueness, id_uniqueness, 0.0)
1176
+
1177
+ # Non-ID content uniqueness (for detecting repetitive content with unique IDs)
1178
+ non_id_content_uniqueness = max(avg_string_uniqueness, avg_non_id_numeric_uniqueness)
1179
+
1180
+ # 6. Check for change points (importance signal for time series)
1181
+ has_change_points = any(
1182
+ stats.change_points for stats in field_stats.values() if stats.field_type == "numeric"
1183
+ )
1184
+ if has_change_points:
1185
+ signals_present.append("change_points")
1186
+
1187
+ # DECISION LOGIC
1188
+ has_any_signal = len(signals_present) > 0
1189
+
1190
+ # Case 0: Repetitive content with unique IDs
1191
+ # If all non-ID fields are nearly constant, data is safe to sample
1192
+ # even if there's a unique ID field (e.g., status="success" for all items)
1193
+ if non_id_content_uniqueness < 0.1 and has_id_field:
1194
+ signals_present.append("repetitive_content")
1195
+ return CrushabilityAnalysis(
1196
+ crushable=True,
1197
+ confidence=0.85,
1198
+ reason="repetitive_content_with_ids",
1199
+ signals_present=signals_present,
1200
+ signals_absent=signals_absent,
1201
+ has_id_field=has_id_field,
1202
+ id_uniqueness=id_uniqueness,
1203
+ avg_string_uniqueness=avg_string_uniqueness,
1204
+ has_score_field=has_score_field,
1205
+ error_item_count=error_count,
1206
+ anomaly_count=anomaly_count,
1207
+ )
1208
+
1209
+ # Case 1: Low uniqueness - safe to sample (data is repetitive)
1210
+ if max_uniqueness < 0.3:
1211
+ return CrushabilityAnalysis(
1212
+ crushable=True,
1213
+ confidence=0.9,
1214
+ reason="low_uniqueness_safe_to_sample",
1215
+ signals_present=signals_present,
1216
+ signals_absent=signals_absent,
1217
+ has_id_field=has_id_field,
1218
+ id_uniqueness=id_uniqueness,
1219
+ avg_string_uniqueness=avg_string_uniqueness,
1220
+ has_score_field=has_score_field,
1221
+ error_item_count=error_count,
1222
+ anomaly_count=anomaly_count,
1223
+ )
1224
+
1225
+ # Case 2: High uniqueness + ID field + NO signal = DON'T CRUSH
1226
+ # This is the critical case: DB results, file listings, user lists
1227
+ if has_id_field and max_uniqueness > 0.8 and not has_any_signal:
1228
+ return CrushabilityAnalysis(
1229
+ crushable=False,
1230
+ confidence=0.85,
1231
+ reason="unique_entities_no_signal",
1232
+ signals_present=signals_present,
1233
+ signals_absent=signals_absent,
1234
+ has_id_field=has_id_field,
1235
+ id_uniqueness=id_uniqueness,
1236
+ avg_string_uniqueness=avg_string_uniqueness,
1237
+ has_score_field=has_score_field,
1238
+ error_item_count=error_count,
1239
+ anomaly_count=anomaly_count,
1240
+ )
1241
+
1242
+ # Case 3: High uniqueness + has signal = CRUSH using signal
1243
+ if max_uniqueness > 0.8 and has_any_signal:
1244
+ return CrushabilityAnalysis(
1245
+ crushable=True,
1246
+ confidence=0.7,
1247
+ reason="unique_entities_with_signal",
1248
+ signals_present=signals_present,
1249
+ signals_absent=signals_absent,
1250
+ has_id_field=has_id_field,
1251
+ id_uniqueness=id_uniqueness,
1252
+ avg_string_uniqueness=avg_string_uniqueness,
1253
+ has_score_field=has_score_field,
1254
+ error_item_count=error_count,
1255
+ anomaly_count=anomaly_count,
1256
+ )
1257
+
1258
+ # Case 4: Medium uniqueness + no signal = be cautious, don't crush
1259
+ if not has_any_signal:
1260
+ return CrushabilityAnalysis(
1261
+ crushable=False,
1262
+ confidence=0.6,
1263
+ reason="medium_uniqueness_no_signal",
1264
+ signals_present=signals_present,
1265
+ signals_absent=signals_absent,
1266
+ has_id_field=has_id_field,
1267
+ id_uniqueness=id_uniqueness,
1268
+ avg_string_uniqueness=avg_string_uniqueness,
1269
+ has_score_field=has_score_field,
1270
+ error_item_count=error_count,
1271
+ anomaly_count=anomaly_count,
1272
+ )
1273
+
1274
+ # Case 5: Medium uniqueness + has signal = crush with caution
1275
+ return CrushabilityAnalysis(
1276
+ crushable=True,
1277
+ confidence=0.5,
1278
+ reason="medium_uniqueness_with_signal",
1279
+ signals_present=signals_present,
1280
+ signals_absent=signals_absent,
1281
+ has_id_field=has_id_field,
1282
+ id_uniqueness=id_uniqueness,
1283
+ avg_string_uniqueness=avg_string_uniqueness,
1284
+ has_score_field=has_score_field,
1285
+ error_item_count=error_count,
1286
+ anomaly_count=anomaly_count,
1287
+ )
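An end-to-end sketch of this guardrail (data invented; analyze_array() wires the crushability verdict into strategy selection):

analyzer = SmartAnalyzer()
users = [
    {"id": f"user-{i:04d}", "name": f"name {i}", "email": f"u{i}@example.com"}
    for i in range(30)
]
analysis = analyzer.analyze_array(users)
# Unique entities with no score/error/anomaly signal: analysis.crushability.crushable
# is False and analysis.recommended_strategy is CompressionStrategy.SKIP, so the
# array passes through uncompressed.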
1288
+
1289
+ def _select_strategy(
1290
+ self,
1291
+ field_stats: dict[str, FieldStats],
1292
+ pattern: str,
1293
+ item_count: int,
1294
+ crushability: CrushabilityAnalysis | None = None,
1295
+ ) -> CompressionStrategy:
1296
+ """Select optimal compression strategy based on analysis."""
1297
+ if item_count < self.config.min_items_to_analyze:
1298
+ return CompressionStrategy.NONE
1299
+
1300
+ # CRITICAL: Check crushability first
1301
+ if crushability is not None and not crushability.crushable:
1302
+ return CompressionStrategy.SKIP
1303
+
1304
+ if pattern == "time_series":
1305
+ # Check if there are change points worth preserving
1306
+ numeric_fields = [v for v in field_stats.values() if v.field_type == "numeric"]
1307
+ has_change_points = any(f.change_points for f in numeric_fields)
1308
+ if has_change_points:
1309
+ return CompressionStrategy.TIME_SERIES
1310
+
1311
+ if pattern == "logs":
1312
+ # Check if messages are clusterable (low-medium uniqueness)
1313
+ message_field = next(
1314
+ (v for k, v in field_stats.items() if "message" in k.lower()), None
1315
+ )
1316
+ if message_field and message_field.unique_ratio < 0.5:
1317
+ return CompressionStrategy.CLUSTER_SAMPLE
1318
+
1319
+ if pattern == "search_results":
1320
+ return CompressionStrategy.TOP_N
1321
+
1322
+ # Default: smart sampling
1323
+ return CompressionStrategy.SMART_SAMPLE
1324
+
1325
+ def _estimate_reduction(
1326
+ self, field_stats: dict[str, FieldStats], strategy: CompressionStrategy, item_count: int
1327
+ ) -> float:
1328
+ """Estimate token reduction ratio."""
1329
+ if strategy == CompressionStrategy.NONE:
1330
+ return 0.0
1331
+
1332
+ # Count constant fields (will be factored out)
1333
+ constant_ratio = sum(1 for v in field_stats.values() if v.is_constant) / len(field_stats)
1334
+
1335
+ # Estimate based on strategy
1336
+ base_reduction = {
1337
+ CompressionStrategy.TIME_SERIES: 0.7,
1338
+ CompressionStrategy.CLUSTER_SAMPLE: 0.8,
1339
+ CompressionStrategy.TOP_N: 0.6,
1340
+ CompressionStrategy.SMART_SAMPLE: 0.5,
1341
+ }.get(strategy, 0.3)
1342
+
1343
+ # Adjust for constants
1344
+ reduction = base_reduction + (constant_ratio * 0.2)
1345
+
1346
+ return min(reduction, 0.95)
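For instance, the TOP_N strategy (base 0.6) on data where one field in four is constant is estimated at 0.6 + 0.25 * 0.2 = 0.65, and the estimate is always capped at 0.95.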
1347
+
1348
+
1349
+ class SmartCrusher(Transform):
1350
+ """
1351
+ Intelligent tool output compression using statistical analysis.
1352
+
1353
+ Unlike fixed-rule crushing, SmartCrusher:
1354
+ 1. Analyzes JSON structure and computes field statistics
1355
+ 2. Detects data patterns (time series, logs, search results)
1356
+ 3. Identifies constant fields to factor out
1357
+ 4. Finds change points in numeric data to preserve
1358
+ 5. Applies optimal compression strategy per data type
1359
+ 6. Uses RelevanceScorer for semantic matching of user queries
1360
+
1361
+ This results in higher compression with lower information loss.
1362
+ """
1363
+
1364
+ name = "smart_crusher"
1365
+
1366
+ def __init__(
1367
+ self,
1368
+ config: SmartCrusherConfig | None = None,
1369
+ relevance_config: RelevanceScorerConfig | None = None,
1370
+ scorer: RelevanceScorer | None = None,
1371
+ ccr_config: CCRConfig | None = None,
1372
+ ):
1373
+ self.config = config or SmartCrusherConfig()
1374
+ self.analyzer = SmartAnalyzer(self.config)
1375
+
1376
+ # CCR (Compress-Cache-Retrieve) configuration
1377
+ # When no ccr_config is provided, default to caching enabled but markers disabled
1378
+ # This maintains backward compatibility - callers must opt-in to markers
1379
+ if ccr_config is None:
1380
+ self._ccr_config = CCRConfig(
1381
+ enabled=True, # Still cache for potential retrieval
1382
+ inject_retrieval_marker=False, # Don't break JSON parsing by default
1383
+ )
1384
+ else:
1385
+ self._ccr_config = ccr_config
1386
+ self._compression_store: CompressionStore | None = None
1387
+
1388
+ # Feedback loop for learning compression patterns
1389
+ self._feedback: CompressionFeedback | None = None
1390
+
1391
+ # CRITICAL FIX: Lock for thread-safe lazy initialization
1392
+ # Without this, multiple threads could call _get_* methods simultaneously
1393
+ # and potentially create redundant initialization calls.
1394
+ self._lazy_init_lock = threading.Lock()
1395
+
1396
+ # Initialize relevance scorer
1397
+ if scorer is not None:
1398
+ self._scorer = scorer
1399
+ else:
1400
+ rel_config = relevance_config or RelevanceScorerConfig()
1401
+ # Build kwargs based on tier - BM25 params only apply to bm25 tier
1402
+ scorer_kwargs = {}
1403
+ if rel_config.tier == "bm25":
1404
+ scorer_kwargs = {"k1": rel_config.bm25_k1, "b": rel_config.bm25_b}
1405
+ elif rel_config.tier == "hybrid":
1406
+ scorer_kwargs = {
1407
+ "alpha": rel_config.hybrid_alpha,
1408
+ "adaptive": rel_config.adaptive_alpha,
1409
+ }
1410
+ self._scorer = create_scorer(tier=rel_config.tier, **scorer_kwargs)
1411
+ # Use threshold from config, or default from RelevanceScorerConfig
1412
+ rel_cfg = relevance_config or RelevanceScorerConfig()
1413
+ self._relevance_threshold = rel_cfg.relevance_threshold
1414
+
1415
+ # NOTE: Error detection combines structural outlier detection (_detect_structural_outliers)
1416
+ # with keyword-based preservation (_detect_error_items_for_preservation); the structural approach scales to any data domain
1417
+
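A hedged construction sketch for the constructor above. The keyword arguments and config fields used (tier, hybrid_alpha, enabled, inject_retrieval_marker) all appear in this file, but the import path is an assumption, and it is also assumed that the config objects accept these fields as keyword arguments.

# Hedged sketch, not package documentation: import path assumed for illustration.
from headroom.transforms.smart_crusher import (  # path is an assumption
    CCRConfig,
    RelevanceScorerConfig,
    SmartCrusher,
    SmartCrusherConfig,
)

crusher = SmartCrusher(
    config=SmartCrusherConfig(),
    relevance_config=RelevanceScorerConfig(tier="hybrid", hybrid_alpha=0.5),
    ccr_config=CCRConfig(enabled=True, inject_retrieval_marker=False),
)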
1418
+ def crush(self, content: str, query: str = "") -> CrushResult:
1419
+ """Crush content string directly (for use by ContentRouter).
1420
+
1421
+ This is a simplified interface for compressing a single content string,
1422
+ used by ContentRouter when routing JSON arrays to SmartCrusher.
1423
+
1424
+ Args:
1425
+ content: JSON string content to compress.
1426
+ query: Query context for relevance-based compression.
1427
+
1428
+ Returns:
1429
+ CrushResult with compressed content and metadata.
1430
+ """
1431
+ compressed, was_modified, analysis_info = self._smart_crush_content(
1432
+ content, query_context=query
1433
+ )
1434
+ return CrushResult(
1435
+ compressed=compressed,
1436
+ original=content,
1437
+ was_modified=was_modified,
1438
+ strategy=analysis_info or "passthrough",
1439
+ )
1440
+
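Continuing the construction sketch above, crush() takes a JSON string plus an optional query and returns a CrushResult whose fields (compressed, original, was_modified, strategy) match the return statement above. The strategy string in the comment is only an example of the "strategy(before->after)" format built later in this file.

import json

rows = [{"id": i, "status": "ok", "latency_ms": 20 + (i % 3)} for i in range(200)]
result = crusher.crush(json.dumps(rows), query="latency spikes")
if result.was_modified:
    print(result.strategy)                     # e.g. "smart_sample(200->25)"
    print(len(json.loads(result.compressed)))  # far fewer than the original 200 items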
1441
+ def _get_compression_store(self) -> CompressionStore:
1442
+ """Get the compression store for CCR (lazy initialization).
1443
+
1444
+ CRITICAL FIX: Thread-safe double-checked locking pattern.
1445
+ """
1446
+ if self._compression_store is None:
1447
+ with self._lazy_init_lock:
1448
+ # Double-check after acquiring lock
1449
+ if self._compression_store is None:
1450
+ self._compression_store = get_compression_store(
1451
+ max_entries=self._ccr_config.store_max_entries,
1452
+ default_ttl=self._ccr_config.store_ttl_seconds,
1453
+ )
1454
+ return self._compression_store
1455
+
1456
+ def _get_feedback(self) -> CompressionFeedback:
1457
+ """Get the feedback analyzer (lazy initialization).
1458
+
1459
+ CRITICAL FIX: Thread-safe double-checked locking pattern.
1460
+ """
1461
+ if self._feedback is None:
1462
+ with self._lazy_init_lock:
1463
+ if self._feedback is None:
1464
+ self._feedback = get_compression_feedback()
1465
+ return self._feedback
1466
+
1467
+ def _get_telemetry(self) -> TelemetryCollector:
1468
+ """Get the telemetry collector (lazy initialization).
1469
+
1470
+ CRITICAL FIX: Thread-safe double-checked locking pattern.
1471
+ """
1472
+ # Use getattr to avoid hasattr race condition
1473
+ if getattr(self, "_telemetry", None) is None:
1474
+ with self._lazy_init_lock:
1475
+ if getattr(self, "_telemetry", None) is None:
1476
+ self._telemetry = get_telemetry_collector()
1477
+ return self._telemetry
1478
+
1479
+ def _get_toin(self) -> ToolIntelligenceNetwork:
1480
+ """Get the TOIN instance (lazy initialization).
1481
+
1482
+ CRITICAL FIX: Thread-safe double-checked locking pattern.
1483
+ """
1484
+ # Use getattr to avoid hasattr race condition
1485
+ if getattr(self, "_toin", None) is None:
1486
+ with self._lazy_init_lock:
1487
+ if getattr(self, "_toin", None) is None:
1488
+ self._toin = get_toin()
1489
+ return self._toin
1490
+
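The four lazy getters above all follow the same double-checked locking shape. A minimal standalone sketch of that pattern, not the package's code:

import threading

class LazyHolder:
    """Illustrative only: the lock is taken on the slow path, then the
    attribute is re-checked so only one thread performs initialization."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._resource = None

    def get(self):
        if self._resource is None:             # fast path, no lock
            with self._lock:
                if self._resource is None:     # re-check under the lock
                    self._resource = object()  # expensive init would go here
        return self._resource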
1491
+ def _record_telemetry(
1492
+ self,
1493
+ items: list[dict],
1494
+ result: list,
1495
+ analysis: ArrayAnalysis,
1496
+ plan: CompressionPlan,
1497
+ tool_name: str | None = None,
1498
+ ) -> None:
1499
+ """Record compression telemetry for the data flywheel.
1500
+
1501
+ This collects anonymized statistics about compression patterns to
1502
+ enable cross-user learning and improve compression over time.
1503
+
1504
+ Privacy guarantees:
1505
+ - No actual data values are stored
1506
+ - Tool names can be hashed
1507
+ - Only structural patterns are captured
1508
+ """
1509
+ try:
1510
+ telemetry = self._get_telemetry()
1511
+
1512
+ # Calculate what was kept
1513
+ kept_first_n = sum(1 for i in plan.keep_indices if i < 3)
1514
+ kept_last_n = sum(1 for i in plan.keep_indices if i >= len(items) - 2)
1515
+
1516
+ # Count error items in result
1517
+ error_indices = set(_detect_error_items_for_preservation(items))
1518
+ kept_errors = sum(1 for i in plan.keep_indices if i in error_indices)
1519
+
1520
+ # Count anomalies (approximate from change points)
1521
+ anomaly_count = 0
1522
+ for stats in analysis.field_stats.values():
1523
+ if stats.change_points:
1524
+ anomaly_count += len(stats.change_points)
1525
+ kept_anomalies = min(anomaly_count, len(plan.keep_indices))
1526
+
1527
+ # Crushability info
1528
+ crushability_score = None
1529
+ crushability_reason = None
1530
+ if analysis.crushability:
1531
+ crushability_score = analysis.crushability.confidence
1532
+ crushability_reason = analysis.crushability.reason
1533
+
1534
+ # Record the event
1535
+ telemetry.record_compression(
1536
+ items=items[:100], # Sample for structure analysis
1537
+ original_count=len(items),
1538
+ compressed_count=len(result),
1539
+ original_tokens=0, # Not available here
1540
+ compressed_tokens=0, # Not available here
1541
+ strategy=analysis.recommended_strategy.value,
1542
+ tool_name=tool_name,
1543
+ strategy_reason=analysis.detected_pattern,
1544
+ crushability_score=crushability_score,
1545
+ crushability_reason=crushability_reason,
1546
+ kept_first_n=kept_first_n,
1547
+ kept_last_n=kept_last_n,
1548
+ kept_errors=kept_errors,
1549
+ kept_anomalies=kept_anomalies,
1550
+ kept_by_relevance=0, # Would need to track separately
1551
+ kept_by_score=0, # Would need to track separately
1552
+ )
1553
+ except Exception:
1554
+ # Telemetry should never break compression
1555
+ pass
1556
+
1557
+ def _prioritize_indices(
1558
+ self,
1559
+ keep_indices: set[int],
1560
+ items: list[dict],
1561
+ n: int,
1562
+ analysis: ArrayAnalysis | None = None,
1563
+ max_items: int | None = None,
1564
+ field_semantics: dict[str, FieldSemantics] | None = None,
1565
+ ) -> set[int]:
1566
+ """Prioritize indices when we exceed max_items, ALWAYS keeping critical items.
1567
+
1568
+ Priority order:
1569
+ 1. ALL error items (non-negotiable) - items with error keywords
1570
+ 2. ALL structural outliers (non-negotiable) - items with rare fields/status values
1571
+ 3. ALL numeric anomalies (non-negotiable) - e.g., unusual values like 999999
1572
+ 4. ALL items with important values (learned) - TOIN field semantics
1573
+ 5. First 3 items (context)
1574
+ 6. Last 2 items (context)
1575
+ 7. Other important items by index order
1576
+
1577
+ Uses BOTH keyword detection (for preservation guarantee) AND statistical detection,
1578
+ PLUS learned field semantics from TOIN for zero-latency signal detection.
1579
+
1580
+ HIGH FIX: Note that this function may return MORE items than effective_max
1581
+ when critical items (errors, outliers, anomalies) exceed the limit. This is
1582
+ intentional to preserve the quality guarantee. A warning is logged when this
1583
+ happens to help diagnose cases where compression is less effective than expected.
1584
+
1585
+ Args:
1586
+ keep_indices: Initial set of indices to keep.
1587
+ items: The items being compressed.
1588
+ n: Total number of items.
1589
+ analysis: Optional analysis results for anomaly detection.
1590
+ max_items: Thread-safe max items limit (defaults to config value).
1591
+ field_semantics: Optional learned field semantics from TOIN.
1592
+
1593
+ Returns:
1594
+ Set of indices to keep (may exceed max_items if critical items require it).
1595
+ """
1596
+ # Use provided max_items or fall back to config
1597
+ effective_max = max_items if max_items is not None else self.config.max_items_after_crush
1598
+
1599
+ if len(keep_indices) <= effective_max:
1600
+ return keep_indices
1601
+
1602
+ # Use provided field_semantics or fall back to instance variable (set by crush())
1603
+ effective_field_semantics = field_semantics or getattr(
1604
+ self, "_current_field_semantics", None
1605
+ )
1606
+
1607
+ # Identify error items using KEYWORD detection (preservation guarantee)
1608
+ # This ensures ALL error items are kept, regardless of frequency
1609
+ error_indices = set(_detect_error_items_for_preservation(items))
1610
+
1611
+ # Identify structural outlier indices using STATISTICAL detection
1612
+ # (items with rare fields or rare status values)
1613
+ outlier_indices = set(_detect_structural_outliers(items))
1614
+
1615
+ # Identify numeric anomalies (MUST keep ALL of them)
1616
+ anomaly_indices = set()
1617
+ if analysis and analysis.field_stats:
1618
+ for field_name, stats in analysis.field_stats.items():
1619
+ if stats.field_type == "numeric" and stats.mean_val is not None and stats.variance:
1620
+ std = stats.variance**0.5
1621
+ if std > 0:
1622
+ threshold = self.config.variance_threshold * std
1623
+ for i, item in enumerate(items):
1624
+ val = item.get(field_name)
1625
+ if isinstance(val, (int, float)):
1626
+ if abs(val - stats.mean_val) > threshold:
1627
+ anomaly_indices.add(i)
1628
+
1629
+ # === TOIN Evolution: Identify items with important values (learned) ===
1630
+ # Uses learned field semantics for zero-latency signal detection
1631
+ learned_important_indices: set[int] = set()
1632
+ if effective_field_semantics:
1633
+ learned_important_indices = set(
1634
+ _detect_items_by_learned_semantics(items, effective_field_semantics)
1635
+ )
1636
+
1637
+ # Start with all critical items (these are non-negotiable)
1638
+ # Error items are ALWAYS preserved (quality guarantee)
1639
+ prioritized = error_indices | outlier_indices | anomaly_indices | learned_important_indices
1640
+
1641
+ # HIGH FIX: Log warning if critical items alone exceed the limit
1642
+ # This helps diagnose why compression may be less effective than expected
1643
+ critical_count = len(prioritized)
1644
+ if critical_count > effective_max:
1645
+ logger.warning(
1646
+ "Critical items (%d) exceed max_items (%d): errors=%d outliers=%d anomalies=%d learned=%d. "
1647
+ "Quality guarantee takes precedence - keeping all critical items.",
1648
+ critical_count,
1649
+ effective_max,
1650
+ len(error_indices),
1651
+ len(outlier_indices),
1652
+ len(anomaly_indices),
1653
+ len(learned_important_indices),
1654
+ )
1655
+
1656
+ # Add first/last items if we have room
1657
+ remaining_slots = effective_max - len(prioritized)
1658
+ if remaining_slots > 0:
1659
+ # First 3 items
1660
+ for i in range(min(3, n)):
1661
+ if i not in prioritized and remaining_slots > 0:
1662
+ prioritized.add(i)
1663
+ remaining_slots -= 1
1664
+ # Last 2 items
1665
+ for i in range(max(0, n - 2), n):
1666
+ if i not in prioritized and remaining_slots > 0:
1667
+ prioritized.add(i)
1668
+ remaining_slots -= 1
1669
+
1670
+ # Fill remaining slots with other important indices (by index order)
1671
+ if remaining_slots > 0:
1672
+ other_indices = sorted(keep_indices - prioritized)
1673
+ for i in other_indices:
1674
+ if remaining_slots <= 0:
1675
+ break
1676
+ prioritized.add(i)
1677
+ remaining_slots -= 1
1678
+
1679
+ return prioritized
1680
+
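A simplified standalone sketch of the fill order described above: critical indices first (never dropped, even past the cap), then first/last context items, then the remaining candidates until the cap is reached. It omits the outlier, anomaly, and learned-semantics detection the real method performs.

def prioritize(keep: set[int], critical: set[int], n: int, cap: int) -> set[int]:
    kept = set(critical)                       # non-negotiable, may exceed cap
    slots = cap - len(kept)
    context = [*range(min(3, n)), *range(max(0, n - 2), n)]  # first 3, last 2
    for i in context + sorted(keep - critical):
        if slots <= 0:
            break
        if i not in kept:
            kept.add(i)
            slots -= 1
    return kept

print(sorted(prioritize(set(range(40)), {7, 31}, n=40, cap=10)))
# [0, 1, 2, 3, 4, 5, 7, 31, 38, 39]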
1681
+ def should_apply(
1682
+ self,
1683
+ messages: list[dict[str, Any]],
1684
+ tokenizer: Tokenizer,
1685
+ **kwargs: Any,
1686
+ ) -> bool:
1687
+ """Check if any tool messages would benefit from smart crushing."""
1688
+ if not self.config.enabled:
1689
+ return False
1690
+
1691
+ for msg in messages:
1692
+ # OpenAI style: role="tool"
1693
+ if msg.get("role") == "tool":
1694
+ content = msg.get("content", "")
1695
+ if isinstance(content, str):
1696
+ tokens = tokenizer.count_text(content)
1697
+ if tokens > self.config.min_tokens_to_crush:
1698
+ # Check if it's JSON with arrays
1699
+ parsed, success = safe_json_loads(content)
1700
+ if success and self._has_crushable_arrays(parsed):
1701
+ return True
1702
+
1703
+ # Anthropic style: role="user" with tool_result content blocks
1704
+ content = msg.get("content")
1705
+ if isinstance(content, list):
1706
+ for block in content:
1707
+ if isinstance(block, dict) and block.get("type") == "tool_result":
1708
+ tool_content = block.get("content", "")
1709
+ if isinstance(tool_content, str):
1710
+ tokens = tokenizer.count_text(tool_content)
1711
+ if tokens > self.config.min_tokens_to_crush:
1712
+ parsed, success = safe_json_loads(tool_content)
1713
+ if success and self._has_crushable_arrays(parsed):
1714
+ return True
1715
+
1716
+ return False
1717
+
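For reference, these are the two tool-output shapes the check above looks for, with toy payloads; real content would also need to exceed min_tokens_to_crush and contain a crushable array of dicts.

openai_style = {
    "role": "tool",
    "content": '[{"id": 1, "status": "ok"}, {"id": 2, "status": "ok"}]',
}
anthropic_style = {
    "role": "user",
    "content": [
        {"type": "tool_result", "content": '[{"id": 1, "status": "ok"}]'},
    ],
}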
1718
+ def _has_crushable_arrays(self, data: Any, depth: int = 0) -> bool:
1719
+ """Check if data contains arrays large enough to crush."""
1720
+ if depth > 5:
1721
+ return False
1722
+
1723
+ if isinstance(data, list):
1724
+ if len(data) >= self.config.min_items_to_analyze:
1725
+ if data and isinstance(data[0], dict):
1726
+ return True
1727
+ for item in data[:10]: # Check first few items
1728
+ if self._has_crushable_arrays(item, depth + 1):
1729
+ return True
1730
+
1731
+ elif isinstance(data, dict):
1732
+ for value in data.values():
1733
+ if self._has_crushable_arrays(value, depth + 1):
1734
+ return True
1735
+
1736
+ return False
1737
+
1738
+ def apply(
1739
+ self,
1740
+ messages: list[dict[str, Any]],
1741
+ tokenizer: Tokenizer,
1742
+ **kwargs: Any,
1743
+ ) -> TransformResult:
1744
+ """Apply smart crushing to messages."""
1745
+ tokens_before = tokenizer.count_messages(messages)
1746
+ result_messages = deep_copy_messages(messages)
1747
+ transforms_applied: list[str] = []
1748
+ markers_inserted: list[str] = []
1749
+ warnings: list[str] = []
1750
+
1751
+ # Extract query context from recent user messages for relevance scoring
1752
+ query_context = self._extract_context_from_messages(result_messages)
1753
+
1754
+ crushed_count = 0
1755
+
1756
+ for msg in result_messages:
1757
+ # OpenAI style
1758
+ if msg.get("role") == "tool":
1759
+ content = msg.get("content", "")
1760
+ if not isinstance(content, str):
1761
+ continue
1762
+
1763
+ tokens = tokenizer.count_text(content)
1764
+ if tokens <= self.config.min_tokens_to_crush:
1765
+ continue
1766
+
1767
+ crushed, was_modified, analysis_info = self._smart_crush_content(
1768
+ content, query_context
1769
+ )
1770
+
1771
+ if was_modified:
1772
+ original_hash = compute_short_hash(content)
1773
+ marker = create_tool_digest_marker(original_hash)
1774
+ msg["content"] = crushed + "\n" + marker
1775
+ crushed_count += 1
1776
+ markers_inserted.append(marker)
1777
+ if analysis_info:
1778
+ transforms_applied.append(f"smart:{analysis_info}")
1779
+
1780
+ # Anthropic style
1781
+ content = msg.get("content")
1782
+ if isinstance(content, list):
1783
+ for i, block in enumerate(content):
1784
+ if not isinstance(block, dict):
1785
+ continue
1786
+ if block.get("type") != "tool_result":
1787
+ continue
1788
+
1789
+ tool_content = block.get("content", "")
1790
+ if not isinstance(tool_content, str):
1791
+ continue
1792
+
1793
+ tokens = tokenizer.count_text(tool_content)
1794
+ if tokens <= self.config.min_tokens_to_crush:
1795
+ continue
1796
+
1797
+ crushed, was_modified, analysis_info = self._smart_crush_content(
1798
+ tool_content, query_context
1799
+ )
1800
+
1801
+ if was_modified:
1802
+ original_hash = compute_short_hash(tool_content)
1803
+ marker = create_tool_digest_marker(original_hash)
1804
+ content[i]["content"] = crushed + "\n" + marker
1805
+ crushed_count += 1
1806
+ markers_inserted.append(marker)
1807
+ if analysis_info:
1808
+ transforms_applied.append(f"smart:{analysis_info}")
1809
+
1810
+ if crushed_count > 0:
1811
+ transforms_applied.insert(0, f"smart_crush:{crushed_count}")
1812
+
1813
+ tokens_after = tokenizer.count_messages(result_messages)
1814
+
1815
+ return TransformResult(
1816
+ messages=result_messages,
1817
+ tokens_before=tokens_before,
1818
+ tokens_after=tokens_after,
1819
+ transforms_applied=transforms_applied,
1820
+ markers_inserted=markers_inserted,
1821
+ warnings=warnings,
1822
+ )
1823
+
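A hedged end-to-end sketch of apply(). The stub below stands in for the Tokenizer protocol, whose only methods used in this file are count_text and count_messages; whether crushing actually triggers depends on the configured thresholds. SmartCrusher is assumed to be imported as in the earlier sketch.

import json

class StubTokenizer:
    """Whitespace token counter standing in for the real Tokenizer."""

    def count_text(self, text: str) -> int:
        return len(text.split())

    def count_messages(self, messages: list[dict]) -> int:
        return sum(self.count_text(str(m.get("content", ""))) for m in messages)

rows = [{"id": i, "status": "ok"} for i in range(300)]
messages = [
    {"role": "user", "content": "why is checkout latency spiking?"},
    {"role": "tool", "content": json.dumps(rows)},
]
result = crusher.apply(messages, StubTokenizer())
print(result.tokens_before, "->", result.tokens_after)
print(result.transforms_applied)  # e.g. ["smart_crush:1", "smart:smart_sample(300->25)"]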
1824
+ def _extract_context_from_messages(self, messages: list[dict[str, Any]]) -> str:
1825
+ """Extract query context from recent messages for relevance scoring.
1826
+
1827
+ Builds a context string from:
1828
+ - Recent user messages (what the user is asking about)
1829
+ - Recent tool call arguments (what data was requested)
1830
+
1831
+ This context is used by RelevanceScorer to determine which items
1832
+ to preserve during crushing.
1833
+
1834
+ Args:
1835
+ messages: Full message list.
1836
+
1837
+ Returns:
1838
+ Context string for relevance scoring.
1839
+ """
1840
+ context_parts: list[str] = []
1841
+
1842
+ # Look at last 5 user messages (most relevant to recent tool calls)
1843
+ user_message_count = 0
1844
+ for msg in reversed(messages):
1845
+ if msg.get("role") == "user":
1846
+ content = msg.get("content")
1847
+ if isinstance(content, str):
1848
+ context_parts.append(content)
1849
+ elif isinstance(content, list):
1850
+ # Anthropic style - extract from text blocks
1851
+ for block in content:
1852
+ if isinstance(block, dict) and block.get("type") == "text":
1853
+ text = block.get("text", "")
1854
+ if text:
1855
+ context_parts.append(text)
1856
+
1857
+ user_message_count += 1
1858
+ if user_message_count >= 5:
1859
+ break
1860
+
1861
+ # Also check assistant tool_calls for function arguments
1862
+ if msg.get("role") == "assistant" and msg.get("tool_calls"):
1863
+ for tc in msg.get("tool_calls", []):
1864
+ if isinstance(tc, dict):
1865
+ func = tc.get("function", {})
1866
+ args = func.get("arguments", "")
1867
+ if isinstance(args, str) and args:
1868
+ context_parts.append(args)
1869
+
1870
+ return " ".join(context_parts)
1871
+
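To illustrate what the helper above collects: given a toy history, the context is the recent user text plus any tool-call argument strings, joined with spaces, most recent message first because the scan walks the list in reverse.

messages = [
    {"role": "user", "content": "show errors for service auth-api"},
    {
        "role": "assistant",
        "tool_calls": [
            {"function": {"name": "search_logs", "arguments": '{"service": "auth-api"}'}}
        ],
    },
]
# Resulting context for this toy history:
# '{"service": "auth-api"} show errors for service auth-api'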
1872
+ def _smart_crush_content(
1873
+ self, content: str, query_context: str = "", tool_name: str | None = None
1874
+ ) -> tuple[str, bool, str]:
1875
+ """
1876
+ Apply smart crushing to content.
1877
+
1878
+ Handles both JSON (existing SmartCrusher logic) and plain text content
1879
+ (search results, logs, generic text) using specialized compressors.
1880
+
1881
+ Args:
1882
+ content: Content to crush (JSON or plain text).
1883
+ query_context: Context string from user messages for relevance scoring.
1884
+ tool_name: Name of the tool that produced this output.
1885
+
1886
+ Returns:
1887
+ Tuple of (crushed_content, was_modified, analysis_info).
1888
+ """
1889
+ parsed, success = safe_json_loads(content)
1890
+ if not success:
1891
+ # Not JSON - pass through unchanged
1892
+ # Text compression utilities (SearchCompressor, LogCompressor, TextCompressor)
1893
+ # are available as standalone tools for applications to use explicitly
1894
+ return content, False, ""
1895
+
1896
+ # Recursively process and crush arrays
1897
+ crushed, info, ccr_markers = self._process_value(
1898
+ parsed, query_context=query_context, tool_name=tool_name
1899
+ )
1900
+
1901
+ result = safe_json_dumps(crushed, indent=None)
1902
+ was_modified = result != content.strip()
1903
+
1904
+ # CCR: Inject retrieval markers if compression happened and CCR is enabled
1905
+ if was_modified and ccr_markers and self._ccr_config.inject_retrieval_marker:
1906
+ for ccr_hash, original_count, compressed_count in ccr_markers:
1907
+ marker = self._ccr_config.marker_template.format(
1908
+ original_count=original_count,
1909
+ compressed_count=compressed_count,
1910
+ hash=ccr_hash,
1911
+ )
1912
+ result += marker
1913
+
1914
+ return result, was_modified, info
1915
+
1916
+ def _process_value(
1917
+ self, value: Any, depth: int = 0, query_context: str = "", tool_name: str | None = None
1918
+ ) -> tuple[Any, str, list[tuple[str, int, int]]]:
1919
+ """Recursively process a value, crushing arrays where appropriate.
1920
+
1921
+ Returns:
1922
+ Tuple of (processed_value, info_string, ccr_markers).
1923
+ ccr_markers is a list of (hash, original_count, compressed_count) tuples.
1924
+ """
1925
+ info_parts = []
1926
+ ccr_markers: list[tuple[str, int, int]] = []
1927
+
1928
+ if isinstance(value, list):
1929
+ # Check if this array should be crushed
1930
+ # Must have enough items AND all items must be dicts (not mixed types)
1931
+ all_dicts = value and all(isinstance(item, dict) for item in value)
1932
+ if len(value) >= self.config.min_items_to_analyze and all_dicts:
1933
+ crushed, strategy, ccr_hash = self._crush_array(value, query_context, tool_name)
1934
+ info_parts.append(f"{strategy}({len(value)}->{len(crushed)})")
1935
+
1936
+ # Track CCR marker for later injection
1937
+ if ccr_hash:
1938
+ ccr_markers.append((ccr_hash, len(value), len(crushed)))
1939
+
1940
+ return crushed, ",".join(info_parts), ccr_markers
1941
+ else:
1942
+ # Process items recursively
1943
+ processed = []
1944
+ for item in value:
1945
+ p_item, p_info, p_markers = self._process_value(
1946
+ item, depth + 1, query_context, tool_name
1947
+ )
1948
+ processed.append(p_item)
1949
+ if p_info:
1950
+ info_parts.append(p_info)
1951
+ ccr_markers.extend(p_markers)
1952
+ return processed, ",".join(info_parts), ccr_markers
1953
+
1954
+ elif isinstance(value, dict):
1955
+ # Process values recursively
1956
+ processed_dict: dict[str, Any] = {}
1957
+ for k, v in value.items():
1958
+ p_val, p_info, p_markers = self._process_value(
1959
+ v, depth + 1, query_context, tool_name
1960
+ )
1961
+ processed_dict[k] = p_val
1962
+ if p_info:
1963
+ info_parts.append(p_info)
1964
+ ccr_markers.extend(p_markers)
1965
+ return processed_dict, ",".join(info_parts), ccr_markers
1966
+
1967
+ else:
1968
+ return value, "", []
1969
+
1970
+ def _crush_array(
1971
+ self, items: list[dict], query_context: str = "", tool_name: str | None = None
1972
+ ) -> tuple[list, str, str | None]:
1973
+ """Crush an array using statistical analysis and relevance scoring.
1974
+
1975
+ IMPORTANT: If crushability analysis determines it's not safe to crush
1976
+ (high variability + no importance signal), returns original array unchanged.
1977
+
1978
+ TOIN-aware: Consults the Tool Output Intelligence Network for cross-user
1979
+ learned patterns. High retrieval rate across all users → compress less.
1980
+
1981
+ Feedback-aware: Uses learned patterns to adjust compression aggressiveness.
1982
+ High retrieval rate for a tool → compress less aggressively.
1983
+
1984
+ Returns:
1985
+ Tuple of (crushed_items, strategy_info, ccr_hash).
1986
+ ccr_hash is the hash for retrieval if CCR is enabled, None otherwise.
1987
+ """
1988
+ # BOUNDARY CHECK: If already at or below max_items, no compression needed
1989
+ if len(items) <= self.config.max_items_after_crush:
1990
+ return items, "none:at_limit", None
1991
+
1992
+ # Get feedback hints if enabled
1993
+ # THREAD-SAFETY: Use a local effective_max_items instead of mutating shared config
1994
+ effective_max_items = self.config.max_items_after_crush
1995
+ hints_applied = False
1996
+ toin_hint_applied = False
1997
+
1998
+ # Create ToolSignature for TOIN lookup
1999
+ tool_signature = ToolSignature.from_items(items)
2000
+
2001
+ # TOIN: Get cross-user learned recommendations
2002
+ toin = self._get_toin()
2003
+ toin_hint = toin.get_recommendation(tool_signature, query_context)
2004
+
2005
+ if toin_hint.skip_compression:
2006
+ return items, f"skip:toin({toin_hint.reason})", None
2007
+
2008
+ # Apply TOIN recommendations if from network or local learning
2009
+ toin_preserve_fields: list[str] = []
2010
+ toin_recommended_strategy: str | None = None
2011
+ toin_compression_level: str | None = None
2012
+ # LOW FIX #21: Use configurable threshold instead of hardcoded 0.5
2013
+ if (
2014
+ toin_hint.source in ("network", "local")
2015
+ and toin_hint.confidence >= self.config.toin_confidence_threshold
2016
+ ):
2017
+ # TOIN recommendations take precedence over local feedback
2018
+ effective_max_items = toin_hint.max_items
2019
+ toin_preserve_fields = toin_hint.preserve_fields # Fields to never remove
2020
+ toin_hint_applied = True
2021
+ # Store strategy and compression level for later use
2022
+ if toin_hint.recommended_strategy != "default":
2023
+ toin_recommended_strategy = toin_hint.recommended_strategy
2024
+ if toin_hint.compression_level != "moderate":
2025
+ toin_compression_level = toin_hint.compression_level
2026
+
2027
+ # === TOIN Evolution: Extract field semantics for signal detection ===
2028
+ # Store temporarily on instance for use in _prioritize_indices
2029
+ # This enables learned signal detection without changing all method signatures
2030
+ self._current_field_semantics = (
2031
+ toin_hint.field_semantics if toin_hint.field_semantics else None
2032
+ )
2033
+
2034
+ # Local feedback hints (if TOIN didn't apply)
2035
+ if not toin_hint_applied and self.config.use_feedback_hints and tool_name:
2036
+ feedback = self._get_feedback()
2037
+ hints = feedback.get_compression_hints(tool_name)
2038
+
2039
+ # Check if hints recommend skipping compression
2040
+ if hints.skip_compression:
2041
+ return items, f"skip:feedback({hints.reason})", None
2042
+
2043
+ # Adjust max_items based on feedback
2044
+ if hints.suggested_items is not None:
2045
+ effective_max_items = hints.suggested_items
2046
+ hints_applied = True
2047
+
2048
+ # Use preserve_fields from local feedback (hash them for TOIN compatibility)
2049
+ # Note: CompressionFeedback stores actual field names, but _plan methods
2050
+ # expect SHA256[:8] hashes for privacy-preserving comparison
2051
+ if hints.preserve_fields:
2052
+ toin_preserve_fields = [_hash_field_name(field) for field in hints.preserve_fields]
2053
+
2054
+ # Use recommended_strategy from local feedback if not already set by TOIN
2055
+ if hints.recommended_strategy and not toin_recommended_strategy:
2056
+ toin_recommended_strategy = hints.recommended_strategy
2057
+
2058
+ try:
2059
+ # Analyze the array (includes crushability check)
2060
+ analysis = self.analyzer.analyze_array(items)
2061
+
2062
+ # CRITICAL: If not crushable, return original array unchanged
2063
+ if analysis.recommended_strategy == CompressionStrategy.SKIP:
2064
+ reason = ""
2065
+ if analysis.crushability:
2066
+ reason = f"skip:{analysis.crushability.reason}"
2067
+ return items, reason, None
2068
+
2069
+ # Apply TOIN strategy recommendation if available
2070
+ # TOIN learns which strategies work best from cross-user patterns
2071
+ if toin_recommended_strategy:
2072
+ try:
2073
+ toin_strategy = CompressionStrategy(toin_recommended_strategy)
2074
+ # Only override if TOIN suggests a valid non-SKIP strategy
2075
+ if toin_strategy != CompressionStrategy.SKIP:
2076
+ analysis.recommended_strategy = toin_strategy
2077
+ except ValueError:
2078
+ pass # Invalid strategy name, keep analyzer's choice
2079
+
2080
+ # Apply TOIN compression level to adjust effective_max_items
2081
+ if toin_compression_level:
2082
+ if toin_compression_level == "none":
2083
+ # Don't compress - return original
2084
+ return items, "skip:toin_level_none", None
2085
+ elif toin_compression_level == "conservative":
2086
+ # Be conservative - keep more items
2087
+ effective_max_items = max(effective_max_items, min(50, len(items) // 2))
2088
+ elif toin_compression_level == "aggressive":
2089
+ # Be aggressive - keep fewer items
2090
+ effective_max_items = min(effective_max_items, 15)
2091
+
2092
+ # Create compression plan with relevance scoring
2093
+ # Pass TOIN preserve_fields so items with those fields get priority
2094
+ # Pass effective_max_items for thread-safe compression
2095
+ plan = self._create_plan(
2096
+ analysis,
2097
+ items,
2098
+ query_context,
2099
+ preserve_fields=toin_preserve_fields or None,
2100
+ effective_max_items=effective_max_items,
2101
+ )
2102
+
2103
+ # Execute compression
2104
+ result = self._execute_plan(plan, items, analysis)
2105
+
2106
+ # CCR: Store original content for retrieval if enabled
2107
+ ccr_hash = None
2108
+ if (
2109
+ self._ccr_config.enabled
2110
+ and len(items) >= self._ccr_config.min_items_to_cache
2111
+ and len(result) < len(items) # Only cache if compression actually happened
2112
+ ):
2113
+ store = self._get_compression_store()
2114
+ original_json = json.dumps(items, default=str)
2115
+ compressed_json = json.dumps(result, default=str)
2116
+
2117
+ ccr_hash = store.store(
2118
+ original=original_json,
2119
+ compressed=compressed_json,
2120
+ original_item_count=len(items),
2121
+ compressed_item_count=len(result),
2122
+ tool_name=tool_name,
2123
+ query_context=query_context,
2124
+ # CRITICAL: Pass the tool_signature_hash so retrieval events
2125
+ # can be correlated with compression events in TOIN
2126
+ tool_signature_hash=tool_signature.structure_hash,
2127
+ compression_strategy=analysis.recommended_strategy.value,
2128
+ )
2129
+
2130
+ # Record compression event for feedback loop
2131
+ if self.config.use_feedback_hints and tool_name:
2132
+ feedback = self._get_feedback()
2133
+ feedback.record_compression(
2134
+ tool_name=tool_name,
2135
+ original_count=len(items),
2136
+ compressed_count=len(result),
2137
+ strategy=analysis.recommended_strategy.value,
2138
+ tool_signature_hash=tool_signature.structure_hash,
2139
+ )
2140
+
2141
+ # Record telemetry for data flywheel
2142
+ self._record_telemetry(
2143
+ items=items,
2144
+ result=result,
2145
+ analysis=analysis,
2146
+ plan=plan,
2147
+ tool_name=tool_name,
2148
+ )
2149
+
2150
+ # TOIN: Record compression event for cross-user learning
2151
+ try:
2152
+ # Calculate token counts (approximate)
2153
+ original_tokens = len(json.dumps(items, default=str)) // 4
2154
+ compressed_tokens = len(json.dumps(result, default=str)) // 4
2155
+
2156
+ toin.record_compression(
2157
+ tool_signature=tool_signature,
2158
+ original_count=len(items),
2159
+ compressed_count=len(result),
2160
+ original_tokens=original_tokens,
2161
+ compressed_tokens=compressed_tokens,
2162
+ strategy=analysis.recommended_strategy.value,
2163
+ query_context=query_context,
2164
+ items=items, # Pass items for field-level semantic learning
2165
+ )
2166
+ except Exception:
2167
+ # TOIN should never break compression
2168
+ pass
2169
+
2170
+ strategy_info = analysis.recommended_strategy.value
2171
+ if toin_hint_applied:
2172
+ toin_parts = [f"items={toin_hint.max_items}", f"conf={toin_hint.confidence:.2f}"]
2173
+ if toin_recommended_strategy:
2174
+ toin_parts.append(f"strategy={toin_recommended_strategy}")
2175
+ if toin_compression_level and toin_compression_level != "moderate":
2176
+ toin_parts.append(f"level={toin_compression_level}")
2177
+ strategy_info += f"(toin:{','.join(toin_parts)})"
2178
+ elif hints_applied:
2179
+ strategy_info += f"(feedback:{effective_max_items})"
2180
+
2181
+ # Clean up temporary instance variable
2182
+ self._current_field_semantics = None
2183
+ return result, strategy_info, ccr_hash
2184
+
2185
+ except Exception:
2186
+ # Clean up temporary instance variable
2187
+ self._current_field_semantics = None
2188
+ # Re-raise any exceptions (removed finally block since we no longer mutate config)
2189
+ raise
2190
+
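The learned compression level adjusts the item budget with simple arithmetic; a standalone sketch of the branches used above, with illustrative values (the "none" level skips compression entirely, so it never reaches this arithmetic):

def adjust_budget(effective_max: int, level: str, item_count: int) -> int:
    if level == "conservative":
        return max(effective_max, min(50, item_count // 2))  # keep more items
    if level == "aggressive":
        return min(effective_max, 15)                        # keep fewer items
    return effective_max                                     # moderate / default

print(adjust_budget(20, "conservative", 200))  # 50
print(adjust_budget(20, "aggressive", 200))    # 15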
2191
+ def _create_plan(
2192
+ self,
2193
+ analysis: ArrayAnalysis,
2194
+ items: list[dict],
2195
+ query_context: str = "",
2196
+ preserve_fields: list[str] | None = None,
2197
+ effective_max_items: int | None = None,
2198
+ ) -> CompressionPlan:
2199
+ """Create a detailed compression plan using relevance scoring.
2200
+
2201
+ Args:
2202
+ analysis: The array analysis results.
2203
+ items: The items to compress.
2204
+ query_context: Context string from user messages for relevance scoring.
2205
+ preserve_fields: TOIN-learned fields that users commonly retrieve.
2206
+ Items with values in these fields get higher priority.
2207
+ effective_max_items: Thread-safe max items limit (defaults to config value).
2208
+ """
2209
+ # Use provided effective_max_items or fall back to config
2210
+ max_items = (
2211
+ effective_max_items
2212
+ if effective_max_items is not None
2213
+ else self.config.max_items_after_crush
2214
+ )
2215
+
2216
+ plan = CompressionPlan(
2217
+ strategy=analysis.recommended_strategy,
2218
+ constant_fields=analysis.constant_fields if self.config.factor_out_constants else {},
2219
+ )
2220
+
2221
+ # Handle SKIP - keep all items (shouldn't normally reach here)
2222
+ if analysis.recommended_strategy == CompressionStrategy.SKIP:
2223
+ plan.keep_indices = list(range(len(items)))
2224
+ return plan
2225
+
2226
+ if analysis.recommended_strategy == CompressionStrategy.TIME_SERIES:
2227
+ plan = self._plan_time_series(
2228
+ analysis, items, plan, query_context, preserve_fields, max_items
2229
+ )
2230
+
2231
+ elif analysis.recommended_strategy == CompressionStrategy.CLUSTER_SAMPLE:
2232
+ plan = self._plan_cluster_sample(
2233
+ analysis, items, plan, query_context, preserve_fields, max_items
2234
+ )
2235
+
2236
+ elif analysis.recommended_strategy == CompressionStrategy.TOP_N:
2237
+ plan = self._plan_top_n(
2238
+ analysis, items, plan, query_context, preserve_fields, max_items
2239
+ )
2240
+
2241
+ else: # SMART_SAMPLE or NONE
2242
+ plan = self._plan_smart_sample(
2243
+ analysis, items, plan, query_context, preserve_fields, max_items
2244
+ )
2245
+
2246
+ return plan
2247
+
2248
+ def _plan_time_series(
2249
+ self,
2250
+ analysis: ArrayAnalysis,
2251
+ items: list[dict],
2252
+ plan: CompressionPlan,
2253
+ query_context: str = "",
2254
+ preserve_fields: list[str] | None = None,
2255
+ max_items: int | None = None,
2256
+ ) -> CompressionPlan:
2257
+ """Plan compression for time series data.
2258
+
2259
+ Keeps items around change points (anomalies) plus first/last items.
2260
+ Uses STATISTICAL outlier detection for important items.
2261
+ Uses RelevanceScorer for semantic matching of user queries.
2262
+
2263
+ Args:
2264
+ preserve_fields: TOIN-learned fields that users commonly retrieve.
2265
+ Items where query_context matches these field values get priority.
2266
+ max_items: Thread-safe max items limit (defaults to config value).
2267
+ """
2268
+ # Use provided max_items or fall back to config
2269
+ effective_max = max_items if max_items is not None else self.config.max_items_after_crush
2270
+ n = len(items)
2271
+ keep_indices = set()
2272
+
2273
+ # 1. First 3 items
2274
+ for i in range(min(3, n)):
2275
+ keep_indices.add(i)
2276
+
2277
+ # 2. Last 2 items
2278
+ for i in range(max(0, n - 2), n):
2279
+ keep_indices.add(i)
2280
+
2281
+ # 3. Items around change points from numeric fields
2282
+ for stats in analysis.field_stats.values():
2283
+ if stats.change_points:
2284
+ for cp in stats.change_points:
2285
+ # Keep a window around each change point
2286
+ for offset in range(-2, 3):
2287
+ idx = cp + offset
2288
+ if 0 <= idx < n:
2289
+ keep_indices.add(idx)
2290
+
2291
+ # 4. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
2292
+ outlier_indices = _detect_structural_outliers(items)
2293
+ keep_indices.update(outlier_indices)
2294
+
2295
+ # 4b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
2296
+ # This is critical - errors must ALWAYS be preserved regardless of structure
2297
+ error_indices = _detect_error_items_for_preservation(items)
2298
+ keep_indices.update(error_indices)
2299
+
2300
+ # 5. Items matching query anchors (DETERMINISTIC exact match)
2301
+ # Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
2302
+ if query_context:
2303
+ anchors = extract_query_anchors(query_context)
2304
+ for i, item in enumerate(items):
2305
+ if item_matches_anchors(item, anchors):
2306
+ keep_indices.add(i)
2307
+
2308
+ # 6. Items with high relevance to query context (PROBABILISTIC semantic match)
2309
+ if query_context:
2310
+ item_strs = [json.dumps(item, default=str) for item in items]
2311
+ scores = self._scorer.score_batch(item_strs, query_context)
2312
+ for i, score in enumerate(scores):
2313
+ if score.score >= self._relevance_threshold:
2314
+ keep_indices.add(i)
2315
+
2316
+ # 6b. TOIN preserve_fields: boost items where query matches these fields
2317
+ # Note: preserve_fields are SHA256[:8] hashes, use helper to match
2318
+ if preserve_fields and query_context:
2319
+ for i, item in enumerate(items):
2320
+ if _item_has_preserve_field_match(item, preserve_fields, query_context):
2321
+ keep_indices.add(i)
2322
+
2323
+ # Limit to effective_max while ALWAYS preserving outliers and anomalies
2324
+ keep_indices = self._prioritize_indices(keep_indices, items, n, analysis, effective_max)
2325
+
2326
+ plan.keep_indices = sorted(keep_indices)
2327
+ return plan
2328
+
2329
+ def _plan_cluster_sample(
2330
+ self,
2331
+ analysis: ArrayAnalysis,
2332
+ items: list[dict],
2333
+ plan: CompressionPlan,
2334
+ query_context: str = "",
2335
+ preserve_fields: list[str] | None = None,
2336
+ max_items: int | None = None,
2337
+ ) -> CompressionPlan:
2338
+ """Plan compression for clusterable data (like logs).
2339
+
2340
+ Uses clustering plus STATISTICAL outlier detection.
2341
+ Uses RelevanceScorer for semantic matching of user queries.
2342
+
2343
+ Args:
2344
+ preserve_fields: TOIN-learned fields that users commonly retrieve.
2345
+ Items where query_context matches these field values get priority.
2346
+ max_items: Thread-safe max items limit (defaults to config value).
2347
+ """
2348
+ # Use provided max_items or fall back to config
2349
+ effective_max = max_items if max_items is not None else self.config.max_items_after_crush
2350
+ n = len(items)
2351
+ keep_indices = set()
2352
+
2353
+ # 1. First 3 items
2354
+ for i in range(min(3, n)):
2355
+ keep_indices.add(i)
2356
+
2357
+ # 2. Last 2 items
2358
+ for i in range(max(0, n - 2), n):
2359
+ keep_indices.add(i)
2360
+
2361
+ # 3. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
2362
+ outlier_indices = _detect_structural_outliers(items)
2363
+ keep_indices.update(outlier_indices)
2364
+
2365
+ # 3b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
2366
+ # This is critical - errors must ALWAYS be preserved regardless of structure
2367
+ error_indices = _detect_error_items_for_preservation(items)
2368
+ keep_indices.update(error_indices)
2369
+
2370
+ # 4. Cluster by message-like field and keep representatives
2371
+ # Find a high-cardinality string field (likely message field)
2372
+ message_field = None
2373
+ max_uniqueness = 0.0
2374
+ for name, stats in analysis.field_stats.items():
2375
+ if stats.field_type == "string" and stats.unique_ratio > max_uniqueness:
2376
+ # Prefer fields with moderate to high uniqueness (message-like)
2377
+ if stats.unique_ratio > 0.3:
2378
+ message_field = name
2379
+ max_uniqueness = stats.unique_ratio
2380
+
2381
+ if message_field:
2382
+ plan.cluster_field = message_field
2383
+
2384
+ # Simple clustering: group by first 50 chars of message
2385
+ clusters: dict[str, list[int]] = {}
2386
+ for i, item in enumerate(items):
2387
+ msg = str(item.get(message_field, ""))[:50]
2388
+ msg_hash = hashlib.md5(msg.encode()).hexdigest()[:8]
2389
+ if msg_hash not in clusters:
2390
+ clusters[msg_hash] = []
2391
+ clusters[msg_hash].append(i)
2392
+
2393
+ # Keep 1-2 representatives from each cluster
2394
+ for indices in clusters.values():
2395
+ for idx in indices[:2]:
2396
+ keep_indices.add(idx)
2397
+
2398
+ # 5. Items matching query anchors (DETERMINISTIC exact match)
2399
+ # Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
2400
+ if query_context:
2401
+ anchors = extract_query_anchors(query_context)
2402
+ for i, item in enumerate(items):
2403
+ if item_matches_anchors(item, anchors):
2404
+ keep_indices.add(i)
2405
+
2406
+ # 6. Items with high relevance to query context (PROBABILISTIC semantic match)
2407
+ if query_context:
2408
+ item_strs = [json.dumps(item, default=str) for item in items]
2409
+ scores = self._scorer.score_batch(item_strs, query_context)
2410
+ for i, score in enumerate(scores):
2411
+ if score.score >= self._relevance_threshold:
2412
+ keep_indices.add(i)
2413
+
2414
+ # 6b. TOIN preserve_fields: boost items where query matches these fields
2415
+ # Note: preserve_fields are SHA256[:8] hashes, use helper to match
2416
+ if preserve_fields and query_context:
2417
+ for i, item in enumerate(items):
2418
+ if _item_has_preserve_field_match(item, preserve_fields, query_context):
2419
+ keep_indices.add(i)
2420
+
2421
+ # Limit total while ALWAYS preserving outliers and anomalies
2422
+ keep_indices = self._prioritize_indices(keep_indices, items, n, analysis, effective_max)
2423
+
2424
+ plan.keep_indices = sorted(keep_indices)
2425
+ return plan
2426
+
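A standalone sketch of the cluster key used above: the message-like field is truncated to its first 50 characters and hashed, so log lines that differ only in their tail land in the same cluster and only a couple of representatives are kept.

import hashlib

def cluster_key(item: dict, message_field: str) -> str:
    msg = str(item.get(message_field, ""))[:50]
    return hashlib.md5(msg.encode()).hexdigest()[:8]

a = {"msg": "connection timeout to db-1 after 30s, retrying (attempt 1)"}
b = {"msg": "connection timeout to db-1 after 30s, retrying (attempt 7)"}
print(cluster_key(a, "msg") == cluster_key(b, "msg"))  # True: same first 50 chars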
2427
+ def _plan_top_n(
2428
+ self,
2429
+ analysis: ArrayAnalysis,
2430
+ items: list[dict],
2431
+ plan: CompressionPlan,
2432
+ query_context: str = "",
2433
+ preserve_fields: list[str] | None = None,
2434
+ max_items: int | None = None,
2435
+ ) -> CompressionPlan:
2436
+ """Plan compression for scored/ranked data.
2437
+
2438
+ For data with a score/relevance field, that field IS the primary relevance
2439
+ signal. Our internal relevance scoring is SECONDARY - it's used to find
2440
+ potential "needle" items that the original scoring might have missed.
2441
+
2442
+ Strategy:
2443
+ 1. Keep top N by score (the original system's relevance ranking)
2444
+ 2. Add structural outliers (errors, anomalies)
2445
+ 3. Add high-confidence relevance matches (needles the user is looking for)
2446
+
2447
+ Args:
2448
+ preserve_fields: TOIN-learned fields that users commonly retrieve.
2449
+ Items where query_context matches these field values get priority.
2450
+ max_items: Thread-safe max items limit (defaults to config value).
2451
+ """
2452
+ # Use provided max_items or fall back to config
2453
+ effective_max = max_items if max_items is not None else self.config.max_items_after_crush
2454
+
2455
+ # Find score field using STATISTICAL detection (no hardcoded field names)
2456
+ score_field = None
2457
+ max_confidence = 0.0
2458
+ for name, stats in analysis.field_stats.items():
2459
+ is_score, confidence = _detect_score_field_statistically(stats, items)
2460
+ if is_score and confidence > max_confidence:
2461
+ score_field = name
2462
+ max_confidence = confidence
2463
+
2464
+ if not score_field:
2465
+ return self._plan_smart_sample(
2466
+ analysis, items, plan, query_context, preserve_fields, effective_max
2467
+ )
2468
+
2469
+ plan.sort_field = score_field
2470
+ keep_indices = set()
2471
+
2472
+ # 1. TOP N by score FIRST (the primary relevance signal)
2473
+ # The original system's score field is the authoritative ranking
2474
+ scored_items = [(i, item.get(score_field, 0)) for i, item in enumerate(items)]
2475
+ scored_items.sort(key=lambda x: x[1], reverse=True)
2476
+
2477
+ # Reserve slots for outliers
2478
+ top_count = max(0, effective_max - 3)
2479
+ for idx, _ in scored_items[:top_count]:
2480
+ keep_indices.add(idx)
2481
+
2482
+ # 2. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
2483
+ outlier_indices = _detect_structural_outliers(items)
2484
+ keep_indices.update(outlier_indices)
2485
+
2486
+ # 2b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
2487
+ # This is critical - errors must ALWAYS be preserved regardless of structure
2488
+ error_indices = _detect_error_items_for_preservation(items)
2489
+ keep_indices.update(error_indices)
2490
+
2491
+ # 3. Items matching query anchors (DETERMINISTIC exact match) - ADDITIVE
2492
+ # Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
2493
+ # These are ALWAYS preserved since they represent explicit user intent
2494
+ if query_context:
2495
+ anchors = extract_query_anchors(query_context)
2496
+ for i, item in enumerate(items):
2497
+ if i not in keep_indices and item_matches_anchors(item, anchors):
2498
+ keep_indices.add(i)
2499
+
2500
+ # 4. HIGH-CONFIDENCE relevance matches (potential needles) - ADDITIVE only
2501
+ # Only add items that are NOT already in top N but match the query strongly
2502
+ # Use a higher threshold (0.5) since the score field already captures relevance
2503
+ if query_context:
2504
+ item_strs = [json.dumps(item, default=str) for item in items]
2505
+ scores = self._scorer.score_batch(item_strs, query_context)
2506
+ # Higher threshold and limit count to avoid adding everything
2507
+ high_threshold = max(0.5, self._relevance_threshold * 2)
2508
+ added_count = 0
2509
+ max_relevance_adds = 3 # Limit additional relevance matches
2510
+ for i, score in enumerate(scores):
2511
+ if i not in keep_indices and score.score >= high_threshold:
2512
+ keep_indices.add(i)
2513
+ added_count += 1
2514
+ if added_count >= max_relevance_adds:
2515
+ break
2516
+
2517
+ # 4b. TOIN preserve_fields: boost items where query matches these fields
2518
+ # Note: preserve_fields are SHA256[:8] hashes, use helper to match
2519
+ if preserve_fields and query_context:
2520
+ for i, item in enumerate(items):
2521
+ if i not in keep_indices: # Only add if not already kept
2522
+ if _item_has_preserve_field_match(item, preserve_fields, query_context):
2523
+ keep_indices.add(i)
2524
+
2525
+ plan.keep_count = len(keep_indices)
2526
+ plan.keep_indices = sorted(keep_indices)
2527
+ return plan
2528
+
2529
+ def _plan_smart_sample(
2530
+ self,
2531
+ analysis: ArrayAnalysis,
2532
+ items: list[dict],
2533
+ plan: CompressionPlan,
2534
+ query_context: str = "",
2535
+ preserve_fields: list[str] | None = None,
2536
+ max_items: int | None = None,
2537
+ ) -> CompressionPlan:
2538
+ """Plan smart statistical sampling using STATISTICAL detection.
2539
+
2540
+ Always keeps:
2541
+ - First K items (default 3)
2542
+ - Last K items (default 2)
2543
+ - Structural outliers (items with rare fields or rare status values)
2544
+ - Anomalous numeric items (> 2 std from mean)
2545
+ - Items around change points
2546
+ - Items with high relevance to query context (via RelevanceScorer)
2547
+
2548
+ Uses STATISTICAL detection instead of hardcoded keywords.
2549
+
2550
+ Args:
2551
+ preserve_fields: TOIN-learned fields that users commonly retrieve.
2552
+ Items where query_context matches these field values get priority.
2553
+ max_items: Thread-safe max items limit (defaults to config value).
2554
+ """
2555
+ # Use provided max_items or fall back to config
2556
+ effective_max = max_items if max_items is not None else self.config.max_items_after_crush
2557
+
2558
+ n = len(items)
2559
+ keep_indices = set()
2560
+
2561
+ # 1. First K items (default 3)
2562
+ for i in range(min(3, n)):
2563
+ keep_indices.add(i)
2564
+
2565
+ # 2. Last K items (default 2)
2566
+ for i in range(max(0, n - 2), n):
2567
+ keep_indices.add(i)
2568
+
2569
+ # 3. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
2570
+ outlier_indices = _detect_structural_outliers(items)
2571
+ keep_indices.update(outlier_indices)
2572
+
2573
+ # 3b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
2574
+ # This is critical - errors must ALWAYS be preserved regardless of structure
2575
+ error_indices = _detect_error_items_for_preservation(items)
2576
+ keep_indices.update(error_indices)
2577
+
2578
+ # 4. Anomalous numeric items (> 2 std from mean)
2579
+ for name, stats in analysis.field_stats.items():
2580
+ if stats.field_type == "numeric" and stats.mean_val is not None and stats.variance:
2581
+ std = stats.variance**0.5
2582
+ if std > 0:
2583
+ threshold = self.config.variance_threshold * std
2584
+ for i, item in enumerate(items):
2585
+ val = item.get(name)
2586
+ if isinstance(val, (int, float)):
2587
+ if abs(val - stats.mean_val) > threshold:
2588
+ keep_indices.add(i)
2589
+
2590
+ # 5. Items around change points (if detected)
2591
+ if self.config.preserve_change_points:
2592
+ for stats in analysis.field_stats.values():
2593
+ if stats.change_points:
2594
+ for cp in stats.change_points:
2595
+ # Keep items around change point
2596
+ for offset in range(-1, 2):
2597
+ idx = cp + offset
2598
+ if 0 <= idx < n:
2599
+ keep_indices.add(idx)
2600
+
2601
+ # 6. Items matching query anchors (DETERMINISTIC exact match)
2602
+ # Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
2603
+ if query_context:
2604
+ anchors = extract_query_anchors(query_context)
2605
+ for i, item in enumerate(items):
2606
+ if item_matches_anchors(item, anchors):
2607
+ keep_indices.add(i)
2608
+
2609
+ # 7. Items with high relevance to query context (PROBABILISTIC semantic match)
2610
+ if query_context:
2611
+ item_strs = [json.dumps(item, default=str) for item in items]
2612
+ scores = self._scorer.score_batch(item_strs, query_context)
2613
+ for i, score in enumerate(scores):
2614
+ if score.score >= self._relevance_threshold:
2615
+ keep_indices.add(i)
2616
+
2617
+ # 7b. TOIN preserve_fields: boost items where query matches these fields
2618
+ # Note: preserve_fields are SHA256[:8] hashes, use helper to match
2619
+ if preserve_fields and query_context:
2620
+ for i, item in enumerate(items):
2621
+ if _item_has_preserve_field_match(item, preserve_fields, query_context):
2622
+ keep_indices.add(i)
2623
+
2624
+ # Limit to effective_max while ALWAYS preserving outliers and anomalies
2625
+ keep_indices = self._prioritize_indices(keep_indices, items, n, analysis, effective_max)
2626
+
2627
+ plan.keep_indices = sorted(keep_indices)
2628
+ return plan
2629
+
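A worked example of the numeric-anomaly rule in step 4 above: an item is kept when its value deviates from the field mean by more than variance_threshold standard deviations. The threshold value below is an assumption matching the "> 2 std" description in the docstring; the real default lives in SmartCrusherConfig.

values = [20, 21, 19, 22, 20, 250]            # one obvious latency spike
mean = sum(values) / len(values)
variance = sum((v - mean) ** 2 for v in values) / len(values)
std = variance ** 0.5
variance_threshold = 2.0                      # assumed config value
anomalies = [i for i, v in enumerate(values) if abs(v - mean) > variance_threshold * std]
print(anomalies)                              # [5]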
2630
+ def _execute_plan(
2631
+ self, plan: CompressionPlan, items: list[dict], analysis: ArrayAnalysis
2632
+ ) -> list:
2633
+ """Execute a compression plan and return crushed array.
2634
+
2635
+ SCHEMA-PRESERVING: Returns only items from the original array.
2636
+ No wrappers, no generated text, no metadata keys.
2637
+ """
2638
+ result = []
2639
+
2640
+ # Return only the kept items, preserving original schema
2641
+ for idx in sorted(plan.keep_indices):
2642
+ if 0 <= idx < len(items):
2643
+ # Copy item unchanged - no modifications to schema
2644
+ result.append(items[idx].copy())
2645
+
2646
+ return result
2647
+
2648
+
2649
+ def smart_crush_tool_output(
2650
+ content: str,
2651
+ config: SmartCrusherConfig | None = None,
2652
+ ccr_config: CCRConfig | None = None,
2653
+ ) -> tuple[str, bool, str]:
2654
+ """
2655
+ Convenience function to smart-crush a single tool output.
2656
+
2657
+ NOTE: CCR markers are DISABLED by default in this convenience function
2658
+ to maintain backward compatibility (output remains valid JSON).
2659
+ To enable CCR markers, pass a CCRConfig with inject_retrieval_marker=True.
2660
+
2661
+ Args:
2662
+ content: The tool output content (JSON string).
2663
+ config: Optional SmartCrusher configuration.
2664
+ ccr_config: Optional CCR (Compress-Cache-Retrieve) configuration.
2665
+ By default, CCR is enabled (caching) but markers are disabled.
2666
+
2667
+ Returns:
2668
+ Tuple of (crushed_content, was_modified, analysis_info).
2669
+ """
2670
+ cfg = config or SmartCrusherConfig()
2671
+
2672
+ # Default: CCR enabled for caching, but markers disabled for clean JSON output
2673
+ if ccr_config is None:
2674
+ ccr_cfg = CCRConfig(
2675
+ enabled=True, # Still cache for retrieval
2676
+ inject_retrieval_marker=False, # Don't break JSON output
2677
+ )
2678
+ else:
2679
+ ccr_cfg = ccr_config
2680
+
2681
+ crusher = SmartCrusher(cfg, ccr_config=ccr_cfg)
2682
+ return crusher._smart_crush_content(content)
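A hedged usage sketch for the convenience function above. The import path is assumed, but the signature and the (crushed_content, was_modified, analysis_info) return tuple are as defined in this file; with the default CCRConfig shown above, the output remains valid JSON.

import json

from headroom.transforms.smart_crusher import smart_crush_tool_output  # path assumed

rows = [{"step": i, "status": "ok"} for i in range(100)]
crushed, was_modified, info = smart_crush_tool_output(json.dumps(rows))
if was_modified:
    print(info)          # e.g. "smart_sample(100->20)"
    json.loads(crushed)  # still parses: retrieval markers are off by default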