headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/transforms/smart_crusher.py
@@ -0,0 +1,2682 @@
"""Smart statistical tool output compression for Headroom SDK.

This module provides intelligent JSON compression based on statistical analysis
rather than fixed rules. It analyzes data patterns and applies optimal compression
strategies to maximize token reduction while preserving important information.

SCOPE: SmartCrusher handles JSON arrays only. Non-JSON content (plain text,
search results, logs, code, diffs) passes through UNCHANGED.

TEXT COMPRESSION IS OPT-IN: For text-based content, Headroom provides standalone
utilities that applications can use explicitly:
- SearchCompressor: For grep/ripgrep output (file:line:content format)
- LogCompressor: For build/test logs (pytest, npm, cargo output)
- TextCompressor: For generic plain text with anchor preservation

Applications should decide when and how to use text compression based on their
specific needs. This design prevents lossy text compression from being applied
automatically, which could lose important context in coding tasks.

SCHEMA-PRESERVING: Output contains only items from the original array.
No wrappers, no generated text, no metadata keys. This ensures downstream
tools and parsers work unchanged.

Safe V1 Compression Recipe - Always keeps:
- First K items (default 3)
- Last K items (default 2)
- Error items (containing 'error', 'exception', 'failed', 'critical')
- Anomalous numeric items (> 2 std from mean)
- Items around detected change points
- Top-K by score if score field present
- Items with high relevance score to user query (via RelevanceScorer)

Key Features:
- RelevanceScorer: ML-powered or BM25-based relevance matching (replaces regex)
- Variance-based change point detection (preserve anomalies)
- Error item detection (never lose error messages)
- Pattern detection (time series, logs, search results)
- Strategy selection based on data characteristics
"""

from __future__ import annotations

import hashlib
import json
import logging
import math
import re
import statistics
import threading
from collections import Counter
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

from ..cache.compression_feedback import CompressionFeedback, get_compression_feedback
from ..cache.compression_store import CompressionStore, get_compression_store
from ..config import CCRConfig, RelevanceScorerConfig, TransformResult
from ..relevance import RelevanceScorer, create_scorer
from ..telemetry import TelemetryCollector, ToolSignature, get_telemetry_collector
from ..telemetry.models import FieldSemantics
from ..telemetry.toin import ToolIntelligenceNetwork, get_toin
from ..tokenizer import Tokenizer
from ..utils import (
    compute_short_hash,
    create_tool_digest_marker,
    deep_copy_messages,
    safe_json_dumps,
    safe_json_loads,
)
from .base import Transform

logger = logging.getLogger(__name__)

# Legacy patterns for backwards compatibility (extract_query_anchors)
_UUID_PATTERN = re.compile(
    r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
)
_NUMERIC_ID_PATTERN = re.compile(r"\b\d{4,}\b")  # 4+ digit numbers (likely IDs)
_HOSTNAME_PATTERN = re.compile(
    r"\b[a-zA-Z0-9][-a-zA-Z0-9]*\.[a-zA-Z0-9][-a-zA-Z0-9]*(?:\.[a-zA-Z]{2,})?\b"
)
_QUOTED_STRING_PATTERN = re.compile(r"['\"]([^'\"]{1,50})['\"]")  # Short quoted strings
_EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b")


def extract_query_anchors(text: str) -> set[str]:
    """Extract query anchors from user text (legacy regex-based method).

    DEPRECATED: Use RelevanceScorer.score_batch() for better semantic matching.

    Query anchors are identifiers or values that the user is likely searching for.
    When crushing tool outputs, items matching these anchors should be preserved.

    Extracts:
    - UUIDs (e.g., "550e8400-e29b-41d4-a716-446655440000")
    - Numeric IDs (4+ digits, e.g., "12345", "1001234")
    - Hostnames (e.g., "api.example.com", "server-01.prod")
    - Quoted strings (e.g., 'Alice', "error_code")
    - Email addresses (e.g., "user@example.com")

    Args:
        text: User message text to extract anchors from.

    Returns:
        Set of anchor strings (lowercased for case-insensitive matching).
    """
    anchors: set[str] = set()

    if not text:
        return anchors

    # UUIDs
    for match in _UUID_PATTERN.findall(text):
        anchors.add(match.lower())

    # Numeric IDs
    for match in _NUMERIC_ID_PATTERN.findall(text):
        anchors.add(match)

    # Hostnames
    for match in _HOSTNAME_PATTERN.findall(text):
        # Filter out common false positives
        if match.lower() not in ("e.g", "i.e", "etc."):
            anchors.add(match.lower())

    # Quoted strings
    for match in _QUOTED_STRING_PATTERN.findall(text):
        if len(match.strip()) >= 2:  # Skip very short matches
            anchors.add(match.lower())

    # Email addresses
    for match in _EMAIL_PATTERN.findall(text):
        anchors.add(match.lower())

    return anchors


def item_matches_anchors(item: dict, anchors: set[str]) -> bool:
    """Check if an item matches any query anchors (legacy method).

    DEPRECATED: Use RelevanceScorer for better matching.

    Args:
        item: Dictionary item from tool output.
        anchors: Set of anchor strings to match.

    Returns:
        True if any anchor is found in the item's string representation.
    """
    if not anchors:
        return False

    item_str = str(item).lower()
    return any(anchor in item_str for anchor in anchors)


def _hash_field_name(field_name: str) -> str:
    """Hash a field name to match TOIN's anonymized preserve_fields.

    TOIN stores field names as SHA256[:8] hashes for privacy.
    This function produces the same hash format.
    """
    return hashlib.sha256(field_name.encode()).hexdigest()[:8]


def _get_preserve_field_values(
    item: dict,
    preserve_field_hashes: list[str],
) -> list[tuple[str, Any]]:
    """Get values from item fields that match TOIN's preserve_field hashes.

    TOIN stores preserve_fields as hashed field names (SHA256[:8]).
    This function iterates over item fields, hashes each, and returns
    matching field names and values.

    Args:
        item: Dictionary item from tool output.
        preserve_field_hashes: List of SHA256[:8] hashed field names from TOIN.

    Returns:
        List of (field_name, value) tuples for fields that match.
    """
    if not preserve_field_hashes or not item:
        return []

    # Convert preserve_fields to set for O(1) lookup
    hash_set = set(preserve_field_hashes)

    matches = []
    for field_name, value in item.items():
        field_hash = _hash_field_name(field_name)
        if field_hash in hash_set:
            matches.append((field_name, value))

    return matches


def _item_has_preserve_field_match(
    item: dict,
    preserve_field_hashes: list[str],
    query_context: str,
) -> bool:
    """Check if item has a preserve_field value that matches query context.

    Args:
        item: Dictionary item from tool output.
        preserve_field_hashes: List of SHA256[:8] hashed field names from TOIN.
        query_context: User's query to match against field values.

    Returns:
        True if any preserve_field value matches the query context.
    """
    if not query_context:
        return False

    query_lower = query_context.lower()

    for _field_name, value in _get_preserve_field_values(item, preserve_field_hashes):
        if value is not None:
            value_str = str(value).lower()
            if value_str in query_lower or query_lower in value_str:
                return True

    return False


class CompressionStrategy(Enum):
    """Compression strategies based on data patterns."""

    NONE = "none"  # No compression needed
    SKIP = "skip"  # Explicitly skip - not safe to crush
    TIME_SERIES = "time_series"  # Keep change points, summarize stable
    CLUSTER_SAMPLE = "cluster"  # Dedupe similar items
    TOP_N = "top_n"  # Keep highest scored items
    SMART_SAMPLE = "smart_sample"  # Statistical sampling with constants


# =====================================================================
# STATISTICAL FIELD DETECTION (replaces hardcoded string patterns)
# =====================================================================
# Instead of matching field names like "id", "score", "error", we use
# statistical and structural properties of the data to detect field types.


def _is_uuid_format(value: str) -> bool:
    """Check if a string looks like a UUID (structural pattern)."""
    if not isinstance(value, str) or len(value) != 36:
        return False
    # UUID format: 8-4-4-4-12 hex chars
    parts = value.split("-")
    if len(parts) != 5:
        return False
    expected_lens = [8, 4, 4, 4, 12]
    for part, expected_len in zip(parts, expected_lens):
        if len(part) != expected_len:
            return False
        if not all(c in "0123456789abcdefABCDEF" for c in part):
            return False
    return True


def _calculate_string_entropy(s: str) -> float:
    """Calculate Shannon entropy of a string, normalized to [0, 1].

    High entropy (>0.7) suggests random/ID-like content.
    Low entropy (<0.3) suggests repetitive/predictable content.
    """
    if not s or len(s) < 2:
        return 0.0

    # Count character frequencies
    freq: dict[str, int] = {}
    for c in s:
        freq[c] = freq.get(c, 0) + 1

    # Calculate entropy
    import math

    entropy = 0.0
    length = len(s)
    for count in freq.values():
        p = count / length
        if p > 0:
            entropy -= p * math.log2(p)

    # Normalize by max possible entropy for this length
    max_entropy = math.log2(min(len(freq), length))
    if max_entropy > 0:
        return entropy / max_entropy
    return 0.0


def _detect_sequential_pattern(values: list[Any], check_order: bool = True) -> bool:
    """Detect if numeric values form a sequential pattern (like IDs: 1,2,3,...).

    Returns True if values appear to be auto-incrementing or sequential.

    Args:
        values: List of values to check.
        check_order: If True, also check if values are in ascending order in the array.
            Score fields are often sorted descending, while IDs are ascending.
    """
    if len(values) < 5:
        return False

    # Get numeric values
    nums = []
    for v in values:
        if isinstance(v, (int, float)) and not isinstance(v, bool):
            nums.append(v)
        elif isinstance(v, str):
            try:
                nums.append(int(v))
            except ValueError:
                pass

    if len(nums) < 5:
        return False

    # Check if sorted values form a near-sequence
    sorted_nums = sorted(nums)
    diffs = [sorted_nums[i + 1] - sorted_nums[i] for i in range(len(sorted_nums) - 1)]

    if not diffs:
        return False

    # If most differences are 1 (or small constant), it's sequential
    avg_diff = sum(diffs) / len(diffs)
    if 0.5 <= avg_diff <= 2.0:
        # Check consistency - sequential IDs have consistent spacing
        consistent_count = sum(1 for d in diffs if 0.5 <= d <= 2.0)
        is_sequential = consistent_count / len(diffs) > 0.8

        # Additional check: IDs are typically in ASCENDING order in the array
        # Scores sorted by relevance are typically in DESCENDING order
        if check_order and is_sequential:
            # Check if original order is ascending (like IDs)
            ascending_count = sum(1 for i in range(len(nums) - 1) if nums[i] <= nums[i + 1])
            is_ascending = ascending_count / (len(nums) - 1) > 0.7
            return is_ascending  # Only flag as sequential if ascending (ID-like)

        return is_sequential

    return False


def _detect_id_field_statistically(stats: FieldStats, values: list[Any]) -> tuple[bool, float]:
    """Detect if a field is an ID field using statistical properties.

    Returns (is_id_field, confidence).

    ID fields have:
    - Very high uniqueness (>0.95)
    - Sequential numeric pattern OR UUID format OR high entropy strings
    """
    # Must have high uniqueness
    if stats.unique_ratio < 0.9:
        return False, 0.0

    confidence = 0.0

    # Check for UUID format (structural detection)
    if stats.field_type == "string":
        sample_values = [v for v in values[:20] if isinstance(v, str)]
        uuid_count = sum(1 for v in sample_values if _is_uuid_format(v))
        if sample_values and uuid_count / len(sample_values) > 0.8:
            return True, 0.95

        # Check for high entropy (random string IDs)
        if sample_values:
            avg_entropy = sum(_calculate_string_entropy(v) for v in sample_values) / len(
                sample_values
            )
            if avg_entropy > 0.7 and stats.unique_ratio > 0.95:
                confidence = 0.8
                return True, confidence

    # Check for sequential numeric pattern
    if stats.field_type == "numeric":
        if _detect_sequential_pattern(values) and stats.unique_ratio > 0.95:
            return True, 0.9

        # High uniqueness numeric with high range suggests ID
        if stats.min_val is not None and stats.max_val is not None:
            value_range = stats.max_val - stats.min_val
            if value_range > 0 and stats.unique_ratio > 0.95:
                return True, 0.85

    # Very high uniqueness alone is a signal (even without other patterns)
    if stats.unique_ratio > 0.98:
        return True, 0.7

    return False, 0.0


def _detect_score_field_statistically(stats: FieldStats, items: list[dict]) -> tuple[bool, float]:
    """Detect if a field is a score/ranking field using statistical properties.

    Returns (is_score_field, confidence).

    Score fields have:
    - Numeric type
    - Bounded range (0-1, 0-10, 0-100, or similar)
    - NOT sequential (unlike IDs)
    - Often the data appears sorted by this field (descending)
    """
    if stats.field_type != "numeric":
        return False, 0.0

    if stats.min_val is None or stats.max_val is None:
        return False, 0.0

    confidence = 0.0

    # Check for bounded range typical of scores
    stats.max_val - stats.min_val
    min_val, max_val = stats.min_val, stats.max_val

    # Common score ranges: [0,1], [0,10], [0,100], [-1,1], [0,5]
    is_bounded = False
    if 0 <= min_val <= 1 and 0 <= max_val <= 1:  # [0,1] range
        is_bounded = True
        confidence += 0.4
    elif 0 <= min_val <= 10 and 0 <= max_val <= 10:  # [0,10] range
        is_bounded = True
        confidence += 0.3
    elif 0 <= min_val <= 100 and 0 <= max_val <= 100:  # [0,100] range
        is_bounded = True
        confidence += 0.25
    elif -1 <= min_val and max_val <= 1:  # [-1,1] range
        is_bounded = True
        confidence += 0.35

    if not is_bounded:
        return False, 0.0

    # Should NOT be sequential (IDs are sequential, scores are not)
    sample_values = [item.get(stats.name) for item in items[:50] if stats.name in item]
    if _detect_sequential_pattern(sample_values):
        return False, 0.0

    # Check if data appears sorted by this field (descending = relevance sorted)
    # Filter out NaN/Inf which break comparisons
    values_in_order: list[float] = []
    for item in items:
        if stats.name in item:
            val = item.get(stats.name)
            if isinstance(val, (int, float)) and math.isfinite(val):
                values_in_order.append(float(val))
    if len(values_in_order) >= 5:
        # Check for descending sort
        descending_count = sum(
            1
            for i in range(len(values_in_order) - 1)
            if values_in_order[i] >= values_in_order[i + 1]
        )
        if descending_count / (len(values_in_order) - 1) > 0.7:
            confidence += 0.3

    # Score fields often have floating point values
    # Filter out NaN/Inf which can't be converted to int
    float_count = sum(
        1 for v in values_in_order[:20] if isinstance(v, float) and math.isfinite(v) and v != int(v)
    )
    if float_count > len(values_in_order[:20]) * 0.3:
        confidence += 0.1

    return confidence >= 0.4, min(confidence, 0.95)


def _detect_structural_outliers(items: list[dict]) -> list[int]:
    """Detect items that are structural outliers (error-like items).

    Instead of looking for "error" keywords, we detect:
    1. Items with extra fields that others don't have
    2. Items with rare status/state values
    3. Items with significantly different structure

    Returns indices of outlier items.
    """
    if len(items) < 5:
        return []

    outlier_indices: list[int] = []

    # 1. Detect items with extra fields
    # Find the "common" field set (fields present in >80% of items)
    field_counts: dict[str, int] = {}
    for item in items:
        if isinstance(item, dict):
            for key in item.keys():
                field_counts[key] = field_counts.get(key, 0) + 1

    n = len(items)
    common_fields = {k for k, v in field_counts.items() if v >= n * 0.8}
    rare_fields = {k for k, v in field_counts.items() if v < n * 0.2}

    for i, item in enumerate(items):
        if not isinstance(item, dict):
            continue

        item_fields = set(item.keys())

        # Has rare fields that most items don't have
        has_rare = bool(item_fields & rare_fields)
        if has_rare:
            outlier_indices.append(i)
            continue

    # 2. Detect rare status/state values
    # Find fields that look like status fields (low cardinality, categorical)
    status_outliers = _detect_rare_status_values(items, common_fields)
    outlier_indices.extend(status_outliers)

    return list(set(outlier_indices))


def _detect_rare_status_values(items: list[dict], common_fields: set[str]) -> list[int]:
    """Detect items with rare values in status-like fields.

    A status field has low cardinality (few distinct values).
    If 95%+ have the same value, items with different values are interesting.
    """
    outlier_indices: list[int] = []

    # Find potential status fields (low cardinality)
    for field_name in common_fields:
        values = [
            item.get(field_name) for item in items if isinstance(item, dict) and field_name in item
        ]

        # Skip if too few values or non-hashable
        try:
            unique_values = {str(v) for v in values if v is not None}
        except Exception:
            continue

        # Status field = low cardinality (2-10 distinct values)
        if not (2 <= len(unique_values) <= 10):
            continue

        # Count value frequencies
        value_counts: dict[str, int] = {}
        for v in values:
            key = str(v) if v is not None else "__none__"
            value_counts[key] = value_counts.get(key, 0) + 1

        # Find the dominant value
        if not value_counts:
            continue

        max_count = max(value_counts.values())
        total = len(values)

        # If one value dominates (>90%), others are interesting
        if max_count >= total * 0.9:
            dominant_value = max(value_counts.keys(), key=lambda k: value_counts[k])

            for i, item in enumerate(items):
                if not isinstance(item, dict) or field_name not in item:
                    continue
                item_value = str(item[field_name]) if item[field_name] is not None else "__none__"
                if item_value != dominant_value:
                    outlier_indices.append(i)

    return outlier_indices


# Error keywords for PRESERVATION guarantee (not crushability detection)
# This is for the quality guarantee: "ALL error items are ALWAYS preserved"
# regardless of how common they are. Used in _prioritize_indices().
_ERROR_KEYWORDS_FOR_PRESERVATION = frozenset(
    {
        "error",
        "exception",
        "failed",
        "failure",
        "critical",
        "fatal",
        "crash",
        "panic",
        "abort",
        "timeout",
        "denied",
        "rejected",
    }
)


def _detect_error_items_for_preservation(items: list[dict]) -> list[int]:
    """Detect items containing error keywords for PRESERVATION guarantee.

    This is NOT for crushability analysis - it's for ensuring ALL error items
    are retained during compression. The quality guarantee is that error items
    are NEVER dropped, even if errors are common in the dataset.

    Uses keywords because error semantics are well-defined across domains.
    """
    error_indices: list[int] = []

    for i, item in enumerate(items):
        if not isinstance(item, dict):
            continue

        # Serialize item to check all content
        try:
            item_str = json.dumps(item).lower()
        except Exception:
            continue

        # Check if any error keyword is present
        for keyword in _ERROR_KEYWORDS_FOR_PRESERVATION:
            if keyword in item_str:
                error_indices.append(i)
                break

    return error_indices


def _detect_items_by_learned_semantics(
    items: list[dict],
    field_semantics: dict[str, FieldSemantics],
) -> list[int]:
    """Detect items with important values based on learned field semantics.

    This is the TOIN Evolution integration - uses learned field semantic types
    to identify items that should be preserved during compression.

    Key insight: Instead of hardcoded patterns, we learn from user behavior
    which field values are actually important (e.g., error indicators, rare
    status values, identifiers that get queried).

    Args:
        items: List of items to analyze.
        field_semantics: Learned field semantics from TOIN (field_hash -> FieldSemantics).

    Returns:
        List of indices for items containing important values.
    """
    if not field_semantics or not items:
        return []

    important_indices: list[int] = []

    # Build a quick lookup for field_hash -> FieldSemantics
    # Pre-filter to fields with sufficient confidence
    confident_semantics = {
        fh: fs
        for fh, fs in field_semantics.items()
        if fs.confidence >= 0.3 and fs.inferred_type != "unknown"
    }

    if not confident_semantics:
        return []

    for i, item in enumerate(items):
        if not isinstance(item, dict):
            continue

        for field_name, value in item.items():
            # Hash the field name to match TOIN's format
            field_hash = hashlib.sha256(field_name.encode()).hexdigest()[:8]

            if field_hash not in confident_semantics:
                continue

            field_sem = confident_semantics[field_hash]

            # Hash the value to check importance
            if value is None:
                value_canonical = "null"
            elif isinstance(value, bool):
                value_canonical = "true" if value else "false"
            elif isinstance(value, (int, float)):
                value_canonical = str(value)
            elif isinstance(value, str):
                value_canonical = value
            elif isinstance(value, (list, dict)):
                try:
                    value_canonical = json.dumps(value, sort_keys=True, default=str)
                except (TypeError, ValueError):
                    value_canonical = str(value)
            else:
                value_canonical = str(value)

            value_hash = hashlib.sha256(value_canonical.encode()).hexdigest()[:8]

            # Check if this value is important based on learned semantics
            if field_sem.is_value_important(value_hash):
                important_indices.append(i)
                break  # Only need to mark item once

    return important_indices


@dataclass
class CrushabilityAnalysis:
    """Analysis of whether an array is safe to crush.

    The key insight: if we don't have a reliable SIGNAL to determine
    which items are important, we should NOT crush at all.

    Signals include:
    - Score/rank fields (search results)
    - Error keywords (logs)
    - Numeric anomalies (metrics)
    - Low uniqueness (repetitive data where sampling is representative)

    High variability + No signal = DON'T CRUSH
    """

    crushable: bool
    confidence: float  # 0.0 to 1.0
    reason: str
    signals_present: list[str] = field(default_factory=list)
    signals_absent: list[str] = field(default_factory=list)

    # Detailed metrics
    has_id_field: bool = False
    id_uniqueness: float = 0.0
    avg_string_uniqueness: float = 0.0
    has_score_field: bool = False
    error_item_count: int = 0
    anomaly_count: int = 0


@dataclass
class FieldStats:
    """Statistics for a single field across array items."""

    name: str
    field_type: str  # "numeric", "string", "boolean", "object", "array", "null"
    count: int
    unique_count: int
    unique_ratio: float
    is_constant: bool
    constant_value: Any = None

    # Numeric-specific stats
    min_val: float | None = None
    max_val: float | None = None
    mean_val: float | None = None
    variance: float | None = None
    change_points: list[int] = field(default_factory=list)

    # String-specific stats
    avg_length: float | None = None
    top_values: list[tuple[str, int]] = field(default_factory=list)


@dataclass
class ArrayAnalysis:
    """Complete analysis of an array."""

    item_count: int
    field_stats: dict[str, FieldStats]
    detected_pattern: str  # "time_series", "logs", "search_results", "generic"
    recommended_strategy: CompressionStrategy
    constant_fields: dict[str, Any]
    estimated_reduction: float
    crushability: CrushabilityAnalysis | None = None  # Whether it's safe to crush


@dataclass
class CompressionPlan:
    """Plan for how to compress an array."""

    strategy: CompressionStrategy
    keep_indices: list[int] = field(default_factory=list)
    constant_fields: dict[str, Any] = field(default_factory=dict)
    summary_ranges: list[tuple[int, int, dict]] = field(default_factory=list)
    cluster_field: str | None = None
    sort_field: str | None = None
    keep_count: int = 10


@dataclass
class CrushResult:
    """Result from SmartCrusher.crush() method.

    Used by ContentRouter when routing JSON arrays to SmartCrusher.
    """

    compressed: str
    original: str
    was_modified: bool
    strategy: str = "passthrough"


@dataclass
class SmartCrusherConfig:
    """Configuration for smart crusher.

    SCHEMA-PRESERVING: Output contains only items from the original array.
    No wrappers, no generated text, no metadata keys.
    """

    enabled: bool = True
    min_items_to_analyze: int = 5  # Don't analyze tiny arrays
    min_tokens_to_crush: int = 200  # Only crush if > N tokens
    variance_threshold: float = 2.0  # Std devs for change point detection
    uniqueness_threshold: float = 0.1  # Below this = nearly constant
    similarity_threshold: float = 0.8  # For clustering similar strings
    max_items_after_crush: int = 15  # Target max items in output
    preserve_change_points: bool = True
    factor_out_constants: bool = False  # Disabled - preserves original schema
    include_summaries: bool = False  # Disabled - no generated text

    # Feedback loop integration
    use_feedback_hints: bool = True  # Use learned patterns to adjust compression

    # LOW FIX #21: Make TOIN confidence threshold configurable
    # Minimum confidence required to apply TOIN recommendations
    toin_confidence_threshold: float = 0.5


class SmartAnalyzer:
    """Analyzes JSON arrays to determine optimal compression strategy."""

    def __init__(self, config: SmartCrusherConfig | None = None):
        self.config = config or SmartCrusherConfig()

    def analyze_array(self, items: list[dict]) -> ArrayAnalysis:
        """Perform complete statistical analysis of an array."""
        if not items or not isinstance(items[0], dict):
            return ArrayAnalysis(
                item_count=len(items) if items else 0,
                field_stats={},
                detected_pattern="generic",
                recommended_strategy=CompressionStrategy.NONE,
                constant_fields={},
                estimated_reduction=0.0,
            )

        # Analyze each field
        field_stats = {}
        all_keys: set[str] = set()
        for item in items:
            if isinstance(item, dict):
                all_keys.update(item.keys())

        for key in all_keys:
            field_stats[key] = self._analyze_field(key, items)

        # Detect pattern
        pattern = self._detect_pattern(field_stats, items)

        # Extract constants
        constant_fields = {k: v.constant_value for k, v in field_stats.items() if v.is_constant}

        # CRITICAL: Analyze crushability BEFORE selecting strategy
        crushability = self.analyze_crushability(items, field_stats)

        # Select strategy (respects crushability)
        strategy = self._select_strategy(field_stats, pattern, len(items), crushability)

        # Estimate reduction (0 if not crushable)
        if strategy == CompressionStrategy.SKIP:
            reduction = 0.0
        else:
            reduction = self._estimate_reduction(field_stats, strategy, len(items))

        return ArrayAnalysis(
            item_count=len(items),
            field_stats=field_stats,
            detected_pattern=pattern,
            recommended_strategy=strategy,
            constant_fields=constant_fields,
            estimated_reduction=reduction,
            crushability=crushability,
        )

    def _analyze_field(self, key: str, items: list[dict]) -> FieldStats:
        """Analyze a single field across all items."""
        values = [item.get(key) for item in items if isinstance(item, dict)]
        non_null_values = [v for v in values if v is not None]

        if not non_null_values:
            return FieldStats(
                name=key,
                field_type="null",
                count=len(values),
                unique_count=0,
                unique_ratio=0.0,
                is_constant=True,
                constant_value=None,
            )

        # Determine type from first non-null value
        first_val = non_null_values[0]
        if isinstance(first_val, bool):
            field_type = "boolean"
        elif isinstance(first_val, (int, float)):
            field_type = "numeric"
        elif isinstance(first_val, str):
            field_type = "string"
        elif isinstance(first_val, dict):
            field_type = "object"
        elif isinstance(first_val, list):
            field_type = "array"
        else:
            field_type = "unknown"

        # Compute uniqueness
        str_values = [str(v) for v in values]
        unique_values = set(str_values)
        unique_count = len(unique_values)
        unique_ratio = unique_count / len(values) if values else 0

        # Check if constant
        is_constant = unique_count == 1
        constant_value = non_null_values[0] if is_constant else None

        stats = FieldStats(
            name=key,
            field_type=field_type,
            count=len(values),
            unique_count=unique_count,
            unique_ratio=unique_ratio,
            is_constant=is_constant,
            constant_value=constant_value,
        )

        # Numeric-specific analysis
        if field_type == "numeric":
            # Filter out NaN and Infinity which break statistics functions
            nums = [v for v in non_null_values if isinstance(v, (int, float)) and math.isfinite(v)]
            if nums:
                try:
                    stats.min_val = min(nums)
                    stats.max_val = max(nums)
                    stats.mean_val = statistics.mean(nums)
                    stats.variance = statistics.variance(nums) if len(nums) > 1 else 0
                    stats.change_points = self._detect_change_points(nums)
                except (OverflowError, ValueError):
                    # Extreme values that overflow - skip detailed statistics
                    stats.min_val = None
                    stats.max_val = None
                    stats.mean_val = None
                    stats.variance = 0
                    stats.change_points = []

        # String-specific analysis
        elif field_type == "string":
            strs = [v for v in non_null_values if isinstance(v, str)]
            if strs:
                stats.avg_length = statistics.mean(len(s) for s in strs)
                stats.top_values = Counter(strs).most_common(5)

        return stats

    def _detect_change_points(self, values: list[float], window: int = 5) -> list[int]:
        """Detect indices where values change significantly."""
        if len(values) < window * 2:
            return []

        change_points = []

        # Calculate overall statistics
        overall_std = statistics.stdev(values) if len(values) > 1 else 0
        if overall_std == 0:
            return []

        threshold = self.config.variance_threshold * overall_std

        # Sliding window comparison
        for i in range(window, len(values) - window):
            before_mean = statistics.mean(values[i - window : i])
            after_mean = statistics.mean(values[i : i + window])

            if abs(after_mean - before_mean) > threshold:
                change_points.append(i)

        # Deduplicate nearby change points
        if change_points:
            deduped = [change_points[0]]
            for cp in change_points[1:]:
                if cp - deduped[-1] > window:
                    deduped.append(cp)
            return deduped

        return []

    def _detect_pattern(self, field_stats: dict[str, FieldStats], items: list[dict]) -> str:
        """Detect the data pattern using STATISTICAL analysis (no hardcoded field names).

        Pattern detection:
        - TIME_SERIES: Has a temporal field (detected by value format) + numeric variance
        - LOGS: Has a high-cardinality string field + low-cardinality categorical field
        - SEARCH_RESULTS: Has a score-like field (bounded numeric, possibly sorted)
        - GENERIC: Default
        """
        # Check for time series pattern using STRUCTURAL detection
        has_timestamp = self._detect_temporal_field(field_stats, items)

        numeric_fields = [k for k, v in field_stats.items() if v.field_type == "numeric"]
        has_numeric_with_variance = any(
            (field_stats[k].variance is not None and (field_stats[k].variance or 0) > 0)
            for k in numeric_fields
        )

        if has_timestamp and has_numeric_with_variance:
            return "time_series"

        # Check for logs pattern using STATISTICAL detection
        # Logs have: high-cardinality string (message) + low-cardinality categorical (level)
        has_message_like = False
        has_level_like = False

        for _name, stats in field_stats.items():
            if stats.field_type == "string":
                # High-cardinality string = likely message field
                if stats.unique_ratio > 0.5 and stats.avg_length and stats.avg_length > 20:
                    has_message_like = True
                # Low-cardinality string = likely level/status field
                elif stats.unique_ratio < 0.1 and 2 <= stats.unique_count <= 10:
                    has_level_like = True

        if has_message_like and has_level_like:
            return "logs"

        # Check for search results pattern using STATISTICAL score detection
        for _name, stats in field_stats.items():
            is_score, confidence = _detect_score_field_statistically(stats, items)
            if is_score and confidence >= 0.5:
                return "search_results"

        return "generic"

    def _detect_temporal_field(self, field_stats: dict[str, FieldStats], items: list[dict]) -> bool:
        """Detect if any field contains temporal values (dates/timestamps).

        Uses STRUCTURAL detection based on value format, not field names.
        """
        # Check string fields for ISO 8601 patterns
        iso_datetime_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}")
        iso_date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}$")

        for name, stats in field_stats.items():
            if stats.field_type == "string":
                # Sample some values
                sample_values = [
                    item.get(name) for item in items[:10] if isinstance(item.get(name), str)
                ]
                if sample_values:
                    # Check if values look like dates/datetimes
                    iso_count = sum(
                        1
                        for v in sample_values
                        if v is not None
                        and (iso_datetime_pattern.match(v) or iso_date_pattern.match(v))
                    )
                    if iso_count / len(sample_values) > 0.5:
                        return True

            # Check numeric fields for Unix timestamp range
            elif stats.field_type == "numeric":
                if stats.min_val and stats.max_val:
                    # Unix timestamps (seconds): 1000000000 to 2000000000 (roughly 2001-2033)
                    # Unix timestamps (milliseconds): 1000000000000 to 2000000000000
                    is_unix_seconds = 1000000000 <= stats.min_val <= 2000000000
                    is_unix_millis = 1000000000000 <= stats.min_val <= 2000000000000
                    if is_unix_seconds or is_unix_millis:
                        return True

        return False

def analyze_crushability(
|
|
1069
|
+
self,
|
|
1070
|
+
items: list[dict],
|
|
1071
|
+
field_stats: dict[str, FieldStats],
|
|
1072
|
+
) -> CrushabilityAnalysis:
|
|
1073
|
+
"""Analyze whether it's SAFE to crush this array.
|
|
1074
|
+
|
|
1075
|
+
The key insight: High variability + No importance signal = DON'T CRUSH.
|
|
1076
|
+
|
|
1077
|
+
We use STATISTICAL detection (no hardcoded field names):
|
|
1078
|
+
1. ID fields detected by uniqueness + sequential/UUID/entropy patterns
|
|
1079
|
+
2. Score fields detected by bounded range + sorted order
|
|
1080
|
+
3. Error items detected by structural outliers (rare fields, rare status values)
|
|
1081
|
+
4. Numeric anomalies (importance signal)
|
|
1082
|
+
5. Low uniqueness (safe to sample)
|
|
1083
|
+
|
|
1084
|
+
Returns:
|
|
1085
|
+
CrushabilityAnalysis with decision and reasoning.
|
|
1086
|
+
"""
|
|
1087
|
+
signals_present: list[str] = []
|
|
1088
|
+
signals_absent: list[str] = []
|
|
1089
|
+
|
|
1090
|
+
# 1. Detect ID field STATISTICALLY (no hardcoded field names)
|
|
1091
|
+
id_field_name = None
|
|
1092
|
+
id_uniqueness = 0.0
|
|
1093
|
+
id_confidence = 0.0
|
|
1094
|
+
for name, stats in field_stats.items():
|
|
1095
|
+
values = [item.get(name) for item in items if isinstance(item, dict)]
|
|
1096
|
+
is_id, confidence = _detect_id_field_statistically(stats, values)
|
|
1097
|
+
if is_id and confidence > id_confidence:
|
|
1098
|
+
id_field_name = name
|
|
1099
|
+
id_uniqueness = stats.unique_ratio
|
|
1100
|
+
id_confidence = confidence
|
|
1101
|
+
|
|
1102
|
+
has_id_field = id_field_name is not None and id_confidence >= 0.7
|
|
1103
|
+
|
|
1104
|
+
# 2. Detect score/rank field STATISTICALLY (no hardcoded field names)
|
|
1105
|
+
has_score_field = False
|
|
1106
|
+
for name, stats in field_stats.items():
|
|
1107
|
+
is_score, confidence = _detect_score_field_statistically(stats, items)
|
|
1108
|
+
if is_score:
|
|
1109
|
+
has_score_field = True
|
|
1110
|
+
signals_present.append(f"score_field:{name}(conf={confidence:.2f})")
|
|
1111
|
+
break
|
|
1112
|
+
if not has_score_field:
|
|
1113
|
+
signals_absent.append("score_field")
|
|
1114
|
+
|
|
1115
|
+
# 3. Detect error items via STRUCTURAL OUTLIERS (no hardcoded keywords)
|
|
1116
|
+
outlier_indices = _detect_structural_outliers(items)
|
|
1117
|
+
structural_outlier_count = len(outlier_indices)
|
|
1118
|
+
|
|
1119
|
+
if structural_outlier_count > 0:
|
|
1120
|
+
signals_present.append(f"structural_outliers:{structural_outlier_count}")
|
|
1121
|
+
else:
|
|
1122
|
+
signals_absent.append("structural_outliers")
|
|
1123
|
+
|
|
1124
|
+
# 3b. Also detect errors via keywords in content (for log/message-style data)
|
|
1125
|
+
# This catches errors that are in the content but not structural outliers
|
|
1126
|
+
# (e.g., Slack messages where error is in the text field)
|
|
1127
|
+
error_keyword_indices = _detect_error_items_for_preservation(items)
|
|
1128
|
+
keyword_error_count = len(error_keyword_indices)
|
|
1129
|
+
|
|
1130
|
+
if keyword_error_count > 0 and structural_outlier_count == 0:
|
|
1131
|
+
signals_present.append(f"error_keywords:{keyword_error_count}")
|
|
1132
|
+
|
|
1133
|
+
# Combined error count for crushability analysis
|
|
1134
|
+
error_count = max(structural_outlier_count, keyword_error_count)
|
|
1135
|
+
|
|
1136
|
+
# 4. Count numeric anomalies (importance signal)
|
|
1137
|
+
anomaly_count = 0
|
|
1138
|
+
anomaly_indices: set[int] = set()
|
|
1139
|
+
for stats in field_stats.values():
|
|
1140
|
+
if stats.field_type == "numeric" and stats.mean_val is not None and stats.variance:
|
|
1141
|
+
std = stats.variance**0.5
|
|
1142
|
+
if std > 0:
|
|
1143
|
+
threshold = self.config.variance_threshold * std
|
|
1144
|
+
for i, item in enumerate(items):
|
|
1145
|
+
val = item.get(stats.name)
|
|
1146
|
+
if isinstance(val, (int, float)):
|
|
1147
|
+
if abs(val - stats.mean_val) > threshold:
|
|
1148
|
+
anomaly_indices.add(i)
|
|
1149
|
+
|
|
1150
|
+
anomaly_count = len(anomaly_indices)
|
|
1151
|
+
        if anomaly_count > 0:
            signals_present.append(f"anomalies:{anomaly_count}")
        else:
            signals_absent.append("anomalies")

        # 5. Compute average string uniqueness (EXCLUDING statistically-detected ID fields)
        string_stats = [
            s for s in field_stats.values() if s.field_type == "string" and s.name != id_field_name
        ]
        avg_string_uniqueness = (
            statistics.mean(s.unique_ratio for s in string_stats) if string_stats else 0.0
        )

        # Compute uniqueness of non-ID numeric fields
        non_id_numeric_stats = [
            s for s in field_stats.values() if s.field_type == "numeric" and s.name != id_field_name
        ]
        avg_non_id_numeric_uniqueness = (
            statistics.mean(s.unique_ratio for s in non_id_numeric_stats)
            if non_id_numeric_stats
            else 0.0
        )

        # Combined uniqueness metric (including ID fields)
        max_uniqueness = max(avg_string_uniqueness, id_uniqueness, 0.0)

        # Non-ID content uniqueness (for detecting repetitive content with unique IDs)
        non_id_content_uniqueness = max(avg_string_uniqueness, avg_non_id_numeric_uniqueness)

        # 6. Check for change points (importance signal for time series)
        has_change_points = any(
            stats.change_points for stats in field_stats.values() if stats.field_type == "numeric"
        )
        if has_change_points:
            signals_present.append("change_points")

        # DECISION LOGIC
        has_any_signal = len(signals_present) > 0

        # Case 0: Repetitive content with unique IDs
        # If all non-ID fields are nearly constant, data is safe to sample
        # even if there's a unique ID field (e.g., status="success" for all items)
        if non_id_content_uniqueness < 0.1 and has_id_field:
            signals_present.append("repetitive_content")
            return CrushabilityAnalysis(
                crushable=True,
                confidence=0.85,
                reason="repetitive_content_with_ids",
                signals_present=signals_present,
                signals_absent=signals_absent,
                has_id_field=has_id_field,
                id_uniqueness=id_uniqueness,
                avg_string_uniqueness=avg_string_uniqueness,
                has_score_field=has_score_field,
                error_item_count=error_count,
                anomaly_count=anomaly_count,
            )

        # Case 1: Low uniqueness - safe to sample (data is repetitive)
        if max_uniqueness < 0.3:
            return CrushabilityAnalysis(
                crushable=True,
                confidence=0.9,
                reason="low_uniqueness_safe_to_sample",
                signals_present=signals_present,
                signals_absent=signals_absent,
                has_id_field=has_id_field,
                id_uniqueness=id_uniqueness,
                avg_string_uniqueness=avg_string_uniqueness,
                has_score_field=has_score_field,
                error_item_count=error_count,
                anomaly_count=anomaly_count,
            )

        # Case 2: High uniqueness + ID field + NO signal = DON'T CRUSH
        # This is the critical case: DB results, file listings, user lists
        if has_id_field and max_uniqueness > 0.8 and not has_any_signal:
            return CrushabilityAnalysis(
                crushable=False,
                confidence=0.85,
                reason="unique_entities_no_signal",
                signals_present=signals_present,
                signals_absent=signals_absent,
                has_id_field=has_id_field,
                id_uniqueness=id_uniqueness,
                avg_string_uniqueness=avg_string_uniqueness,
                has_score_field=has_score_field,
                error_item_count=error_count,
                anomaly_count=anomaly_count,
            )

        # Case 3: High uniqueness + has signal = CRUSH using signal
        if max_uniqueness > 0.8 and has_any_signal:
            return CrushabilityAnalysis(
                crushable=True,
                confidence=0.7,
                reason="unique_entities_with_signal",
                signals_present=signals_present,
                signals_absent=signals_absent,
                has_id_field=has_id_field,
                id_uniqueness=id_uniqueness,
                avg_string_uniqueness=avg_string_uniqueness,
                has_score_field=has_score_field,
                error_item_count=error_count,
                anomaly_count=anomaly_count,
            )

        # Case 4: Medium uniqueness + no signal = be cautious, don't crush
        if not has_any_signal:
            return CrushabilityAnalysis(
                crushable=False,
                confidence=0.6,
                reason="medium_uniqueness_no_signal",
                signals_present=signals_present,
                signals_absent=signals_absent,
                has_id_field=has_id_field,
                id_uniqueness=id_uniqueness,
                avg_string_uniqueness=avg_string_uniqueness,
                has_score_field=has_score_field,
                error_item_count=error_count,
                anomaly_count=anomaly_count,
            )

        # Case 5: Medium uniqueness + has signal = crush with caution
        return CrushabilityAnalysis(
            crushable=True,
            confidence=0.5,
            reason="medium_uniqueness_with_signal",
            signals_present=signals_present,
            signals_absent=signals_absent,
            has_id_field=has_id_field,
            id_uniqueness=id_uniqueness,
            avg_string_uniqueness=avg_string_uniqueness,
            has_score_field=has_score_field,
            error_item_count=error_count,
            anomaly_count=anomaly_count,
        )

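    # Worked example of the decision logic above (illustrative only, not part of the
    # shipped code): for 200 near-identical API results where every non-ID field is
    # constant and a unique ID field is present, non_id_content_uniqueness is ~0.0,
    # so Case 0 fires and the analysis returns crushable=True with
    # reason="repetitive_content_with_ids" at confidence 0.85.
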
    def _select_strategy(
        self,
        field_stats: dict[str, FieldStats],
        pattern: str,
        item_count: int,
        crushability: CrushabilityAnalysis | None = None,
    ) -> CompressionStrategy:
        """Select optimal compression strategy based on analysis."""
        if item_count < self.config.min_items_to_analyze:
            return CompressionStrategy.NONE

        # CRITICAL: Check crushability first
        if crushability is not None and not crushability.crushable:
            return CompressionStrategy.SKIP

        if pattern == "time_series":
            # Check if there are change points worth preserving
            numeric_fields = [v for v in field_stats.values() if v.field_type == "numeric"]
            has_change_points = any(f.change_points for f in numeric_fields)
            if has_change_points:
                return CompressionStrategy.TIME_SERIES

        if pattern == "logs":
            # Check if messages are clusterable (low-medium uniqueness)
            message_field = next(
                (v for k, v in field_stats.items() if "message" in k.lower()), None
            )
            if message_field and message_field.unique_ratio < 0.5:
                return CompressionStrategy.CLUSTER_SAMPLE

        if pattern == "search_results":
            return CompressionStrategy.TOP_N

        # Default: smart sampling
        return CompressionStrategy.SMART_SAMPLE

    def _estimate_reduction(
        self, field_stats: dict[str, FieldStats], strategy: CompressionStrategy, item_count: int
    ) -> float:
        """Estimate token reduction ratio."""
        if strategy == CompressionStrategy.NONE:
            return 0.0

        # Count constant fields (will be factored out)
        constant_ratio = sum(1 for v in field_stats.values() if v.is_constant) / len(field_stats)

        # Estimate based on strategy
        base_reduction = {
            CompressionStrategy.TIME_SERIES: 0.7,
            CompressionStrategy.CLUSTER_SAMPLE: 0.8,
            CompressionStrategy.TOP_N: 0.6,
            CompressionStrategy.SMART_SAMPLE: 0.5,
        }.get(strategy, 0.3)

        # Adjust for constants
        reduction = base_reduction + (constant_ratio * 0.2)

        return min(reduction, 0.95)


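# Worked example of _estimate_reduction (illustrative only): for CLUSTER_SAMPLE with
# half of the fields constant, base_reduction = 0.8 and constant_ratio = 0.5, so the
# estimate is 0.8 + 0.5 * 0.2 = 0.9, which stays below the 0.95 cap.
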
class SmartCrusher(Transform):
    """
    Intelligent tool output compression using statistical analysis.

    Unlike fixed-rule crushing, SmartCrusher:
    1. Analyzes JSON structure and computes field statistics
    2. Detects data patterns (time series, logs, search results)
    3. Identifies constant fields to factor out
    4. Finds change points in numeric data to preserve
    5. Applies optimal compression strategy per data type
    6. Uses RelevanceScorer for semantic matching of user queries

    This results in higher compression with lower information loss.
    """

    name = "smart_crusher"

    def __init__(
        self,
        config: SmartCrusherConfig | None = None,
        relevance_config: RelevanceScorerConfig | None = None,
        scorer: RelevanceScorer | None = None,
        ccr_config: CCRConfig | None = None,
    ):
        self.config = config or SmartCrusherConfig()
        self.analyzer = SmartAnalyzer(self.config)

        # CCR (Compress-Cache-Retrieve) configuration
        # When no ccr_config provided, default to caching enabled but markers disabled
        # This maintains backward compatibility - callers must opt-in to markers
        if ccr_config is None:
            self._ccr_config = CCRConfig(
                enabled=True,  # Still cache for potential retrieval
                inject_retrieval_marker=False,  # Don't break JSON parsing by default
            )
        else:
            self._ccr_config = ccr_config
        self._compression_store: CompressionStore | None = None

        # Feedback loop for learning compression patterns
        self._feedback: CompressionFeedback | None = None

        # CRITICAL FIX: Lock for thread-safe lazy initialization
        # Without this, multiple threads could call _get_* methods simultaneously
        # and potentially create redundant initialization calls.
        self._lazy_init_lock = threading.Lock()

        # Initialize relevance scorer
        if scorer is not None:
            self._scorer = scorer
        else:
            rel_config = relevance_config or RelevanceScorerConfig()
            # Build kwargs based on tier - BM25 params only apply to bm25 tier
            scorer_kwargs = {}
            if rel_config.tier == "bm25":
                scorer_kwargs = {"k1": rel_config.bm25_k1, "b": rel_config.bm25_b}
            elif rel_config.tier == "hybrid":
                scorer_kwargs = {
                    "alpha": rel_config.hybrid_alpha,
                    "adaptive": rel_config.adaptive_alpha,
                }
            self._scorer = create_scorer(tier=rel_config.tier, **scorer_kwargs)
        # Use threshold from config, or default from RelevanceScorerConfig
        rel_cfg = relevance_config or RelevanceScorerConfig()
        self._relevance_threshold = rel_cfg.relevance_threshold

        # NOTE: Error detection now uses structural outlier detection (_detect_structural_outliers)
        # instead of hardcoded keywords. This scales to any data domain.

    def crush(self, content: str, query: str = "") -> CrushResult:
        """Crush content string directly (for use by ContentRouter).

        This is a simplified interface for compressing a single content string,
        used by ContentRouter when routing JSON arrays to SmartCrusher.

        Args:
            content: JSON string content to compress.
            query: Query context for relevance-based compression.

        Returns:
            CrushResult with compressed content and metadata.
        """
        compressed, was_modified, analysis_info = self._smart_crush_content(
            content, query_context=query
        )
        return CrushResult(
            compressed=compressed,
            original=content,
            was_modified=was_modified,
            strategy=analysis_info or "passthrough",
        )

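    # Usage sketch for crush() (illustrative only; the JSON payload is made up and
    # default configs are assumed):
    #
    #   crusher = SmartCrusher()
    #   result = crusher.crush('[{"id": 1, "status": "ok"}, ...]', query="failed jobs")
    #   if result.was_modified:
    #       print(result.strategy, len(result.compressed))
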
    def _get_compression_store(self) -> CompressionStore:
        """Get the compression store for CCR (lazy initialization).

        CRITICAL FIX: Thread-safe double-checked locking pattern.
        """
        if self._compression_store is None:
            with self._lazy_init_lock:
                # Double-check after acquiring lock
                if self._compression_store is None:
                    self._compression_store = get_compression_store(
                        max_entries=self._ccr_config.store_max_entries,
                        default_ttl=self._ccr_config.store_ttl_seconds,
                    )
        return self._compression_store

    def _get_feedback(self) -> CompressionFeedback:
        """Get the feedback analyzer (lazy initialization).

        CRITICAL FIX: Thread-safe double-checked locking pattern.
        """
        if self._feedback is None:
            with self._lazy_init_lock:
                if self._feedback is None:
                    self._feedback = get_compression_feedback()
        return self._feedback

    def _get_telemetry(self) -> TelemetryCollector:
        """Get the telemetry collector (lazy initialization).

        CRITICAL FIX: Thread-safe double-checked locking pattern.
        """
        # Use getattr to avoid hasattr race condition
        if getattr(self, "_telemetry", None) is None:
            with self._lazy_init_lock:
                if getattr(self, "_telemetry", None) is None:
                    self._telemetry = get_telemetry_collector()
        return self._telemetry

    def _get_toin(self) -> ToolIntelligenceNetwork:
        """Get the TOIN instance (lazy initialization).

        CRITICAL FIX: Thread-safe double-checked locking pattern.
        """
        # Use getattr to avoid hasattr race condition
        if getattr(self, "_toin", None) is None:
            with self._lazy_init_lock:
                if getattr(self, "_toin", None) is None:
                    self._toin = get_toin()
        return self._toin

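    # The four getters above share the same double-checked locking shape: check the
    # attribute, acquire self._lazy_init_lock, re-check under the lock, then build the
    # resource once. The second check is what stops two threads from both constructing it.
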
    def _record_telemetry(
        self,
        items: list[dict],
        result: list,
        analysis: ArrayAnalysis,
        plan: CompressionPlan,
        tool_name: str | None = None,
    ) -> None:
        """Record compression telemetry for the data flywheel.

        This collects anonymized statistics about compression patterns to
        enable cross-user learning and improve compression over time.

        Privacy guarantees:
        - No actual data values are stored
        - Tool names can be hashed
        - Only structural patterns are captured
        """
        try:
            telemetry = self._get_telemetry()

            # Calculate what was kept
            kept_first_n = sum(1 for i in plan.keep_indices if i < 3)
            kept_last_n = sum(1 for i in plan.keep_indices if i >= len(items) - 2)

            # Count error items in result
            error_indices = set(_detect_error_items_for_preservation(items))
            kept_errors = sum(1 for i in plan.keep_indices if i in error_indices)

            # Count anomalies (approximate from change points)
            anomaly_count = 0
            for stats in analysis.field_stats.values():
                if stats.change_points:
                    anomaly_count += len(stats.change_points)
            kept_anomalies = min(anomaly_count, len(plan.keep_indices))

            # Crushability info
            crushability_score = None
            crushability_reason = None
            if analysis.crushability:
                crushability_score = analysis.crushability.confidence
                crushability_reason = analysis.crushability.reason

            # Record the event
            telemetry.record_compression(
                items=items[:100],  # Sample for structure analysis
                original_count=len(items),
                compressed_count=len(result),
                original_tokens=0,  # Not available here
                compressed_tokens=0,  # Not available here
                strategy=analysis.recommended_strategy.value,
                tool_name=tool_name,
                strategy_reason=analysis.detected_pattern,
                crushability_score=crushability_score,
                crushability_reason=crushability_reason,
                kept_first_n=kept_first_n,
                kept_last_n=kept_last_n,
                kept_errors=kept_errors,
                kept_anomalies=kept_anomalies,
                kept_by_relevance=0,  # Would need to track separately
                kept_by_score=0,  # Would need to track separately
            )
        except Exception:
            # Telemetry should never break compression
            pass

    def _prioritize_indices(
        self,
        keep_indices: set[int],
        items: list[dict],
        n: int,
        analysis: ArrayAnalysis | None = None,
        max_items: int | None = None,
        field_semantics: dict[str, FieldSemantics] | None = None,
    ) -> set[int]:
        """Prioritize indices when we exceed max_items, ALWAYS keeping critical items.

        Priority order:
        1. ALL error items (non-negotiable) - items with error keywords
        2. ALL structural outliers (non-negotiable) - items with rare fields/status values
        3. ALL numeric anomalies (non-negotiable) - e.g., unusual values like 999999
        4. ALL items with important values (learned) - TOIN field semantics
        5. First 3 items (context)
        6. Last 2 items (context)
        7. Other important items by index order

        Uses BOTH keyword detection (for preservation guarantee) AND statistical detection,
        PLUS learned field semantics from TOIN for zero-latency signal detection.

        HIGH FIX: Note that this function may return MORE items than effective_max
        when critical items (errors, outliers, anomalies) exceed the limit. This is
        intentional to preserve the quality guarantee. A warning is logged when this
        happens to help diagnose cases where compression is less effective than expected.

        Args:
            keep_indices: Initial set of indices to keep.
            items: The items being compressed.
            n: Total number of items.
            analysis: Optional analysis results for anomaly detection.
            max_items: Thread-safe max items limit (defaults to config value).
            field_semantics: Optional learned field semantics from TOIN.

        Returns:
            Set of indices to keep (may exceed max_items if critical items require it).
        """
        # Use provided max_items or fall back to config
        effective_max = max_items if max_items is not None else self.config.max_items_after_crush

        if len(keep_indices) <= effective_max:
            return keep_indices

        # Use provided field_semantics or fall back to instance variable (set by crush())
        effective_field_semantics = field_semantics or getattr(
            self, "_current_field_semantics", None
        )

        # Identify error items using KEYWORD detection (preservation guarantee)
        # This ensures ALL error items are kept, regardless of frequency
        error_indices = set(_detect_error_items_for_preservation(items))

        # Identify structural outlier indices using STATISTICAL detection
        # (items with rare fields or rare status values)
        outlier_indices = set(_detect_structural_outliers(items))

        # Identify numeric anomalies (MUST keep ALL of them)
        anomaly_indices = set()
        if analysis and analysis.field_stats:
            for field_name, stats in analysis.field_stats.items():
                if stats.field_type == "numeric" and stats.mean_val is not None and stats.variance:
                    std = stats.variance**0.5
                    if std > 0:
                        threshold = self.config.variance_threshold * std
                        for i, item in enumerate(items):
                            val = item.get(field_name)
                            if isinstance(val, (int, float)):
                                if abs(val - stats.mean_val) > threshold:
                                    anomaly_indices.add(i)

        # === TOIN Evolution: Identify items with important values (learned) ===
        # Uses learned field semantics for zero-latency signal detection
        learned_important_indices: set[int] = set()
        if effective_field_semantics:
            learned_important_indices = set(
                _detect_items_by_learned_semantics(items, effective_field_semantics)
            )

        # Start with all critical items (these are non-negotiable)
        # Error items are ALWAYS preserved (quality guarantee)
        prioritized = error_indices | outlier_indices | anomaly_indices | learned_important_indices

        # HIGH FIX: Log warning if critical items alone exceed the limit
        # This helps diagnose why compression may be less effective than expected
        critical_count = len(prioritized)
        if critical_count > effective_max:
            logger.warning(
                "Critical items (%d) exceed max_items (%d): errors=%d outliers=%d anomalies=%d learned=%d. "
                "Quality guarantee takes precedence - keeping all critical items.",
                critical_count,
                effective_max,
                len(error_indices),
                len(outlier_indices),
                len(anomaly_indices),
                len(learned_important_indices),
            )

        # Add first/last items if we have room
        remaining_slots = effective_max - len(prioritized)
        if remaining_slots > 0:
            # First 3 items
            for i in range(min(3, n)):
                if i not in prioritized and remaining_slots > 0:
                    prioritized.add(i)
                    remaining_slots -= 1
            # Last 2 items
            for i in range(max(0, n - 2), n):
                if i not in prioritized and remaining_slots > 0:
                    prioritized.add(i)
                    remaining_slots -= 1

        # Fill remaining slots with other important indices (by index order)
        if remaining_slots > 0:
            other_indices = sorted(keep_indices - prioritized)
            for i in other_indices:
                if remaining_slots <= 0:
                    break
                prioritized.add(i)
                remaining_slots -= 1

        return prioritized

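    # Illustrative trace of _prioritize_indices (not from a real run): with
    # effective_max=10, 4 error items, 2 structural outliers, and 1 numeric anomaly,
    # the 7 critical indices are kept unconditionally, and the remaining 3 slots are
    # filled by first/last context items and then other keep_indices in index order.
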
    def should_apply(
        self,
        messages: list[dict[str, Any]],
        tokenizer: Tokenizer,
        **kwargs: Any,
    ) -> bool:
        """Check if any tool messages would benefit from smart crushing."""
        if not self.config.enabled:
            return False

        for msg in messages:
            # OpenAI style: role="tool"
            if msg.get("role") == "tool":
                content = msg.get("content", "")
                if isinstance(content, str):
                    tokens = tokenizer.count_text(content)
                    if tokens > self.config.min_tokens_to_crush:
                        # Check if it's JSON with arrays
                        parsed, success = safe_json_loads(content)
                        if success and self._has_crushable_arrays(parsed):
                            return True

            # Anthropic style: role="user" with tool_result content blocks
            content = msg.get("content")
            if isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("type") == "tool_result":
                        tool_content = block.get("content", "")
                        if isinstance(tool_content, str):
                            tokens = tokenizer.count_text(tool_content)
                            if tokens > self.config.min_tokens_to_crush:
                                parsed, success = safe_json_loads(tool_content)
                                if success and self._has_crushable_arrays(parsed):
                                    return True

        return False

    def _has_crushable_arrays(self, data: Any, depth: int = 0) -> bool:
        """Check if data contains arrays large enough to crush."""
        if depth > 5:
            return False

        if isinstance(data, list):
            if len(data) >= self.config.min_items_to_analyze:
                if data and isinstance(data[0], dict):
                    return True
            for item in data[:10]:  # Check first few items
                if self._has_crushable_arrays(item, depth + 1):
                    return True

        elif isinstance(data, dict):
            for value in data.values():
                if self._has_crushable_arrays(value, depth + 1):
                    return True

        return False

    def apply(
        self,
        messages: list[dict[str, Any]],
        tokenizer: Tokenizer,
        **kwargs: Any,
    ) -> TransformResult:
        """Apply smart crushing to messages."""
        tokens_before = tokenizer.count_messages(messages)
        result_messages = deep_copy_messages(messages)
        transforms_applied: list[str] = []
        markers_inserted: list[str] = []
        warnings: list[str] = []

        # Extract query context from recent user messages for relevance scoring
        query_context = self._extract_context_from_messages(result_messages)

        crushed_count = 0

        for msg in result_messages:
            # OpenAI style
            if msg.get("role") == "tool":
                content = msg.get("content", "")
                if not isinstance(content, str):
                    continue

                tokens = tokenizer.count_text(content)
                if tokens <= self.config.min_tokens_to_crush:
                    continue

                crushed, was_modified, analysis_info = self._smart_crush_content(
                    content, query_context
                )

                if was_modified:
                    original_hash = compute_short_hash(content)
                    marker = create_tool_digest_marker(original_hash)
                    msg["content"] = crushed + "\n" + marker
                    crushed_count += 1
                    markers_inserted.append(marker)
                    if analysis_info:
                        transforms_applied.append(f"smart:{analysis_info}")

            # Anthropic style
            content = msg.get("content")
            if isinstance(content, list):
                for i, block in enumerate(content):
                    if not isinstance(block, dict):
                        continue
                    if block.get("type") != "tool_result":
                        continue

                    tool_content = block.get("content", "")
                    if not isinstance(tool_content, str):
                        continue

                    tokens = tokenizer.count_text(tool_content)
                    if tokens <= self.config.min_tokens_to_crush:
                        continue

                    crushed, was_modified, analysis_info = self._smart_crush_content(
                        tool_content, query_context
                    )

                    if was_modified:
                        original_hash = compute_short_hash(tool_content)
                        marker = create_tool_digest_marker(original_hash)
                        content[i]["content"] = crushed + "\n" + marker
                        crushed_count += 1
                        markers_inserted.append(marker)
                        if analysis_info:
                            transforms_applied.append(f"smart:{analysis_info}")

        if crushed_count > 0:
            transforms_applied.insert(0, f"smart_crush:{crushed_count}")

        tokens_after = tokenizer.count_messages(result_messages)

        return TransformResult(
            messages=result_messages,
            tokens_before=tokens_before,
            tokens_after=tokens_after,
            transforms_applied=transforms_applied,
            markers_inserted=markers_inserted,
            warnings=warnings,
        )

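    # Usage sketch for apply() (illustrative only; `tok` stands in for whichever
    # Tokenizer implementation the pipeline provides):
    #
    #   result = SmartCrusher().apply(messages, tokenizer=tok)
    #   saved_tokens = result.tokens_before - result.tokens_after
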
    def _extract_context_from_messages(self, messages: list[dict[str, Any]]) -> str:
        """Extract query context from recent messages for relevance scoring.

        Builds a context string from:
        - Recent user messages (what the user is asking about)
        - Recent tool call arguments (what data was requested)

        This context is used by RelevanceScorer to determine which items
        to preserve during crushing.

        Args:
            messages: Full message list.

        Returns:
            Context string for relevance scoring.
        """
        context_parts: list[str] = []

        # Look at last 5 user messages (most relevant to recent tool calls)
        user_message_count = 0
        for msg in reversed(messages):
            if msg.get("role") == "user":
                content = msg.get("content")
                if isinstance(content, str):
                    context_parts.append(content)
                elif isinstance(content, list):
                    # Anthropic style - extract from text blocks
                    for block in content:
                        if isinstance(block, dict) and block.get("type") == "text":
                            text = block.get("text", "")
                            if text:
                                context_parts.append(text)

                user_message_count += 1
                if user_message_count >= 5:
                    break

            # Also check assistant tool_calls for function arguments
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                for tc in msg.get("tool_calls", []):
                    if isinstance(tc, dict):
                        func = tc.get("function", {})
                        args = func.get("arguments", "")
                        if isinstance(args, str) and args:
                            context_parts.append(args)

        return " ".join(context_parts)

    def _smart_crush_content(
        self, content: str, query_context: str = "", tool_name: str | None = None
    ) -> tuple[str, bool, str]:
        """
        Apply smart crushing to content.

        Handles both JSON (existing SmartCrusher logic) and plain text content
        (search results, logs, generic text) using specialized compressors.

        Args:
            content: Content to crush (JSON or plain text).
            query_context: Context string from user messages for relevance scoring.
            tool_name: Name of the tool that produced this output.

        Returns:
            Tuple of (crushed_content, was_modified, analysis_info).
        """
        parsed, success = safe_json_loads(content)
        if not success:
            # Not JSON - pass through unchanged
            # Text compression utilities (SearchCompressor, LogCompressor, TextCompressor)
            # are available as standalone tools for applications to use explicitly
            return content, False, ""

        # Recursively process and crush arrays
        crushed, info, ccr_markers = self._process_value(
            parsed, query_context=query_context, tool_name=tool_name
        )

        result = safe_json_dumps(crushed, indent=None)
        was_modified = result != content.strip()

        # CCR: Inject retrieval markers if compression happened and CCR is enabled
        if was_modified and ccr_markers and self._ccr_config.inject_retrieval_marker:
            for ccr_hash, original_count, compressed_count in ccr_markers:
                marker = self._ccr_config.marker_template.format(
                    original_count=original_count,
                    compressed_count=compressed_count,
                    hash=ccr_hash,
                )
                result += marker

        return result, was_modified, info

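    # Note on the marker injection above: when inject_retrieval_marker is enabled, one
    # marker string (built from marker_template with original_count, compressed_count
    # and hash) is appended to the serialized JSON per compressed array, so consumers
    # that strictly re-parse the content should keep the default of markers disabled.
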
    def _process_value(
        self, value: Any, depth: int = 0, query_context: str = "", tool_name: str | None = None
    ) -> tuple[Any, str, list[tuple[str, int, int]]]:
        """Recursively process a value, crushing arrays where appropriate.

        Returns:
            Tuple of (processed_value, info_string, ccr_markers).
            ccr_markers is a list of (hash, original_count, compressed_count) tuples.
        """
        info_parts = []
        ccr_markers: list[tuple[str, int, int]] = []

        if isinstance(value, list):
            # Check if this array should be crushed
            # Must have enough items AND all items must be dicts (not mixed types)
            all_dicts = value and all(isinstance(item, dict) for item in value)
            if len(value) >= self.config.min_items_to_analyze and all_dicts:
                crushed, strategy, ccr_hash = self._crush_array(value, query_context, tool_name)
                info_parts.append(f"{strategy}({len(value)}->{len(crushed)})")

                # Track CCR marker for later injection
                if ccr_hash:
                    ccr_markers.append((ccr_hash, len(value), len(crushed)))

                return crushed, ",".join(info_parts), ccr_markers
            else:
                # Process items recursively
                processed = []
                for item in value:
                    p_item, p_info, p_markers = self._process_value(
                        item, depth + 1, query_context, tool_name
                    )
                    processed.append(p_item)
                    if p_info:
                        info_parts.append(p_info)
                    ccr_markers.extend(p_markers)
                return processed, ",".join(info_parts), ccr_markers

        elif isinstance(value, dict):
            # Process values recursively
            processed_dict: dict[str, Any] = {}
            for k, v in value.items():
                p_val, p_info, p_markers = self._process_value(
                    v, depth + 1, query_context, tool_name
                )
                processed_dict[k] = p_val
                if p_info:
                    info_parts.append(p_info)
                ccr_markers.extend(p_markers)
            return processed_dict, ",".join(info_parts), ccr_markers

        else:
            return value, "", []

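    # Illustrative _process_value outcome (made-up numbers): a payload like
    # {"items": [<120 dicts>], "page": 3} comes back with the 120-item list crushed,
    # an info string such as "smart_sample(120->25)", and one CCR marker tuple
    # (hash, 120, 25) when the original was cached.
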
    def _crush_array(
        self, items: list[dict], query_context: str = "", tool_name: str | None = None
    ) -> tuple[list, str, str | None]:
        """Crush an array using statistical analysis and relevance scoring.

        IMPORTANT: If crushability analysis determines it's not safe to crush
        (high variability + no importance signal), returns original array unchanged.

        TOIN-aware: Consults the Tool Output Intelligence Network for cross-user
        learned patterns. High retrieval rate across all users → compress less.

        Feedback-aware: Uses learned patterns to adjust compression aggressiveness.
        High retrieval rate for a tool → compress less aggressively.

        Returns:
            Tuple of (crushed_items, strategy_info, ccr_hash).
            ccr_hash is the hash for retrieval if CCR is enabled, None otherwise.
        """
        # BOUNDARY CHECK: If already at or below max_items, no compression needed
        if len(items) <= self.config.max_items_after_crush:
            return items, "none:at_limit", None

        # Get feedback hints if enabled
        # THREAD-SAFETY: Use a local effective_max_items instead of mutating shared config
        effective_max_items = self.config.max_items_after_crush
        hints_applied = False
        toin_hint_applied = False

        # Create ToolSignature for TOIN lookup
        tool_signature = ToolSignature.from_items(items)

        # TOIN: Get cross-user learned recommendations
        toin = self._get_toin()
        toin_hint = toin.get_recommendation(tool_signature, query_context)

        if toin_hint.skip_compression:
            return items, f"skip:toin({toin_hint.reason})", None

        # Apply TOIN recommendations if from network or local learning
        toin_preserve_fields: list[str] = []
        toin_recommended_strategy: str | None = None
        toin_compression_level: str | None = None
        # LOW FIX #21: Use configurable threshold instead of hardcoded 0.5
        if (
            toin_hint.source in ("network", "local")
            and toin_hint.confidence >= self.config.toin_confidence_threshold
        ):
            # TOIN recommendations take precedence over local feedback
            effective_max_items = toin_hint.max_items
            toin_preserve_fields = toin_hint.preserve_fields  # Fields to never remove
            toin_hint_applied = True
            # Store strategy and compression level for later use
            if toin_hint.recommended_strategy != "default":
                toin_recommended_strategy = toin_hint.recommended_strategy
            if toin_hint.compression_level != "moderate":
                toin_compression_level = toin_hint.compression_level

        # === TOIN Evolution: Extract field semantics for signal detection ===
        # Store temporarily on instance for use in _prioritize_indices
        # This enables learned signal detection without changing all method signatures
        self._current_field_semantics = (
            toin_hint.field_semantics if toin_hint.field_semantics else None
        )

        # Local feedback hints (if TOIN didn't apply)
        if not toin_hint_applied and self.config.use_feedback_hints and tool_name:
            feedback = self._get_feedback()
            hints = feedback.get_compression_hints(tool_name)

            # Check if hints recommend skipping compression
            if hints.skip_compression:
                return items, f"skip:feedback({hints.reason})", None

            # Adjust max_items based on feedback
            if hints.suggested_items is not None:
                effective_max_items = hints.suggested_items
                hints_applied = True

            # Use preserve_fields from local feedback (hash them for TOIN compatibility)
            # Note: CompressionFeedback stores actual field names, but _plan methods
            # expect SHA256[:8] hashes for privacy-preserving comparison
            if hints.preserve_fields:
                toin_preserve_fields = [_hash_field_name(field) for field in hints.preserve_fields]

            # Use recommended_strategy from local feedback if not already set by TOIN
            if hints.recommended_strategy and not toin_recommended_strategy:
                toin_recommended_strategy = hints.recommended_strategy

        try:
            # Analyze the array (includes crushability check)
            analysis = self.analyzer.analyze_array(items)

            # CRITICAL: If not crushable, return original array unchanged
            if analysis.recommended_strategy == CompressionStrategy.SKIP:
                reason = ""
                if analysis.crushability:
                    reason = f"skip:{analysis.crushability.reason}"
                return items, reason, None

            # Apply TOIN strategy recommendation if available
            # TOIN learns which strategies work best from cross-user patterns
            if toin_recommended_strategy:
                try:
                    toin_strategy = CompressionStrategy(toin_recommended_strategy)
                    # Only override if TOIN suggests a valid non-SKIP strategy
                    if toin_strategy != CompressionStrategy.SKIP:
                        analysis.recommended_strategy = toin_strategy
                except ValueError:
                    pass  # Invalid strategy name, keep analyzer's choice

            # Apply TOIN compression level to adjust effective_max_items
            if toin_compression_level:
                if toin_compression_level == "none":
                    # Don't compress - return original
                    return items, "skip:toin_level_none", None
                elif toin_compression_level == "conservative":
                    # Be conservative - keep more items
                    effective_max_items = max(effective_max_items, min(50, len(items) // 2))
                elif toin_compression_level == "aggressive":
                    # Be aggressive - keep fewer items
                    effective_max_items = min(effective_max_items, 15)

            # Create compression plan with relevance scoring
            # Pass TOIN preserve_fields so items with those fields get priority
            # Pass effective_max_items for thread-safe compression
            plan = self._create_plan(
                analysis,
                items,
                query_context,
                preserve_fields=toin_preserve_fields or None,
                effective_max_items=effective_max_items,
            )

            # Execute compression
            result = self._execute_plan(plan, items, analysis)

            # CCR: Store original content for retrieval if enabled
            ccr_hash = None
            if (
                self._ccr_config.enabled
                and len(items) >= self._ccr_config.min_items_to_cache
                and len(result) < len(items)  # Only cache if compression actually happened
            ):
                store = self._get_compression_store()
                original_json = json.dumps(items, default=str)
                compressed_json = json.dumps(result, default=str)

                ccr_hash = store.store(
                    original=original_json,
                    compressed=compressed_json,
                    original_item_count=len(items),
                    compressed_item_count=len(result),
                    tool_name=tool_name,
                    query_context=query_context,
                    # CRITICAL: Pass the tool_signature_hash so retrieval events
                    # can be correlated with compression events in TOIN
                    tool_signature_hash=tool_signature.structure_hash,
                    compression_strategy=analysis.recommended_strategy.value,
                )

            # Record compression event for feedback loop
            if self.config.use_feedback_hints and tool_name:
                feedback = self._get_feedback()
                feedback.record_compression(
                    tool_name=tool_name,
                    original_count=len(items),
                    compressed_count=len(result),
                    strategy=analysis.recommended_strategy.value,
                    tool_signature_hash=tool_signature.structure_hash,
                )

            # Record telemetry for data flywheel
            self._record_telemetry(
                items=items,
                result=result,
                analysis=analysis,
                plan=plan,
                tool_name=tool_name,
            )

            # TOIN: Record compression event for cross-user learning
            try:
                # Calculate token counts (approximate)
                original_tokens = len(json.dumps(items, default=str)) // 4
                compressed_tokens = len(json.dumps(result, default=str)) // 4

                toin.record_compression(
                    tool_signature=tool_signature,
                    original_count=len(items),
                    compressed_count=len(result),
                    original_tokens=original_tokens,
                    compressed_tokens=compressed_tokens,
                    strategy=analysis.recommended_strategy.value,
                    query_context=query_context,
                    items=items,  # Pass items for field-level semantic learning
                )
            except Exception:
                # TOIN should never break compression
                pass

            strategy_info = analysis.recommended_strategy.value
            if toin_hint_applied:
                toin_parts = [f"items={toin_hint.max_items}", f"conf={toin_hint.confidence:.2f}"]
                if toin_recommended_strategy:
                    toin_parts.append(f"strategy={toin_recommended_strategy}")
                if toin_compression_level and toin_compression_level != "moderate":
                    toin_parts.append(f"level={toin_compression_level}")
                strategy_info += f"(toin:{','.join(toin_parts)})"
            elif hints_applied:
                strategy_info += f"(feedback:{effective_max_items})"

            # Clean up temporary instance variable
            self._current_field_semantics = None
            return result, strategy_info, ccr_hash

        except Exception:
            # Clean up temporary instance variable
            self._current_field_semantics = None
            # Re-raise any exceptions (removed finally block since we no longer mutate config)
            raise

    def _create_plan(
        self,
        analysis: ArrayAnalysis,
        items: list[dict],
        query_context: str = "",
        preserve_fields: list[str] | None = None,
        effective_max_items: int | None = None,
    ) -> CompressionPlan:
        """Create a detailed compression plan using relevance scoring.

        Args:
            analysis: The array analysis results.
            items: The items to compress.
            query_context: Context string from user messages for relevance scoring.
            preserve_fields: TOIN-learned fields that users commonly retrieve.
                Items with values in these fields get higher priority.
            effective_max_items: Thread-safe max items limit (defaults to config value).
        """
        # Use provided effective_max_items or fall back to config
        max_items = (
            effective_max_items
            if effective_max_items is not None
            else self.config.max_items_after_crush
        )

        plan = CompressionPlan(
            strategy=analysis.recommended_strategy,
            constant_fields=analysis.constant_fields if self.config.factor_out_constants else {},
        )

        # Handle SKIP - keep all items (shouldn't normally reach here)
        if analysis.recommended_strategy == CompressionStrategy.SKIP:
            plan.keep_indices = list(range(len(items)))
            return plan

        if analysis.recommended_strategy == CompressionStrategy.TIME_SERIES:
            plan = self._plan_time_series(
                analysis, items, plan, query_context, preserve_fields, max_items
            )

        elif analysis.recommended_strategy == CompressionStrategy.CLUSTER_SAMPLE:
            plan = self._plan_cluster_sample(
                analysis, items, plan, query_context, preserve_fields, max_items
            )

        elif analysis.recommended_strategy == CompressionStrategy.TOP_N:
            plan = self._plan_top_n(
                analysis, items, plan, query_context, preserve_fields, max_items
            )

        else:  # SMART_SAMPLE or NONE
            plan = self._plan_smart_sample(
                analysis, items, plan, query_context, preserve_fields, max_items
            )

        return plan

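    # Dispatch summary for _create_plan (descriptive): SKIP keeps every index,
    # TIME_SERIES goes to _plan_time_series, CLUSTER_SAMPLE to _plan_cluster_sample,
    # TOP_N to _plan_top_n, and anything else (SMART_SAMPLE or NONE) falls through
    # to _plan_smart_sample.
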
def _plan_time_series(
|
|
2249
|
+
self,
|
|
2250
|
+
analysis: ArrayAnalysis,
|
|
2251
|
+
items: list[dict],
|
|
2252
|
+
plan: CompressionPlan,
|
|
2253
|
+
query_context: str = "",
|
|
2254
|
+
preserve_fields: list[str] | None = None,
|
|
2255
|
+
max_items: int | None = None,
|
|
2256
|
+
) -> CompressionPlan:
|
|
2257
|
+
"""Plan compression for time series data.
|
|
2258
|
+
|
|
2259
|
+
Keeps items around change points (anomalies) plus first/last items.
|
|
2260
|
+
Uses STATISTICAL outlier detection for important items.
|
|
2261
|
+
Uses RelevanceScorer for semantic matching of user queries.
|
|
2262
|
+
|
|
2263
|
+
Args:
|
|
2264
|
+
preserve_fields: TOIN-learned fields that users commonly retrieve.
|
|
2265
|
+
Items where query_context matches these field values get priority.
|
|
2266
|
+
max_items: Thread-safe max items limit (defaults to config value).
|
|
2267
|
+
"""
|
|
2268
|
+
# Use provided max_items or fall back to config
|
|
2269
|
+
effective_max = max_items if max_items is not None else self.config.max_items_after_crush
|
|
2270
|
+
n = len(items)
|
|
2271
|
+
keep_indices = set()
|
|
2272
|
+
|
|
2273
|
+
# 1. First 3 items
|
|
2274
|
+
for i in range(min(3, n)):
|
|
2275
|
+
keep_indices.add(i)
|
|
2276
|
+
|
|
2277
|
+
# 2. Last 2 items
|
|
2278
|
+
for i in range(max(0, n - 2), n):
|
|
2279
|
+
keep_indices.add(i)
|
|
2280
|
+
|
|
2281
|
+
# 3. Items around change points from numeric fields
|
|
2282
|
+
for stats in analysis.field_stats.values():
|
|
2283
|
+
if stats.change_points:
|
|
2284
|
+
for cp in stats.change_points:
|
|
2285
|
+
# Keep a window around each change point
|
|
2286
|
+
for offset in range(-2, 3):
|
|
2287
|
+
idx = cp + offset
|
|
2288
|
+
if 0 <= idx < n:
|
|
2289
|
+
keep_indices.add(idx)
|
|
2290
|
+
|
|
2291
|
+
# 4. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
|
|
2292
|
+
outlier_indices = _detect_structural_outliers(items)
|
|
2293
|
+
keep_indices.update(outlier_indices)
|
|
2294
|
+
|
|
2295
|
+
# 4b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
|
|
2296
|
+
# This is critical - errors must ALWAYS be preserved regardless of structure
|
|
2297
|
+
error_indices = _detect_error_items_for_preservation(items)
|
|
2298
|
+
keep_indices.update(error_indices)
|
|
2299
|
+
|
|
2300
|
+
# 5. Items matching query anchors (DETERMINISTIC exact match)
|
|
2301
|
+
# Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
|
|
2302
|
+
if query_context:
|
|
2303
|
+
anchors = extract_query_anchors(query_context)
|
|
2304
|
+
for i, item in enumerate(items):
|
|
2305
|
+
if item_matches_anchors(item, anchors):
|
|
2306
|
+
keep_indices.add(i)
|
|
2307
|
+
|
|
2308
|
+
# 6. Items with high relevance to query context (PROBABILISTIC semantic match)
|
|
2309
|
+
if query_context:
|
|
2310
|
+
item_strs = [json.dumps(item, default=str) for item in items]
|
|
2311
|
+
scores = self._scorer.score_batch(item_strs, query_context)
|
|
2312
|
+
for i, score in enumerate(scores):
|
|
2313
|
+
if score.score >= self._relevance_threshold:
|
|
2314
|
+
keep_indices.add(i)
|
|
2315
|
+
|
|
2316
|
+
# 6b. TOIN preserve_fields: boost items where query matches these fields
|
|
2317
|
+
# Note: preserve_fields are SHA256[:8] hashes, use helper to match
|
|
2318
|
+
if preserve_fields and query_context:
|
|
2319
|
+
for i, item in enumerate(items):
|
|
2320
|
+
if _item_has_preserve_field_match(item, preserve_fields, query_context):
|
|
2321
|
+
keep_indices.add(i)
|
|
2322
|
+
|
|
2323
|
+
# Limit to effective_max while ALWAYS preserving outliers and anomalies
|
|
2324
|
+
keep_indices = self._prioritize_indices(keep_indices, items, n, analysis, effective_max)
|
|
2325
|
+
|
|
2326
|
+
plan.keep_indices = sorted(keep_indices)
|
|
2327
|
+
return plan
|
|
2328
|
+
|
|
2329
|
+
def _plan_cluster_sample(
|
|
2330
|
+
self,
|
|
2331
|
+
analysis: ArrayAnalysis,
|
|
2332
|
+
items: list[dict],
|
|
2333
|
+
plan: CompressionPlan,
|
|
2334
|
+
query_context: str = "",
|
|
2335
|
+
preserve_fields: list[str] | None = None,
|
|
2336
|
+
max_items: int | None = None,
|
|
2337
|
+
) -> CompressionPlan:
|
|
2338
|
+
"""Plan compression for clusterable data (like logs).
|
|
2339
|
+
|
|
2340
|
+
Uses clustering plus STATISTICAL outlier detection.
|
|
2341
|
+
Uses RelevanceScorer for semantic matching of user queries.
|
|
2342
|
+
|
|
2343
|
+
Args:
|
|
2344
|
+
preserve_fields: TOIN-learned fields that users commonly retrieve.
|
|
2345
|
+
Items where query_context matches these field values get priority.
|
|
2346
|
+
max_items: Thread-safe max items limit (defaults to config value).
|
|
2347
|
+
"""
|
|
2348
|
+
# Use provided max_items or fall back to config
|
|
2349
|
+
effective_max = max_items if max_items is not None else self.config.max_items_after_crush
|
|
2350
|
+
n = len(items)
|
|
2351
|
+
keep_indices = set()
|
|
2352
|
+
|
|
2353
|
+
# 1. First 3 items
|
|
2354
|
+
for i in range(min(3, n)):
|
|
2355
|
+
keep_indices.add(i)
|
|
2356
|
+
|
|
2357
|
+
# 2. Last 2 items
|
|
2358
|
+
for i in range(max(0, n - 2), n):
|
|
2359
|
+
keep_indices.add(i)
|
|
2360
|
+
|
|
2361
|
+
# 3. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
|
|
2362
|
+
outlier_indices = _detect_structural_outliers(items)
|
|
2363
|
+
keep_indices.update(outlier_indices)
|
|
2364
|
+
|
|
2365
|
+
# 3b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
|
|
2366
|
+
# This is critical - errors must ALWAYS be preserved regardless of structure
|
|
2367
|
+
error_indices = _detect_error_items_for_preservation(items)
|
|
2368
|
+
keep_indices.update(error_indices)
|
|
2369
|
+
|
|
2370
|
+
# 4. Cluster by message-like field and keep representatives
|
|
2371
|
+
# Find a high-cardinality string field (likely message field)
|
|
2372
|
+
message_field = None
|
|
2373
|
+
max_uniqueness = 0.0
|
|
2374
|
+
for name, stats in analysis.field_stats.items():
|
|
2375
|
+
if stats.field_type == "string" and stats.unique_ratio > max_uniqueness:
|
|
2376
|
+
# Prefer fields with moderate to high uniqueness (message-like)
|
|
2377
|
+
if stats.unique_ratio > 0.3:
|
|
2378
|
+
message_field = name
|
|
2379
|
+
max_uniqueness = stats.unique_ratio
|
|
2380
|
+
|
|
2381
|
+
if message_field:
|
|
2382
|
+
plan.cluster_field = message_field
|
|
2383
|
+
|
|
2384
|
+
# Simple clustering: group by first 50 chars of message
|
|
2385
|
+
clusters: dict[str, list[int]] = {}
|
|
2386
|
+
for i, item in enumerate(items):
|
|
2387
|
+
msg = str(item.get(message_field, ""))[:50]
|
|
2388
|
+
msg_hash = hashlib.md5(msg.encode()).hexdigest()[:8]
|
|
2389
|
+
if msg_hash not in clusters:
|
|
2390
|
+
clusters[msg_hash] = []
|
|
2391
|
+
clusters[msg_hash].append(i)
|
|
2392
|
+
|
|
2393
|
+
# Keep 1-2 representatives from each cluster
|
|
2394
|
+
for indices in clusters.values():
|
|
2395
|
+
for idx in indices[:2]:
|
|
2396
|
+
keep_indices.add(idx)
|
|
2397
|
+
|
|
2398
|
+
# 5. Items matching query anchors (DETERMINISTIC exact match)
|
|
2399
|
+
# Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
|
|
2400
|
+
if query_context:
|
|
2401
|
+
anchors = extract_query_anchors(query_context)
|
|
2402
|
+
for i, item in enumerate(items):
|
|
2403
|
+
if item_matches_anchors(item, anchors):
|
|
2404
|
+
keep_indices.add(i)
|
|
2405
|
+
|
|
2406
|
+
# 6. Items with high relevance to query context (PROBABILISTIC semantic match)
|
|
2407
|
+
if query_context:
|
|
2408
|
+
item_strs = [json.dumps(item, default=str) for item in items]
|
|
2409
|
+
scores = self._scorer.score_batch(item_strs, query_context)
|
|
2410
|
+
for i, score in enumerate(scores):
|
|
2411
|
+
if score.score >= self._relevance_threshold:
|
|
2412
|
+
keep_indices.add(i)
|
|
2413
|
+
|
|
2414
|
+
# 6b. TOIN preserve_fields: boost items where query matches these fields
|
|
2415
|
+
# Note: preserve_fields are SHA256[:8] hashes, use helper to match
|
|
2416
|
+
if preserve_fields and query_context:
|
|
2417
|
+
for i, item in enumerate(items):
|
|
2418
|
+
if _item_has_preserve_field_match(item, preserve_fields, query_context):
|
|
2419
|
+
keep_indices.add(i)
|
|
2420
|
+
|
|
2421
|
+
# Limit total while ALWAYS preserving outliers and anomalies
|
|
2422
|
+
keep_indices = self._prioritize_indices(keep_indices, items, n, analysis, effective_max)
|
|
2423
|
+
|
|
2424
|
+
plan.keep_indices = sorted(keep_indices)
|
|
2425
|
+
return plan
|
|
2426
|
+
|
|
2427
|
+
+    def _plan_top_n(
+        self,
+        analysis: ArrayAnalysis,
+        items: list[dict],
+        plan: CompressionPlan,
+        query_context: str = "",
+        preserve_fields: list[str] | None = None,
+        max_items: int | None = None,
+    ) -> CompressionPlan:
+        """Plan compression for scored/ranked data.
+
+        For data with a score/relevance field, that field IS the primary relevance
+        signal. Our internal relevance scoring is SECONDARY - it's used to find
+        potential "needle" items that the original scoring might have missed.
+
+        Strategy:
+        1. Keep top N by score (the original system's relevance ranking)
+        2. Add structural outliers (errors, anomalies)
+        3. Add high-confidence relevance matches (needles the user is looking for)
+
+        Args:
+            preserve_fields: TOIN-learned fields that users commonly retrieve.
+                Items where query_context matches these field values get priority.
+            max_items: Thread-safe max items limit (defaults to config value).
+        """
+        # Use provided max_items or fall back to config
+        effective_max = max_items if max_items is not None else self.config.max_items_after_crush
+
+        # Find score field using STATISTICAL detection (no hardcoded field names)
+        score_field = None
+        max_confidence = 0.0
+        for name, stats in analysis.field_stats.items():
+            is_score, confidence = _detect_score_field_statistically(stats, items)
+            if is_score and confidence > max_confidence:
+                score_field = name
+                max_confidence = confidence
+
+        if not score_field:
+            return self._plan_smart_sample(
+                analysis, items, plan, query_context, preserve_fields, effective_max
+            )
+
+        plan.sort_field = score_field
+        keep_indices = set()
+
+        # 1. TOP N by score FIRST (the primary relevance signal)
+        # The original system's score field is the authoritative ranking
+        scored_items = [(i, item.get(score_field, 0)) for i, item in enumerate(items)]
+        scored_items.sort(key=lambda x: x[1], reverse=True)
+
+        # Reserve slots for outliers
+        top_count = max(0, effective_max - 3)
+        for idx, _ in scored_items[:top_count]:
+            keep_indices.add(idx)
+
+        # 2. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
+        outlier_indices = _detect_structural_outliers(items)
+        keep_indices.update(outlier_indices)
+
+        # 2b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
+        # This is critical - errors must ALWAYS be preserved regardless of structure
+        error_indices = _detect_error_items_for_preservation(items)
+        keep_indices.update(error_indices)
+
+        # 3. Items matching query anchors (DETERMINISTIC exact match) - ADDITIVE
+        # Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
+        # These are ALWAYS preserved since they represent explicit user intent
+        if query_context:
+            anchors = extract_query_anchors(query_context)
+            for i, item in enumerate(items):
+                if i not in keep_indices and item_matches_anchors(item, anchors):
+                    keep_indices.add(i)
+
+        # 4. HIGH-CONFIDENCE relevance matches (potential needles) - ADDITIVE only
+        # Only add items that are NOT already in top N but match the query strongly
+        # Use a higher threshold (0.5) since the score field already captures relevance
+        if query_context:
+            item_strs = [json.dumps(item, default=str) for item in items]
+            scores = self._scorer.score_batch(item_strs, query_context)
+            # Higher threshold and limit count to avoid adding everything
+            high_threshold = max(0.5, self._relevance_threshold * 2)
+            added_count = 0
+            max_relevance_adds = 3  # Limit additional relevance matches
+            for i, score in enumerate(scores):
+                if i not in keep_indices and score.score >= high_threshold:
+                    keep_indices.add(i)
+                    added_count += 1
+                    if added_count >= max_relevance_adds:
+                        break
+
+        # 4b. TOIN preserve_fields: boost items where query matches these fields
+        # Note: preserve_fields are SHA256[:8] hashes, use helper to match
+        if preserve_fields and query_context:
+            for i, item in enumerate(items):
+                if i not in keep_indices:  # Only add if not already kept
+                    if _item_has_preserve_field_match(item, preserve_fields, query_context):
+                        keep_indices.add(i)
+
+        plan.keep_count = len(keep_indices)
+        plan.keep_indices = sorted(keep_indices)
+        return plan
+
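
To see how the slot reservation in step 1 interacts with the additive outlier/error sets, here is a small self-contained walkthrough in plain Python (invented data and a hard-coded stand-in for the error detector; not the package API):

```python
# With max_items=5, three slots are reserved, so only the top 2 items by score
# are taken before outlier/error indices are unioned in additively.
items = [
    {"id": 1, "score": 0.91, "status": "ok"},
    {"id": 2, "score": 0.87, "status": "ok"},
    {"id": 3, "score": 0.42, "status": "error"},  # stand-in for an error item
    {"id": 4, "score": 0.40, "status": "ok"},
    {"id": 5, "score": 0.12, "status": "ok"},
]
effective_max = 5
by_score = sorted(range(len(items)), key=lambda i: items[i]["score"], reverse=True)
keep = set(by_score[: max(0, effective_max - 3)])  # top N minus reserved slots -> {0, 1}
keep |= {2}  # pretend _detect_error_items_for_preservation flagged index 2
print(sorted(keep))  # [0, 1, 2]
```
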
+    def _plan_smart_sample(
+        self,
+        analysis: ArrayAnalysis,
+        items: list[dict],
+        plan: CompressionPlan,
+        query_context: str = "",
+        preserve_fields: list[str] | None = None,
+        max_items: int | None = None,
+    ) -> CompressionPlan:
+        """Plan smart statistical sampling using STATISTICAL detection.
+
+        Always keeps:
+        - First K items (default 3)
+        - Last K items (default 2)
+        - Structural outliers (items with rare fields or rare status values)
+        - Anomalous numeric items (> 2 std from mean)
+        - Items around change points
+        - Items with high relevance to query context (via RelevanceScorer)
+
+        Uses STATISTICAL detection instead of hardcoded keywords.
+
+        Args:
+            preserve_fields: TOIN-learned fields that users commonly retrieve.
+                Items where query_context matches these field values get priority.
+            max_items: Thread-safe max items limit (defaults to config value).
+        """
+        # Use provided max_items or fall back to config
+        effective_max = max_items if max_items is not None else self.config.max_items_after_crush
+
+        n = len(items)
+        keep_indices = set()
+
+        # 1. First K items (default 3)
+        for i in range(min(3, n)):
+            keep_indices.add(i)
+
+        # 2. Last K items (default 2)
+        for i in range(max(0, n - 2), n):
+            keep_indices.add(i)
+
+        # 3. Structural outlier items (STATISTICAL detection - no hardcoded keywords)
+        outlier_indices = _detect_structural_outliers(items)
+        keep_indices.update(outlier_indices)
+
+        # 3b. Error items via KEYWORD detection (PRESERVATION GUARANTEE)
+        # This is critical - errors must ALWAYS be preserved regardless of structure
+        error_indices = _detect_error_items_for_preservation(items)
+        keep_indices.update(error_indices)
+
+        # 4. Anomalous numeric items (> 2 std from mean)
+        for name, stats in analysis.field_stats.items():
+            if stats.field_type == "numeric" and stats.mean_val is not None and stats.variance:
+                std = stats.variance**0.5
+                if std > 0:
+                    threshold = self.config.variance_threshold * std
+                    for i, item in enumerate(items):
+                        val = item.get(name)
+                        if isinstance(val, (int, float)):
+                            if abs(val - stats.mean_val) > threshold:
+                                keep_indices.add(i)
+
+        # 5. Items around change points (if detected)
+        if self.config.preserve_change_points:
+            for stats in analysis.field_stats.values():
+                if stats.change_points:
+                    for cp in stats.change_points:
+                        # Keep items around change point
+                        for offset in range(-1, 2):
+                            idx = cp + offset
+                            if 0 <= idx < n:
+                                keep_indices.add(idx)
+
+        # 6. Items matching query anchors (DETERMINISTIC exact match)
+        # Anchors provide reliable preservation for specific entity lookups (UUIDs, IDs, names)
+        if query_context:
+            anchors = extract_query_anchors(query_context)
+            for i, item in enumerate(items):
+                if item_matches_anchors(item, anchors):
+                    keep_indices.add(i)
+
+        # 7. Items with high relevance to query context (PROBABILISTIC semantic match)
+        if query_context:
+            item_strs = [json.dumps(item, default=str) for item in items]
+            scores = self._scorer.score_batch(item_strs, query_context)
+            for i, score in enumerate(scores):
+                if score.score >= self._relevance_threshold:
+                    keep_indices.add(i)
+
+        # 7b. TOIN preserve_fields: boost items where query matches these fields
+        # Note: preserve_fields are SHA256[:8] hashes, use helper to match
+        if preserve_fields and query_context:
+            for i, item in enumerate(items):
+                if _item_has_preserve_field_match(item, preserve_fields, query_context):
+                    keep_indices.add(i)
+
+        # Limit to effective_max while ALWAYS preserving outliers and anomalies
+        keep_indices = self._prioritize_indices(keep_indices, items, n, analysis, effective_max)
+
+        plan.keep_indices = sorted(keep_indices)
+        return plan
+
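
The anomaly check in step 4 is a plain deviation-from-mean test. A minimal illustration using only the standard library (the package's ArrayAnalysis/field_stats objects are replaced by direct computation, and the threshold is assumed to be the 2-standard-deviation default the docstring mentions):

```python
import statistics

values = [101, 99, 102, 98, 100, 100, 250]  # one obvious spike
mean = statistics.mean(values)
std = statistics.pstdev(values)  # population std, matching variance**0.5 above
anomalous = [i for i, v in enumerate(values) if abs(v - mean) > 2 * std]
print(anomalous)  # [6] -> the 250 entry would be kept
```
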
+    def _execute_plan(
+        self, plan: CompressionPlan, items: list[dict], analysis: ArrayAnalysis
+    ) -> list:
+        """Execute a compression plan and return crushed array.
+
+        SCHEMA-PRESERVING: Returns only items from the original array.
+        No wrappers, no generated text, no metadata keys.
+        """
+        result = []
+
+        # Return only the kept items, preserving original schema
+        for idx in sorted(plan.keep_indices):
+            if 0 <= idx < len(items):
+                # Copy item unchanged - no modifications to schema
+                result.append(items[idx].copy())
+
+        return result
+
+
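
The schema-preserving contract is easy to check on invented data: the crushed output is just a copied subset of the input items, with no wrapper objects or extra keys.

```python
items = [{"id": 1, "msg": "a"}, {"id": 2, "msg": "b"}, {"id": 3, "msg": "c"}]
keep_indices = [0, 2]  # what a hypothetical plan might select
crushed = [items[i].copy() for i in keep_indices if 0 <= i < len(items)]
assert all(set(item) == {"id", "msg"} for item in crushed)  # same keys as the input
print(crushed)  # [{'id': 1, 'msg': 'a'}, {'id': 3, 'msg': 'c'}]
```
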
+def smart_crush_tool_output(
+    content: str,
+    config: SmartCrusherConfig | None = None,
+    ccr_config: CCRConfig | None = None,
+) -> tuple[str, bool, str]:
+    """
+    Convenience function to smart-crush a single tool output.
+
+    NOTE: CCR markers are DISABLED by default in this convenience function
+    to maintain backward compatibility (output remains valid JSON).
+    To enable CCR markers, pass a CCRConfig with inject_retrieval_marker=True.
+
+    Args:
+        content: The tool output content (JSON string).
+        config: Optional SmartCrusher configuration.
+        ccr_config: Optional CCR (Compress-Cache-Retrieve) configuration.
+            By default, CCR is enabled (caching) but markers are disabled.
+
+    Returns:
+        Tuple of (crushed_content, was_modified, analysis_info).
+    """
+    cfg = config or SmartCrusherConfig()
+
+    # Default: CCR enabled for caching, but markers disabled for clean JSON output
+    if ccr_config is None:
+        ccr_cfg = CCRConfig(
+            enabled=True,  # Still cache for retrieval
+            inject_retrieval_marker=False,  # Don't break JSON output
+        )
+    else:
+        ccr_cfg = ccr_config
+
+    crusher = SmartCrusher(cfg, ccr_config=ccr_cfg)
+    return crusher._smart_crush_content(content)
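
A hypothetical usage sketch for the convenience function. The import path is an assumption (the defining module's name is not visible in this hunk, so adjust it to wherever the package actually exposes `smart_crush_tool_output`), and the payload is invented:

```python
import json

from headroom import smart_crush_tool_output  # assumed re-export; adjust path as needed

payload = json.dumps(
    [{"id": i, "score": round(1.0 - i * 0.001, 3), "status": "ok"} for i in range(500)]
)

crushed, was_modified, info = smart_crush_tool_output(payload)
if was_modified:
    print(info)  # human-readable analysis summary
    kept = json.loads(crushed)  # default config keeps the output as valid JSON
    print(f"{len(kept)} of 500 items kept")
```

Passing a `CCRConfig` with `inject_retrieval_marker=True` as `ccr_config` would switch on the retrieval marker described in the docstring instead of the plain-JSON default.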