contextops 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextops/__init__.py +3 -0
- contextops/analyzers/__init__.py +1 -0
- contextops/analyzers/density.py +146 -0
- contextops/analyzers/redundancy.py +362 -0
- contextops/analyzers/structure.py +123 -0
- contextops/analyzers/tokens.py +76 -0
- contextops/api/__init__.py +1 -0
- contextops/api/diff.py +124 -0
- contextops/api/inspect.py +52 -0
- contextops/api/stability.py +264 -0
- contextops/cli/__init__.py +1 -0
- contextops/cli/main.py +320 -0
- contextops/cli/renderer.py +424 -0
- contextops/core/__init__.py +1 -0
- contextops/core/config.py +61 -0
- contextops/core/engine.py +355 -0
- contextops/core/models.py +245 -0
- contextops/core/normalizer.py +187 -0
- contextops-0.1.0.dist-info/METADATA +272 -0
- contextops-0.1.0.dist-info/RECORD +24 -0
- contextops-0.1.0.dist-info/WHEEL +5 -0
- contextops-0.1.0.dist-info/entry_points.txt +2 -0
- contextops-0.1.0.dist-info/licenses/LICENSE +21 -0
- contextops-0.1.0.dist-info/top_level.txt +1 -0
contextops/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Analyzers subpackage
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Density Analyzer — Phase 2.5 (Metric Orthogonalization).
|
|
3
|
+
|
|
4
|
+
Computes the structural Density Signal from raw context text.
|
|
5
|
+
|
|
6
|
+
Signal contract:
|
|
7
|
+
- Reads ONLY from raw ContextBundle item content strings.
|
|
8
|
+
- Does NOT read wasted_tokens, redundancy findings, or any other analyzer output.
|
|
9
|
+
- Is the sole authoritative input for density_penalty in the scoring engine.
|
|
10
|
+
|
|
11
|
+
Three orthogonal character buckets (exhaustive, non-overlapping):
|
|
12
|
+
payload_chars = alphanumeric (actual information)
|
|
13
|
+
syntax_chars = non-alphanum, non-whitespace (brackets, punctuation, markup)
|
|
14
|
+
whitespace_chars = whitespace (layout/formatting overhead)
|
|
15
|
+
total_chars = payload + syntax + whitespace (expressed as fractions of total_chars, the three buckets always sum to 1.0)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import math
|
|
21
|
+
import re
|
|
22
|
+
from collections import Counter
|
|
23
|
+
from contextops.core.models import ContextBundle, DensitySignal
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def normalize_density_input(text: str) -> list[str]:
    """
    Tokenize text for density metrics with one fixed preprocessing rule.

    The result is the ordered list of maximal runs of ASCII lowercase letters
    and digits found in the lowercased text.  Every other character
    (punctuation, underscores, hyphens, non-ASCII letters, whitespace) acts
    as a separator, so snake_case and kebab-case both split into their
    component words.

    Rules (frozen — do not change without updating all callers):
      1. Lowercase
      2. Treat every character outside a-z / 0-9 as a separator
      3. Discard empty tokens
    """
    lowered = text.lower()
    # Collecting the runs of [a-z0-9] directly yields the same tokens as
    # blanking out every other character and then splitting on whitespace.
    return re.findall(r'[a-z0-9]+', lowered)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _calc_format_overhead(text: str) -> float:
    """
    Format Overhead (FO): share of characters that are syntax.

    A character counts as syntax when it is neither alphanumeric
    (``str.isalnum``) nor whitespace (``str.isspace``) — brackets,
    punctuation, markup, operators and the like.  Whitespace is excluded
    here; it is measured separately as whitespace waste.

    Returns:
        syntax_chars / total_chars clamped to [0.0, 1.0]; 0.0 for empty text.
    """
    length = len(text)
    if length == 0:
        return 0.0

    syntax_total = 0
    for ch in text:
        if ch.isalnum() or ch.isspace():
            continue
        syntax_total += 1

    ratio = syntax_total / length
    return min(1.0, max(0.0, ratio))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _calc_whitespace_waste(text: str) -> float:
    """
    Whitespace Waste (WL): share of characters that are whitespace.

    A character counts as whitespace when ``str.isspace`` is true for it
    (space, tab, newline, carriage return, ...).  Syntax characters are not
    included; they are measured separately as format overhead.

    Returns:
        whitespace_chars / total_chars clamped to [0.0, 1.0]; 0.0 for empty text.
    """
    length = len(text)
    if length == 0:
        return 0.0

    blanks = len([ch for ch in text if ch.isspace()])
    ratio = blanks / length
    return min(1.0, max(0.0, ratio))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _calc_entropy_compression(text: str) -> float:
    """
    Entropy Compression (EC): statistical measure of repetitive boilerplate.

    EC = 1 - normalized_shannon_entropy, computed over the word tokens
    produced by ``normalize_density_input``.

    High EC → low entropy → highly repetitive token distribution.
    Low EC  → high entropy → diverse vocabulary (good).

    Normalization: entropy / log2(unique_words), so the range is 0.0–1.0.

    Edge cases:
      - no tokens                      → 0.0
      - exactly one token, seen once   → 0.0 (nothing is repeated; this keeps
        the result consistent with any text whose tokens are all distinct,
        which also scores 0.0)
      - one distinct token, repeated   → 1.0 (maximum compression)

    Returns:
        A float in [0.0, 1.0].
    """
    words = normalize_density_input(text)
    total_words = len(words)
    if total_words <= 1:
        # Zero or one token cannot be repetitive.
        return 0.0

    word_counts = Counter(words)
    unique_words = len(word_counts)

    if unique_words == 1:
        return 1.0  # single word repeated — maximum compression

    # Shannon entropy of the empirical word distribution.
    entropy = 0.0
    for count in word_counts.values():
        p = count / total_words
        entropy -= p * math.log2(p)

    # unique_words >= 2 here, so log2(unique_words) > 0 and the division is safe.
    max_entropy = math.log2(unique_words)
    normalized_entropy = entropy / max_entropy

    # Clamp to absorb floating-point drift just outside [0, 1].
    return max(0.0, min(1.0, 1.0 - normalized_entropy))
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def compute_density_signal(bundle: ContextBundle) -> DensitySignal:
    """
    Build the structural Density Signal for a bundle from its raw content.

    Signal contract: only the ``content`` string of each bundle item is read.
    Token counts, wasted-token figures and other analyzer outputs are not
    consulted.

    The item contents are joined with newlines and three component metrics
    are computed on the joined text: format overhead (FO), whitespace waste
    (WL) and entropy compression (EC).  They are combined as

        total = 0.4 * FO + 0.2 * WL + 0.4 * EC

    A bundle with no items, or whose joined content is blank, yields an
    all-zero signal.  Every reported value is rounded to 3 decimals.
    """
    if not bundle.items:
        return DensitySignal(0.0, 0.0, 0.0, 0.0)

    combined = "\n".join(entry.content for entry in bundle.items)
    if not combined.strip():
        return DensitySignal(0.0, 0.0, 0.0, 0.0)

    overhead = _calc_format_overhead(combined)
    whitespace = _calc_whitespace_waste(combined)
    compression = _calc_entropy_compression(combined)

    # Weights: w_fo=0.4, w_wl=0.2, w_ec=0.4
    combined_signal = (0.4 * overhead) + (0.2 * whitespace) + (0.4 * compression)

    components = {
        "format_overhead": overhead,
        "whitespace_waste": whitespace,
        "entropy_compression": compression,
        "total_density_signal": combined_signal,
    }
    return DensitySignal(**{name: round(value, 3) for name, value in components.items()})
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redundancy Analyzer.
|
|
3
|
+
|
|
4
|
+
Detects near-duplicate, overlapping, and boilerplate context items.
|
|
5
|
+
Uses a deterministic hybrid heuristic approach:
|
|
6
|
+
1. Exact match detection (fast path)
|
|
7
|
+
2. Jaccard similarity on word sets
|
|
8
|
+
|
|
9
|
+
Critical design rule: NEVER blindly flag overlap as waste.
|
|
10
|
+
Adjacent chunks from the same source get EXPECTED_OVERLAP.
|
|
11
|
+
Only independent sources with high similarity get REDUNDANT_CONTEXT.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
import string
|
|
18
|
+
|
|
19
|
+
from contextops.core.models import (
|
|
20
|
+
ContextBundle,
|
|
21
|
+
ContextItem,
|
|
22
|
+
RedundancyClassification,
|
|
23
|
+
RedundancyFinding,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ── Thresholds (fixed, deterministic, CI-safe) ──────────────────────────
|
|
28
|
+
|
|
29
|
+
# Similarity score assigned to a pair whose stripped contents are identical.
EXACT_MATCH_THRESHOLD: float = 1.0
# NOTE(review): not referenced by any code visible in this section — confirm it is used elsewhere.
HIGH_SIMILARITY_THRESHOLD: float = 0.75
# Pairs whose similarity falls below this value are not reported as findings.
MODERATE_SIMILARITY_THRESHOLD: float = 0.45

# Words that indicate boilerplate when they dominate the content
# NOTE(review): membership is tested per single whitespace-split token, so the
# two-word entry "do not" can never match — confirm whether that is intended.
BOILERPLATE_SIGNALS: set[str] = {
    "please", "always", "must", "never", "ensure", "remember",
    "important", "note", "follow", "instructions", "guidelines",
    "rules", "format", "respond", "output", "do not",
}

# Lightweight synonym mapping for intent duplication
# Each key is rewritten to its value during tokenization, so wording variants
# compare as the same token.
SYNONYM_MAP: dict[str, str] = {
    "concise": "short",
    "brief": "short",
    "minimal": "short",
    "quickly": "fast",
    "rapidly": "fast",
    "accurate": "correct",
    "exact": "correct",
    "precise": "correct",
}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _get_source_base(s: str) -> str:
    """Return a canonical base name for a source identifier.

    A trailing run of digits — optionally preceded by one ``_`` or ``-`` and
    optionally followed by a dot-extension — is removed.  If that would leave
    nothing, the input is returned unchanged.

    Examples:
        'doc_1'     -> 'doc'
        'page1.md'  -> 'page'
        'chunk-3'   -> 'chunk'
        'readme.md' -> 'readme.md'  (no numeric suffix, nothing stripped)
        '2024'      -> '2024'       (stripping would leave an empty name)
    """
    suffix = re.search(r'[_\-]?\d+(?:\.[a-zA-Z0-9]+)?$', s)
    if suffix is None:
        return s

    # Splice the match out instead of truncating: ``$`` may match just before
    # a final newline, and that newline must be kept.
    stripped = s[:suffix.start()] + s[suffix.end():]
    if stripped:
        return stripped
    return s
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _get_ordered_tokens(text: str) -> list[str]:
    """Return the text's words in order, lowercased and synonym-normalized.

    ASCII punctuation (``string.punctuation``) is deleted rather than replaced
    by a space, the remainder is split on whitespace, and every word that has
    an entry in ``SYNONYM_MAP`` is replaced by its mapped value.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(drop_punctuation)

    canonical: list[str] = []
    for word in cleaned.split():
        canonical.append(SYNONYM_MAP.get(word, word))
    return canonical
|
|
71
|
+
|
|
72
|
+
def _tokenize_words(text: str) -> set[str]:
    """Return the distinct tokens of the text (lowercased, punctuation removed, synonyms mapped)."""
    ordered = _get_ordered_tokens(text)
    return set(ordered)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _jaccard_similarity(set_a: set[str], set_b: set[str]) -> float:
    """
    Jaccard similarity of two word sets: |A ∩ B| / |A ∪ B|.

    Two empty sets have an empty union; 0.0 is returned in that case instead
    of dividing by zero.  Deterministic, no randomness, CI-safe.
    """
    combined_size = len(set_a | set_b)
    if combined_size == 0:
        return 0.0
    shared_size = len(set_a & set_b)
    return shared_size / combined_size
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _is_adjacent_source(item_a: ContextItem, item_b: ContextItem) -> bool:
    """
    Check if two items are from adjacent positions in the same source.

    Adjacent chunks from the same document are expected to overlap.
    This prevents false positives in RAG pipelines with sliding windows.

    Decision order:
      1. If either item has no source, return False.
      2. If both items carry a position in metadata ("index", falling back to
         "chunk_index" when "index" is absent) and both convert to ``int``,
         return whether the positions differ by at most 1.
      3. Otherwise return whether the two sources are different strings that
         share the same canonical base name (e.g. "chunk_3" and "chunk_4").

    Returns:
        True when the pair should be treated as expected overlap.
    """
    if not item_a.source or not item_b.source:
        return False

    def _position(item: ContextItem):
        # Only a missing (None) "index" falls back to "chunk_index".  A plain
        # truthiness test would discard the perfectly valid position 0 and so
        # never recognise chunk 0 as adjacent to chunk 1.
        position = item.metadata.get("index")
        if position is None:
            position = item.metadata.get("chunk_index")
        return position

    idx_a = _position(item_a)
    idx_b = _position(item_b)

    if idx_a is not None and idx_b is not None:
        # NOTE(review): this branch does not compare the two sources, so items
        # from unrelated sources with neighbouring indices also count as
        # adjacent — confirm that is intended.
        try:
            return abs(int(idx_a) - int(idx_b)) <= 1
        except (ValueError, TypeError):
            # Non-numeric positions: fall through to the name-based check.
            pass

    # Same source base (e.g., "chunk_3" and "chunk_4" from the same doc).
    source_a = item_a.source
    source_b = item_b.source
    return _get_source_base(source_a) == _get_source_base(source_b) and source_a != source_b
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _is_boilerplate(item: ContextItem) -> bool:
    """
    Decide whether an item's content is mostly boilerplate instructions.

    The content is tokenized into an ordered list (not a set), so repeated
    words count every time they occur.  Items with fewer than 5 tokens are
    never boilerplate.  Otherwise the item is boilerplate when more than 25%
    of its tokens are members of ``BOILERPLATE_SIGNALS``.

    e.g. "please please please do this" → 3/5 = 60% signal density → True.
    """
    tokens = _get_ordered_tokens(item.content)
    token_total = len(tokens)
    if token_total < 5:
        return False

    hits = len([tok for tok in tokens if tok in BOILERPLATE_SIGNALS])
    return hits / token_total > 0.25
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _classify(
    item_a: ContextItem,
    item_b: ContextItem,
    similarity: float,
) -> RedundancyClassification:
    """
    Pick the redundancy category for a pair of items.

    Rules, evaluated in this order:
      1. The items are adjacent chunks of one source → EXPECTED_OVERLAP
      2. Both items are boilerplate                   → BOILERPLATE
      3. Anything else                                → REDUNDANT_CONTEXT

    ``similarity`` is accepted for interface compatibility; the decision
    itself does not read it.
    """
    if _is_adjacent_source(item_a, item_b):
        verdict = RedundancyClassification.EXPECTED_OVERLAP
    elif _is_boilerplate(item_a) and _is_boilerplate(item_b):
        verdict = RedundancyClassification.BOILERPLATE
    else:
        verdict = RedundancyClassification.REDUNDANT_CONTEXT
    return verdict
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def analyze_redundancy(bundle: ContextBundle) -> tuple[list[RedundancyFinding], int]:
    """
    Detect redundant pairs in a ContextBundle and calculate global final waste.

    Uses a hybrid approach:
    1. Global Multi-Scale N-grams (8, 12, 16) for authoritative penalty math.
       Repeated n-grams that count as waste increment a per-token score; the
       per-scale totals are combined with weights 0.4 / 0.35 / 0.25 and the
       final figure is the integer square root-style compression
       ``int(sqrt(weighted_sum))``.
    2. Chunk-to-chunk exact/Jaccard similarity for human-readable findings.
       Pairs below MODERATE_SIMILARITY_THRESHOLD are not reported.

    The two parts are independent: the findings list does not feed the
    wasted-token figure, and vice versa.

    Returns:
        tuple of (list of RedundancyFinding, final_wasted_tokens integer).
        Findings are sorted by estimated waste (desc), similarity (desc),
        then item ids, so the order is deterministic.
    """
    findings: list[RedundancyFinding] = []
    items = bundle.items

    # --- 1. Raw Signal Extraction (Multi-Scale N-grams) ---
    SCALES = [8, 12, 16]

    # Each entry is (item, its tokens, offset of its first token in the
    # bundle-wide token numbering), so n-gram positions can be mapped onto a
    # single flat score array.
    item_tokens = []
    total_token_count = 0
    for item in items:
        tokens = _get_ordered_tokens(item.content)
        item_tokens.append((item, tokens, total_token_count))
        total_token_count += len(tokens)

    # Fast-path: if content is very large and all items are unique, skip N-gram scan
    MAX_NGRAM_TOKENS = 10000  # Only run N-gram scan below this token count

    skip_ngram = False
    if total_token_count > MAX_NGRAM_TOKENS:
        # Check if there are any content hash collisions (potential duplicates)
        from hashlib import md5 as _md5
        content_hashes = set()
        has_collision = False
        for item in items:
            h = _md5(item.content.strip().lower().encode()).hexdigest()
            if h in content_hashes:
                has_collision = True
                break
            content_hashes.add(h)

        if not has_collision:
            # No two items are identical after strip + lowercase, so the scan is skipped.
            # NOTE(review): this only rules out whole-item duplicates; items that
            # partially overlap would still share n-grams, so large bundles can
            # report 0 wasted tokens here — confirm this trade-off is intended.
            skip_ngram = True

    # Maps n-gram size -> summed redundancy score; stays empty when the scan is skipped.
    scale_waste_counts = {}

    if not skip_ngram:
        for N in SCALES:
            # One counter per token position in the bundle-wide numbering.
            token_redundancy_score = [0] * total_token_count
            # n-gram tuple -> list of (global start position, owning item).
            seen_ngrams = {}

            for item, tokens, offset in item_tokens:
                if len(tokens) < N:
                    continue
                for i in range(len(tokens) - N + 1):
                    ngram = tuple(tokens[i:i+N])
                    seen_ngrams.setdefault(ngram, []).append((offset + i, item))

            for ngram, occurrences in seen_ngrams.items():
                if len(occurrences) > 1:
                    occ_items = [occ[1] for occ in occurrences]

                    # Respect Boilerplate and Expected Overlap
                    # NOTE(review): _is_boilerplate re-tokenizes the item on every
                    # call and is invoked once per repeated n-gram; caching the
                    # result per item would avoid the repeated work.
                    if all(_is_boilerplate(it) for it in occ_items):
                        continue

                    # Waste as soon as any two occurrences come from different,
                    # non-adjacent items.
                    is_waste = False
                    for i in range(len(occ_items)):
                        for j in range(i+1, len(occ_items)):
                            if occ_items[i].id != occ_items[j].id and not _is_adjacent_source(occ_items[i], occ_items[j]):
                                is_waste = True
                                break
                        if is_waste: break

                    # Self-duplication is waste unless boilerplate
                    if not is_waste and all(occ_items[0].id == it.id for it in occ_items):
                        is_waste = True

                    if is_waste:
                        # Every occurrence (including the first) marks its N
                        # token positions; overlapping n-grams add up on the
                        # same position.
                        for start_idx, _ in occurrences:
                            for j in range(start_idx, start_idx + N):
                                token_redundancy_score[j] += 1

            scale_waste_counts[N] = sum(token_redundancy_score)

    # --- 2. Structural Aggregation (Weighted Summation & Compression) ---
    weighted_sum = (
        0.4 * scale_waste_counts.get(8, 0) +
        0.35 * scale_waste_counts.get(12, 0) +
        0.25 * scale_waste_counts.get(16, 0)
    )
    import math
    # The square root compresses the weighted score; int() truncates toward zero.
    final_wasted_tokens = int(math.sqrt(weighted_sum)) if weighted_sum > 0 else 0

    # --- 3. Generate Human-Readable Findings ---
    # For large bundles, limit pairwise comparisons using content-hash bucketing.
    # Exact duplicates are grouped first (O(n)), then cross-group Jaccard is
    # limited to a sample to keep total work bounded.

    MAX_PAIRWISE_ITEMS = 50  # Only do full O(n²) when n ≤ 50

    if len(items) <= MAX_PAIRWISE_ITEMS:
        pairs_to_check = [(i, j) for i in range(len(items)) for j in range(i + 1, len(items))]
    else:
        # Hash-bucket fast path: group items by stripped content hash
        from hashlib import md5
        buckets: dict[str, list[int]] = {}
        for idx, item in enumerate(items):
            h = md5(item.content.strip().lower().encode()).hexdigest()[:16]
            buckets.setdefault(h, []).append(idx)

        pairs_to_check = []
        # Always compare within same hash bucket (exact/near duplicates)
        for indices in buckets.values():
            for a_pos in range(len(indices)):
                for b_pos in range(a_pos + 1, len(indices)):
                    pairs_to_check.append((indices[a_pos], indices[b_pos]))

        # Add a bounded sample of cross-bucket pairs for fuzzy detection
        # NOTE(review): all_indices is assigned but never read in this function.
        all_indices = list(range(len(items)))
        bucket_keys = list(buckets.keys())
        cross_pairs_added = 0
        max_cross_pairs = MAX_PAIRWISE_ITEMS * 10  # cap at ~500
        # Bucket pairs are visited in insertion order until the cap is hit, so
        # the sample favours the earliest buckets.
        for bi in range(len(bucket_keys)):
            if cross_pairs_added >= max_cross_pairs:
                break
            for bj in range(bi + 1, len(bucket_keys)):
                if cross_pairs_added >= max_cross_pairs:
                    break
                # Compare first item from each bucket
                pairs_to_check.append((buckets[bucket_keys[bi]][0], buckets[bucket_keys[bj]][0]))
                cross_pairs_added += 1

    for i_idx, j_idx in pairs_to_check:
        item_a = items[i_idx]
        item_b = items[j_idx]

        # Skip empty content
        if not item_a.content.strip() or not item_b.content.strip():
            continue

        # Fast path: exact match
        # (case-sensitive here, unlike the lowercased hashes used for bucketing)
        if item_a.content.strip() == item_b.content.strip():
            similarity = EXACT_MATCH_THRESHOLD
        else:
            words_a = _tokenize_words(item_a.content)
            words_b = _tokenize_words(item_b.content)
            similarity = _jaccard_similarity(words_a, words_b)

        # Only report if above moderate threshold
        if similarity < MODERATE_SIMILARITY_THRESHOLD:
            continue

        classification = _classify(item_a, item_b, similarity)

        # Estimate waste: the smaller item's tokens are "wasted" if redundant
        waste = min(item_a.token_count, item_b.token_count)
        if classification == RedundancyClassification.EXPECTED_OVERLAP:
            # Expected overlap is not full waste — discount by 80%
            waste = int(waste * 0.2)

        # Build human-readable detail
        detail = _build_detail(item_a, item_b, similarity, classification)

        findings.append(RedundancyFinding(
            item_a_id=item_a.id,
            item_b_id=item_b.id,
            similarity_score=similarity,
            classification=classification,
            estimated_waste_tokens=waste,
            detail=detail,
        ))

    # Sort strictly for CI determinism: waste desc, similarity desc, id_a, id_b
    findings.sort(
        key=lambda f: (-f.estimated_waste_tokens, -f.similarity_score, f.item_a_id, f.item_b_id)
    )
    return findings, final_wasted_tokens
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _build_detail(
    item_a: ContextItem,
    item_b: ContextItem,
    similarity: float,
    classification: RedundancyClassification,
) -> str:
    """Compose the one-line, human-readable explanation attached to a finding.

    The similarity is shown as a whole-number percentage.  Expected-overlap
    messages name both sources; boilerplate messages name neither; every
    other classification names each item's source, falling back to the
    item's type value when the source is empty.
    """
    percent = f"{similarity * 100:.0f}%"

    if classification == RedundancyClassification.EXPECTED_OVERLAP:
        lead = f"{percent} similarity between '{item_a.source}' and '{item_b.source}' "
        return lead + "— expected overlap (adjacent chunks)"

    if classification == RedundancyClassification.BOILERPLATE:
        return f"{percent} similarity — both items contain boilerplate instructions"

    label_a = item_a.source or item_a.type.value
    label_b = item_b.source or item_b.type.value
    lead = f"{percent} similarity between '{label_a}' and '{label_b}' "
    return lead + "— redundant context from independent sources"
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Structure Analyzer.
|
|
3
|
+
|
|
4
|
+
Measures the distribution of context types and detects imbalance.
|
|
5
|
+
Uses simple threshold-based rules — no complex entropy for V0.1.
|
|
6
|
+
|
|
7
|
+
Detects:
|
|
8
|
+
- Retrieval dominance (RAG flooding)
|
|
9
|
+
- System prompt bloat
|
|
10
|
+
- Memory explosion
|
|
11
|
+
- Tool output sprawl
|
|
12
|
+
- Missing context types
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from contextops.core.config import ContextOpsConfig
|
|
18
|
+
from contextops.core.models import (
|
|
19
|
+
ContextBundle,
|
|
20
|
+
ContextType,
|
|
21
|
+
FindingSeverity,
|
|
22
|
+
StructureFinding,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# ── Imbalance Issues ───────────────────────────────────────────────────────
|
|
26
|
+
# Per context type: the issue label, a detail message template with a {pct}
# placeholder, and the severity reported when that type exceeds its
# configured maximum ratio.
# NOTE(review): the "detail" templates are not read by the analyzer code
# visible in this section — confirm where (or whether) they are formatted.
ISSUES: dict[ContextType, dict[str, str | FindingSeverity]] = {
    ContextType.RETRIEVAL: {
        "issue": "Retrieval dominance",
        "detail": "RAG chunks consume {pct}% of context — likely noisy retrieval",
        "severity": FindingSeverity.HIGH,
    },
    ContextType.SYSTEM: {
        "issue": "System prompt bloat",
        "detail": "System prompt uses {pct}% of context — consider trimming instructions",
        "severity": FindingSeverity.MEDIUM,
    },
    ContextType.MEMORY: {
        "issue": "Memory explosion",
        "detail": "Memory entries consume {pct}% of context — prune old memories",
        "severity": FindingSeverity.HIGH,
    },
    ContextType.TOOL: {
        "issue": "Tool output sprawl",
        "detail": "Tool outputs use {pct}% of context — summarize tool responses",
        "severity": FindingSeverity.MEDIUM,
    },
}

# Minimum expected types for a "healthy" context
# Bundles with more than one item but fewer distinct types than this get a
# low-severity "Low context diversity" finding.
RECOMMENDED_MIN_TYPES: int = 2
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def analyze_structure(bundle: ContextBundle, config: ContextOpsConfig | None = None) -> list[StructureFinding]:
    """
    Inspect how a bundle's tokens are distributed across context types.

    Two checks are performed:
      1. Each type listed in ``ISSUES`` is compared against its maximum ratio
         from the config; a type whose token share exceeds the limit
         produces a finding with that type's issue label and severity.
      2. If the bundle holds more than one item but fewer than
         ``RECOMMENDED_MIN_TYPES`` distinct types, a low-severity
         "Low context diversity" finding is added for the dominant type.

    Args:
        bundle: The context bundle to analyze.
        config: Thresholds to apply; the default config is used when omitted.

    Returns:
        Findings ordered from most to least severe.  Empty when the bundle
        has zero total tokens.
    """
    config = config or ContextOpsConfig.default()
    grand_total = bundle.total_tokens
    results: list[StructureFinding] = []

    if grand_total == 0:
        return results

    # Token totals per context type.
    per_type: dict[ContextType, int] = {}
    for entry in bundle.items:
        per_type[entry.type] = per_type.get(entry.type, 0) + entry.token_count

    # Configurable upper bounds on each type's share of the context.
    limits = {
        ContextType.RETRIEVAL: config.retrieval_max_ratio,
        ContextType.SYSTEM: config.system_max_ratio,
        ContextType.MEMORY: config.memory_max_ratio,
        ContextType.TOOL: config.tool_max_ratio,
    }

    for kind, info in ISSUES.items():
        used = per_type.get(kind, 0)
        if used == 0:
            continue

        share = used / grand_total
        limit = limits[kind]
        if share > limit:
            over_limit = StructureFinding(
                issue=str(info["issue"]),
                context_type=kind,
                actual_ratio=share,
                threshold=limit,
                severity=info["severity"],  # type: ignore[arg-type]
            )
            results.append(over_limit)

    # Flag bundles that rely on too few context types.
    if len(per_type) < RECOMMENDED_MIN_TYPES and bundle.item_count > 1:
        dominant = max(per_type, key=per_type.get)
        results.append(StructureFinding(
            issue="Low context diversity",
            context_type=dominant,
            actual_ratio=per_type[dominant] / grand_total,
            threshold=0.0,  # not a ratio threshold
            severity=FindingSeverity.LOW,
        ))

    # Most severe first; unknown severities sort last.
    rank = {
        FindingSeverity.CRITICAL: 0,
        FindingSeverity.HIGH: 1,
        FindingSeverity.MEDIUM: 2,
        FindingSeverity.LOW: 3,
    }
    results.sort(key=lambda finding: rank.get(finding.severity, 99))

    return results
|