autochunks 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import List, Dict, Any, Callable, Optional, Tuple
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
import numpy as np
|
|
6
|
+
from ..chunkers.base import Chunk
|
|
7
|
+
from ..utils.text import count_tokens, split_sentences
|
|
8
|
+
|
|
9
|
+
@dataclass
class OverlapOptimizationResult:
    """Result of overlap optimization."""
    # Chunks exactly as received, before any overlap adjustment.
    original_chunks: List[Chunk]
    # Chunks after overlap metadata/content has been applied.
    optimized_chunks: List[Chunk]
    # Aggregate statistics: 'pairs_analyzed', and (when pairs exist)
    # 'current_overlaps', 'optimal_overlaps', 'avg_current', 'avg_optimal'.
    overlap_stats: Dict[str, Any]
    # Human-readable per-pair recommendations (may be empty).
    improvements: List[str]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class OverlapOptimizer:
    """
    Intelligent overlap optimization between adjacent chunks.

    Dynamically adjusts overlap between chunks based on semantic analysis
    to ensure optimal context continuity without redundancy.

    Optimization strategies:
        1. Semantic bridging: more overlap at topic boundaries.
        2. Entity preservation: ensure named entities aren't split.
        3. Sentence integrity: overlap at sentence boundaries.
        4. Adaptive sizing: variable overlap based on chunk content.

    Supported ``method`` values:
        - ``fixed``: traditional fixed-token overlap.
        - ``semantic``: embedding-based adaptive overlap.
        - ``entity``: capitalized-phrase heuristic for entity preservation.
        - ``sentence``: always overlap complete sentences.
        - ``hybrid``: combination of all methods.
    """

    def __init__(self,
                 embedding_fn: Optional[Callable[[List[str]], List[List[float]]]] = None,
                 base_overlap: int = 50,
                 min_overlap: int = 20,
                 max_overlap: int = 200,
                 method: str = "hybrid"):
        """
        Initialize the overlap optimizer.

        Args:
            embedding_fn: Function for semantic analysis. When None, the
                semantic factor degrades gracefully to a neutral 1.0.
            base_overlap: Default overlap in tokens.
            min_overlap: Minimum allowed overlap.
            max_overlap: Maximum allowed overlap.
            method: "fixed", "semantic", "entity", "sentence" or "hybrid".
        """
        self.embedding_fn = embedding_fn
        self.base_overlap = base_overlap
        self.min_overlap = min_overlap
        self.max_overlap = max_overlap
        self.method = method

    def optimize_overlaps(self,
                          chunks: List[Chunk],
                          original_text: str = None) -> OverlapOptimizationResult:
        """
        Optimize overlaps between chunks with batched semantic analysis.

        Args:
            chunks: Ordered chunks to analyze pairwise.
            original_text: Forwarded to :meth:`_apply_overlaps`; currently
                unused there (reserved for future re-chunking from source).

        Returns:
            An :class:`OverlapOptimizationResult` with metadata-annotated
            chunks, per-pair statistics and improvement recommendations.
        """
        # Fewer than two chunks means there is no pair to optimize.
        if len(chunks) < 2:
            return OverlapOptimizationResult(
                original_chunks=chunks,
                optimized_chunks=chunks,
                overlap_stats={'pairs_analyzed': 0},
                improvements=[]
            )

        # Batch-embed all boundary sentence pairs in ONE call instead of
        # one embedding round-trip per adjacent pair.
        boundary_embeddings = None
        if self.embedding_fn and self.method in ["semantic", "hybrid"]:
            boundary_sentences = []
            for i in range(len(chunks) - 1):
                s1 = split_sentences(chunks[i].text)
                s2 = split_sentences(chunks[i + 1].text)
                if s1 and s2:
                    boundary_sentences.extend([s1[-1], s2[0]])
                else:
                    # Placeholders keep positions aligned so pair i maps to
                    # embeddings [i*2, i*2 + 1] below.
                    boundary_sentences.extend(["", ""])

            boundary_embeddings = self.embedding_fn(boundary_sentences)

        # Measure the overlap that already exists between adjacent chunks.
        current_overlaps = self._analyze_current_overlaps(chunks)

        # Calculate the optimal overlap for each adjacent pair.
        optimal_overlaps = []
        for i in range(len(chunks) - 1):
            pair_embeddings = None
            if boundary_embeddings:
                pair_embeddings = [boundary_embeddings[i * 2], boundary_embeddings[i * 2 + 1]]

            optimal = self._calculate_optimal_overlap(
                chunks[i], chunks[i + 1], i, len(chunks), pair_embeddings
            )
            optimal_overlaps.append(optimal)

        # Annotate chunks with the chosen overlap values.
        optimized_chunks = self._apply_overlaps(chunks, optimal_overlaps, original_text)

        # Human-readable recommendations for significant deltas.
        improvements = self._generate_improvements(current_overlaps, optimal_overlaps)

        return OverlapOptimizationResult(
            original_chunks=chunks,
            optimized_chunks=optimized_chunks,
            overlap_stats={
                'pairs_analyzed': len(chunks) - 1,
                'current_overlaps': current_overlaps,
                'optimal_overlaps': optimal_overlaps,
                'avg_current': float(np.mean(current_overlaps)) if current_overlaps else 0.0,
                'avg_optimal': float(np.mean(optimal_overlaps)) if optimal_overlaps else 0.0
            },
            improvements=improvements
        )

    def add_overlap_to_chunks(self,
                              chunks: List[Chunk],
                              overlap_tokens: Optional[int] = None) -> List[Chunk]:
        """
        Add overlap context to chunks that may not have it.

        This method adds text from adjacent chunks to each chunk,
        useful when chunks were created without overlap.

        Args:
            chunks: Original chunks without overlap.
            overlap_tokens: Number of tokens to overlap. Uses base_overlap if None.

        Returns:
            New list of chunks with overlap added; each new chunk carries an
            ``_enhanced`` id suffix and overlap bookkeeping in its ``meta``.
        """
        if len(chunks) < 2:
            return chunks

        overlap = overlap_tokens or self.base_overlap
        enhanced_chunks = []

        for i, chunk in enumerate(chunks):
            prefix = ""
            suffix = ""

            # Borrow leading sentences from the NEXT chunk (up to `overlap`
            # tokens) as a suffix.
            if i < len(chunks) - 1:
                next_text = chunks[i + 1].text
                suffix_sentences = split_sentences(next_text)
                suffix_tokens = 0
                suffix_parts = []

                for sent in suffix_sentences:
                    sent_tokens = count_tokens(sent)
                    if suffix_tokens + sent_tokens <= overlap:
                        suffix_parts.append(sent)
                        suffix_tokens += sent_tokens
                    else:
                        break

                if suffix_parts:
                    suffix = " " + " ".join(suffix_parts)

            # Borrow trailing sentences from the PREVIOUS chunk as a prefix,
            # walking backwards so the sentences closest to the boundary win.
            if i > 0:
                prev_text = chunks[i - 1].text
                prefix_sentences = split_sentences(prev_text)
                prefix_tokens = 0
                prefix_parts = []

                for sent in reversed(prefix_sentences):
                    sent_tokens = count_tokens(sent)
                    if prefix_tokens + sent_tokens <= overlap:
                        prefix_parts.insert(0, sent)
                        prefix_tokens += sent_tokens
                    else:
                        break

                if prefix_parts:
                    prefix = " ".join(prefix_parts) + " "

            # Create enhanced chunk preserving the original metadata.
            enhanced_text = prefix + chunk.text + suffix

            enhanced_chunks.append(Chunk(
                id=f"{chunk.id}_enhanced",
                doc_id=chunk.doc_id,
                text=enhanced_text,
                meta={
                    **chunk.meta,
                    "has_overlap": True,
                    "prefix_tokens": count_tokens(prefix),
                    "suffix_tokens": count_tokens(suffix),
                    "original_id": chunk.id
                }
            ))

        return enhanced_chunks

    def _analyze_current_overlaps(self, chunks: List[Chunk]) -> List[int]:
        """Analyze existing overlaps (in words) between adjacent chunks."""
        overlaps = []

        for i in range(len(chunks) - 1):
            text1 = chunks[i].text.lower()
            text2 = chunks[i + 1].text.lower()

            # Longest suffix of chunk i that is also a prefix of chunk i+1.
            overlap_tokens = self._find_text_overlap(text1, text2)
            overlaps.append(overlap_tokens)

        return overlaps

    def _find_text_overlap(self, text1: str, text2: str) -> int:
        """Return the longest whitespace-word overlap between the end of
        ``text1`` and the start of ``text2`` (capped at 50 words)."""
        words1 = text1.split()
        words2 = text2.split()

        max_overlap = min(len(words1), len(words2), 50)  # Limit search

        # Search from longest candidate down so the first match is maximal.
        for overlap_len in range(max_overlap, 0, -1):
            suffix = words1[-overlap_len:]
            prefix = words2[:overlap_len]
            if suffix == prefix:
                return overlap_len

        return 0

    def _calculate_optimal_overlap(self,
                                   chunk1: Chunk,
                                   chunk2: Chunk,
                                   pair_index: int,
                                   total_chunks: int,
                                   pair_embeddings: Optional[List[List[float]]] = None) -> int:
        """Calculate optimal overlap (tokens) for one adjacent pair.

        ``pair_index``/``total_chunks`` are accepted for interface stability
        but are not currently used in the calculation. ``pair_embeddings``,
        when given, are the pre-batched boundary-sentence embeddings.
        """
        if self.method == "fixed":
            return self.base_overlap

        optimal = self.base_overlap
        factors = []

        if self.method in ["semantic", "hybrid"]:
            semantic_factor = self._semantic_overlap_factor(chunk1, chunk2, pair_embeddings)
            factors.append(semantic_factor)

        if self.method in ["entity", "hybrid"]:
            entity_factor = self._entity_overlap_factor(chunk1, chunk2)
            factors.append(entity_factor)

        if self.method in ["sentence", "hybrid"]:
            sentence_factor = self._sentence_overlap_factor(chunk1, chunk2)
            factors.append(sentence_factor)

        # Combine factors as a plain average and scale the base overlap.
        if factors:
            avg_factor = float(np.mean(factors))
            optimal = int(self.base_overlap * avg_factor)

        # Clamp to configured bounds.
        return max(self.min_overlap, min(self.max_overlap, optimal))

    def _semantic_overlap_factor(self, chunk1: Chunk, chunk2: Chunk, pair_embeddings: Optional[List[List[float]]] = None) -> float:
        """
        Calculate an overlap factor from the cosine similarity of the
        boundary sentences, using pre-calculated or on-the-fly embeddings.

        Low similarity implies a topic shift and thus more overlap; high
        similarity implies redundancy and thus less. Returns a neutral 1.0
        when no embeddings are available or the computation fails.
        """
        try:
            vec1 = None
            vec2 = None

            if pair_embeddings and len(pair_embeddings) == 2:
                vec1, vec2 = np.array(pair_embeddings[0]), np.array(pair_embeddings[1])
            elif self.embedding_fn:
                # Fallback to on-the-fly calculation if not batched.
                sentences1 = split_sentences(chunk1.text)
                sentences2 = split_sentences(chunk2.text)
                if not sentences1 or not sentences2:
                    return 1.0
                boundary_texts = [sentences1[-1], sentences2[0]]
                embeddings = self.embedding_fn(boundary_texts)
                vec1, vec2 = np.array(embeddings[0]), np.array(embeddings[1])

            if vec1 is None or vec2 is None:
                return 1.0

            norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
            if norm1 == 0 or norm2 == 0:
                return 1.0

            similarity = float(np.dot(vec1, vec2) / (norm1 * norm2))

            # Similarity > 0.85: same topic, shrink overlap.
            # Similarity < 0.5: big topic shift, grow overlap.
            # Otherwise: linear scaling around the 0.7 midpoint.
            if similarity > 0.85:
                return 0.6  # Reduce overlap
            elif similarity < 0.5:
                return 1.8  # Increase overlap
            else:
                return float(1.0 + (0.7 - similarity))  # Linear scaling

        except Exception:
            # Fix: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit. Degrade to neutral on failure.
            return 1.0

    def _entity_overlap_factor(self, chunk1: Chunk, chunk2: Chunk) -> float:
        """
        Calculate an overlap factor based on entity preservation.

        Uses a capitalized-phrase regex as a lightweight entity heuristic
        (not real NER). If chunk2 opens with capitalized phrases that also
        appear near the end of chunk1, increase overlap.
        """
        import re

        # Capitalized word, optionally followed by more capitalized words.
        entity_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'

        entities1 = set(re.findall(entity_pattern, chunk1.text[-500:]))  # Last 500 chars

        # Check whether chunk2 references those entities early on.
        # Fix: the local was misleadingly named `first_100_chars` for a
        # 200-character slice.
        leading_text = chunk2.text[:200]
        entities_in_start = set(re.findall(entity_pattern, leading_text))

        shared_entities = entities1 & entities_in_start

        if len(shared_entities) >= 3:
            return 1.5  # More overlap to include entity context
        elif len(shared_entities) >= 1:
            return 1.2
        else:
            return 1.0

    def _sentence_overlap_factor(self, chunk1: Chunk, chunk2: Chunk) -> float:
        """
        Calculate an overlap factor that protects sentence integrity at the
        boundary: 1.5 when either side appears to split a sentence, else 1.0.
        """
        text1 = chunk1.text.strip()

        if not text1:
            return 1.0

        # chunk1 not ending in a sentence terminator suggests a mid-sentence cut.
        if text1[-1] not in '.!?':
            return 1.5

        # chunk2 starting with a lowercase letter suggests a mid-sentence start.
        text2 = chunk2.text.strip()
        if text2 and text2[0].islower():
            return 1.5

        return 1.0

    def _apply_overlaps(self,
                        chunks: List[Chunk],
                        optimal_overlaps: List[int],
                        original_text: str = None) -> List[Chunk]:
        """Record the calculated overlaps in chunk metadata.

        For now this only annotates; a full implementation would re-chunk
        from ``original_text`` (currently unused).
        """
        optimized = []
        for i, chunk in enumerate(chunks):
            meta = {**chunk.meta}

            if i > 0:
                meta['overlap_from_prev'] = optimal_overlaps[i - 1]
            if i < len(optimal_overlaps):
                meta['overlap_to_next'] = optimal_overlaps[i]

            meta['overlap_optimized'] = True

            optimized.append(Chunk(
                id=chunk.id,
                doc_id=chunk.doc_id,
                text=chunk.text,
                meta=meta
            ))

        return optimized

    def _generate_improvements(self,
                               current: List[int],
                               optimal: List[int]) -> List[str]:
        """Generate human-readable recommendations for pairs whose current
        and optimal overlap differ by more than 20 tokens."""
        improvements = []

        if not current or not optimal:
            return improvements

        for i, (curr, opt) in enumerate(zip(current, optimal)):
            diff = opt - curr
            if abs(diff) > 20:  # Significant difference
                if diff > 0:
                    improvements.append(
                        f"Pair {i}-{i+1}: Increase overlap by {diff} tokens for better context"
                    )
                else:
                    improvements.append(
                        f"Pair {i}-{i+1}: Reduce overlap by {-diff} tokens to remove redundancy"
                    )

        return improvements
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AutoChunks Post-Processing Pipeline
|
|
3
|
+
|
|
4
|
+
Applies quality optimizations to chunks ONLY for native AutoChunks chunkers.
|
|
5
|
+
Bridge chunkers (LangChain, etc.) get raw output for fair comparison.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
from typing import List, Dict, Any, Callable, Optional
|
|
10
|
+
from ..chunkers.base import Chunk
|
|
11
|
+
from ..utils.text import count_tokens, split_sentences
|
|
12
|
+
from ..utils.logger import logger
|
|
13
|
+
import time
|
|
14
|
+
from .scorer import ChunkQualityScorer
|
|
15
|
+
from .deduplicator import ChunkDeduplicator
|
|
16
|
+
from .overlap_optimizer import OverlapOptimizer
|
|
17
|
+
|
|
18
|
+
# Native AutoChunks chunkers that get post-processing
# (the full dedup/overlap/quality pipeline in ChunkPostProcessor.process).
NATIVE_CHUNKERS = {
    "fixed_length",
    "recursive_character",
    "sentence_aware",
    "semantic_local",
    "hybrid_semantic_stat",
    "parent_child",
    "layout_aware",
    "agentic",
    "proposition",
    "contextual_retrieval",
    "python_ast",
    "html_section"
}

# Bridge chunkers that get raw output (fair comparison).
# NOTE(review): the pipeline only tests membership in NATIVE_CHUNKERS, so any
# name not listed there is treated as a bridge; this set is documentation of
# the known bridges rather than an enforced allowlist.
BRIDGE_CHUNKERS = {
    "langchain_recursive",
    "langchain_character",
    "langchain_markdown",
    "langchain_token",
    "langchain_python",
    "langchain_html",
    "langchain_json"
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ChunkPostProcessor:
    """
    Post-processing pipeline for chunk optimization.

    Applied ONLY to native AutoChunks chunkers to ensure:
    1. Fair comparison with bridge chunkers (they get raw output)
    2. AutoChunks gets full pipeline benefits

    Pipeline Steps:
    1. Deduplication (optional) - Remove near-duplicate chunks
    2. Overlap Optimization (optional) - Add/adjust overlap for context
    3. Quality Scoring (always) - Add quality metrics to metadata
    """

    def __init__(self,
                 enable_dedup: bool = True,
                 enable_overlap_opt: bool = True,
                 embedding_fn: Optional[Callable[[List[str]], List[List[float]]]] = None,
                 dedup_threshold: float = 0.90,
                 overlap_tokens: int = 50,
                 target_chunk_size: int = 512):
        """
        Initialize the post-processor.

        Args:
            enable_dedup: Whether to apply deduplication.
            enable_overlap_opt: Whether to optimize overlaps.
            embedding_fn: Embedding function for semantic operations.
            dedup_threshold: Similarity threshold for deduplication (0.85-0.95).
            overlap_tokens: Target overlap in tokens.
            target_chunk_size: Target chunk size for quality scoring.
        """
        self.enable_dedup = enable_dedup
        self.enable_overlap_opt = enable_overlap_opt
        self.embedding_fn = embedding_fn
        self.dedup_threshold = dedup_threshold
        self.overlap_tokens = overlap_tokens
        self.target_chunk_size = target_chunk_size

        # Scorer runs on every path (native AND bridge chunkers).
        self.scorer = ChunkQualityScorer(
            embedding_fn=embedding_fn,
            target_token_size=target_chunk_size
        )

        self.deduper = ChunkDeduplicator(
            embedding_fn=embedding_fn,
            similarity_threshold=dedup_threshold,
            method="hybrid",
            strategy="keep_longest"  # Keep the most complete version
        )

        # NOTE(review): max_overlap is capped at 100 here, tighter than the
        # OverlapOptimizer default of 200 — presumably intentional for the
        # pipeline; confirm before changing.
        self.overlap_optimizer = OverlapOptimizer(
            embedding_fn=embedding_fn,
            base_overlap=overlap_tokens,
            min_overlap=20,
            max_overlap=100,
            method="hybrid"
        )

    def process(self,
                chunks: List[Chunk],
                chunker_name: str,
                return_metrics: bool = True) -> tuple[List[Chunk], Dict[str, Any]]:
        """
        Apply post-processing pipeline to chunks.

        Args:
            chunks: Input chunks from a chunker.
            chunker_name: Name of the chunker that produced these chunks.
            return_metrics: Whether to compute quality metrics on the bridge
                path (native chunkers are always scored).

        Returns:
            Tuple of (processed_chunks, quality_metrics).
        """
        metrics = {
            "post_processing_applied": False,
            "is_native_chunker": chunker_name in NATIVE_CHUNKERS,
            "original_count": len(chunks)
        }

        # Only apply optimizations to native chunkers.
        if chunker_name not in NATIVE_CHUNKERS:
            # For bridges, just score quality but don't modify.
            if return_metrics and chunks:
                quality_reports = self.scorer.score_chunks(chunks)
                metrics["avg_quality_score"] = sum(r.overall_score for r in quality_reports) / len(quality_reports)
                metrics["quality_dimensions"] = self.scorer.get_summary_stats(quality_reports).get("dimension_means", {})
            # Fix: the bridge path previously omitted "final_count", which the
            # native path always sets; report it here too so the metrics dict
            # has a consistent shape for all chunkers.
            metrics["final_count"] = len(chunks)
            return chunks, metrics

        metrics["post_processing_applied"] = True
        processed_chunks = list(chunks)

        # Step 1: Deduplication
        if self.enable_dedup and len(processed_chunks) > 1:
            logger.info(f"[{chunker_name}] Post-processor: Starting deduplication (count={len(processed_chunks)})...")
            dp_start = time.time()
            dedup_result = self.deduper.deduplicate(processed_chunks)
            processed_chunks = dedup_result.kept_chunks
            metrics["dedup_removed"] = dedup_result.removed_count
            metrics["dedup_groups"] = len(dedup_result.duplicate_groups)
            logger.info(f"[{chunker_name}] Post-processor: Deduplication finished in {time.time()-dp_start:.2f}s (removed {dedup_result.removed_count})")

        # Step 2: Overlap Optimization
        if self.enable_overlap_opt and len(processed_chunks) > 1:
            logger.info(f"[{chunker_name}] Post-processor: Starting overlap optimization...")
            ov_start = time.time()
            # Add overlap context from adjacent chunks.
            enhanced = self.overlap_optimizer.add_overlap_to_chunks(
                processed_chunks,
                overlap_tokens=self.overlap_tokens
            )
            processed_chunks = enhanced
            metrics["overlap_enhanced"] = True
            logger.info(f"[{chunker_name}] Post-processor: Overlap optimization finished in {time.time()-ov_start:.2f}s")

        # Step 3: Quality Scoring (always applied, adds to metadata)
        if processed_chunks:
            logger.info(f"[{chunker_name}] Post-processor: Starting quality scoring (count={len(processed_chunks)})...")
            qs_start = time.time()
            quality_reports = self.scorer.score_chunks(processed_chunks)

            # Add quality scores to chunk metadata (cast to float for serialization).
            for chunk, report in zip(processed_chunks, quality_reports):
                chunk.meta["quality_score"] = float(report.overall_score)
                chunk.meta["quality_coherence"] = float(report.coherence_score)
                chunk.meta["quality_completeness"] = float(report.completeness_score)
                chunk.meta["quality_density"] = float(report.density_score)
                chunk.meta["quality_boundary"] = float(report.boundary_score)
                chunk.meta["quality_size"] = float(report.size_score)
                if report.issues:
                    chunk.meta["quality_issues"] = report.issues

            # Aggregate metrics
            metrics["avg_quality_score"] = sum(r.overall_score for r in quality_reports) / len(quality_reports)
            metrics["quality_dimensions"] = self.scorer.get_summary_stats(quality_reports).get("dimension_means", {})
            metrics["chunks_with_issues"] = sum(1 for r in quality_reports if r.issues)
            logger.info(f"[{chunker_name}] Post-processor: Quality scoring finished in {time.time()-qs_start:.2f}s")

        metrics["final_count"] = len(processed_chunks)

        return processed_chunks, metrics
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def apply_post_processing(chunks: List[Dict],
                          chunker_name: str,
                          embedding_fn: Callable = None,
                          enable_dedup: bool = True,
                          enable_overlap: bool = True,
                          dedup_threshold: float = 0.90,
                          overlap_tokens: int = 50) -> tuple[List[Dict], Dict[str, Any]]:
    """
    Convenience function to apply post-processing to dict-format chunks.

    This is the main entry point for the autochunker.py integration: it
    lifts plain dicts into ``Chunk`` objects, runs ``ChunkPostProcessor``,
    and lowers the result back to dicts.

    Args:
        chunks: List of chunk dictionaries with id, doc_id, text, meta.
        chunker_name: Name of the chunker.
        embedding_fn: Optional embedding function.
        enable_dedup: Whether to deduplicate.
        enable_overlap: Whether to optimize overlap.
        dedup_threshold: Similarity threshold for deduplication.
        overlap_tokens: Overlap size in tokens.

    Returns:
        Tuple of (processed_chunk_dicts, quality_metrics).
    """
    # Lift dicts into Chunk objects ("meta" is optional in the input).
    as_objects = []
    for record in chunks:
        as_objects.append(Chunk(
            id=record["id"],
            doc_id=record["doc_id"],
            text=record["text"],
            meta=record.get("meta", {})
        ))

    # Build the pipeline with the caller's switches and run it.
    pipeline = ChunkPostProcessor(
        enable_dedup=enable_dedup,
        enable_overlap_opt=enable_overlap,
        embedding_fn=embedding_fn,
        dedup_threshold=dedup_threshold,
        overlap_tokens=overlap_tokens
    )
    processed, stats = pipeline.process(as_objects, chunker_name)

    # Lower the Chunk objects back to plain dicts for the caller.
    as_dicts = [
        {
            "id": obj.id,
            "doc_id": obj.doc_id,
            "text": obj.text,
            "meta": obj.meta
        }
        for obj in processed
    ]
    return as_dicts, stats
|