autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/quality/overlap_optimizer.py
@@ -0,0 +1,402 @@
+
+from __future__ import annotations
+from typing import List, Dict, Any, Callable, Optional, Tuple
+from dataclasses import dataclass, field
+import numpy as np
+from ..chunkers.base import Chunk
+from ..utils.text import count_tokens, split_sentences
+
+@dataclass
+class OverlapOptimizationResult:
+    """Result of overlap optimization."""
+    original_chunks: List[Chunk]
+    optimized_chunks: List[Chunk]
+    overlap_stats: Dict[str, Any]
+    improvements: List[str]
+
+
+class OverlapOptimizer:
+    """
+    World-Class Intelligent Overlap Optimization System.
+
+    Dynamically adjusts overlap between chunks based on semantic analysis
+    to ensure optimal context continuity without redundancy.
+
+    OPTIMIZATION STRATEGIES:
+    1. Semantic Bridging: More overlap at topic boundaries
+    2. Entity Preservation: Ensure named entities aren't split
+    3. Sentence Integrity: Overlap at sentence boundaries
+    4. Adaptive Sizing: Variable overlap based on chunk content
+    5. Context Windows: Sliding windows with smart step sizes
+
+    METHODS:
+    - fixed: Traditional fixed-token overlap
+    - semantic: Embedding-based adaptive overlap
+    - entity: NER-based overlap for entity preservation
+    - sentence: Always overlap complete sentences
+    - hybrid: Combination of all methods
+    """
+
+    def __init__(self,
+                 embedding_fn: Callable[[List[str]], List[List[float]]] = None,
+                 base_overlap: int = 50,
+                 min_overlap: int = 20,
+                 max_overlap: int = 200,
+                 method: str = "hybrid"):
+        """
+        Initialize the overlap optimizer.
+
+        Args:
+            embedding_fn: Function for semantic analysis.
+            base_overlap: Default overlap in tokens.
+            min_overlap: Minimum allowed overlap.
+            max_overlap: Maximum allowed overlap.
+            method: "fixed", "semantic", "entity", "sentence", "hybrid".
+        """
+        self.embedding_fn = embedding_fn
+        self.base_overlap = base_overlap
+        self.min_overlap = min_overlap
+        self.max_overlap = max_overlap
+        self.method = method
+
+    def optimize_overlaps(self,
+                          chunks: List[Chunk],
+                          original_text: str = None) -> OverlapOptimizationResult:
+        """
+        Optimize overlaps between chunks with batched semantic analysis.
+        """
+        if len(chunks) < 2:
+            return OverlapOptimizationResult(
+                original_chunks=chunks,
+                optimized_chunks=chunks,
+                overlap_stats={'pairs_analyzed': 0},
+                improvements=[]
+            )
+
+        # Optimization: Batch embed all boundary sentence pairs at once
+        boundary_embeddings = None
+        if self.embedding_fn and self.method in ["semantic", "hybrid"]:
+            boundary_sentences = []
+            for i in range(len(chunks) - 1):
+                s1 = split_sentences(chunks[i].text)
+                s2 = split_sentences(chunks[i + 1].text)
+                if s1 and s2:
+                    boundary_sentences.extend([s1[-1], s2[0]])
+                else:
+                    boundary_sentences.extend(["", ""])  # Placeholders
+
+            boundary_embeddings = self.embedding_fn(boundary_sentences)
+
+        # Analyze current overlaps
+        current_overlaps = self._analyze_current_overlaps(chunks)
+
+        # Calculate optimal overlaps for each pair
+        optimal_overlaps = []
+        for i in range(len(chunks) - 1):
+            pair_embeddings = None
+            if boundary_embeddings:
+                pair_embeddings = [boundary_embeddings[i*2], boundary_embeddings[i*2 + 1]]
+
+            optimal = self._calculate_optimal_overlap(
+                chunks[i], chunks[i + 1], i, len(chunks), pair_embeddings
+            )
+            optimal_overlaps.append(optimal)
+
+        # Generate optimized chunks
+        optimized_chunks = self._apply_overlaps(chunks, optimal_overlaps, original_text)
+
+        # Generate improvements list
+        improvements = self._generate_improvements(current_overlaps, optimal_overlaps)
+
+        return OverlapOptimizationResult(
+            original_chunks=chunks,
+            optimized_chunks=optimized_chunks,
+            overlap_stats={
+                'pairs_analyzed': len(chunks) - 1,
+                'current_overlaps': current_overlaps,
+                'optimal_overlaps': optimal_overlaps,
+                'avg_current': float(np.mean(current_overlaps)) if current_overlaps else 0.0,
+                'avg_optimal': float(np.mean(optimal_overlaps)) if optimal_overlaps else 0.0
+            },
+            improvements=improvements
+        )
+
+    def add_overlap_to_chunks(self,
+                              chunks: List[Chunk],
+                              overlap_tokens: int = None) -> List[Chunk]:
+        """
+        Add overlap context to chunks that may not have it.
+
+        This method adds text from adjacent chunks to each chunk,
+        useful when chunks were created without overlap.
+
+        Args:
+            chunks: Original chunks without overlap.
+            overlap_tokens: Number of tokens to overlap. Uses base_overlap if None.
+
+        Returns:
+            New list of chunks with overlap added.
+        """
+        if len(chunks) < 2:
+            return chunks
+
+        overlap = overlap_tokens or self.base_overlap
+        enhanced_chunks = []
+
+        for i, chunk in enumerate(chunks):
+            prefix = ""
+            suffix = ""
+
+            # Add suffix from next chunk
+            if i < len(chunks) - 1:
+                next_text = chunks[i + 1].text
+                suffix_sentences = split_sentences(next_text)
+                suffix_tokens = 0
+                suffix_parts = []
+
+                for sent in suffix_sentences:
+                    sent_tokens = count_tokens(sent)
+                    if suffix_tokens + sent_tokens <= overlap:
+                        suffix_parts.append(sent)
+                        suffix_tokens += sent_tokens
+                    else:
+                        break
+
+                if suffix_parts:
+                    suffix = " " + " ".join(suffix_parts)
+
+            # Add prefix from previous chunk
+            if i > 0:
+                prev_text = chunks[i - 1].text
+                prefix_sentences = split_sentences(prev_text)
+                prefix_tokens = 0
+                prefix_parts = []
+
+                for sent in reversed(prefix_sentences):
+                    sent_tokens = count_tokens(sent)
+                    if prefix_tokens + sent_tokens <= overlap:
+                        prefix_parts.insert(0, sent)
+                        prefix_tokens += sent_tokens
+                    else:
+                        break
+
+                if prefix_parts:
+                    prefix = " ".join(prefix_parts) + " "
+
+            # Create enhanced chunk
+            enhanced_text = prefix + chunk.text + suffix
+
+            enhanced_chunks.append(Chunk(
+                id=f"{chunk.id}_enhanced",
+                doc_id=chunk.doc_id,
+                text=enhanced_text,
+                meta={
+                    **chunk.meta,
+                    "has_overlap": True,
+                    "prefix_tokens": count_tokens(prefix),
+                    "suffix_tokens": count_tokens(suffix),
+                    "original_id": chunk.id
+                }
+            ))
+
+        return enhanced_chunks
+
+    def _analyze_current_overlaps(self, chunks: List[Chunk]) -> List[int]:
+        """Analyze existing overlaps between adjacent chunks."""
+        overlaps = []
+
+        for i in range(len(chunks) - 1):
+            text1 = chunks[i].text.lower()
+            text2 = chunks[i + 1].text.lower()
+
+            # Find longest common suffix/prefix
+            overlap_tokens = self._find_text_overlap(text1, text2)
+            overlaps.append(overlap_tokens)
+
+        return overlaps
+
+    def _find_text_overlap(self, text1: str, text2: str) -> int:
+        """Find token overlap between end of text1 and start of text2."""
+        words1 = text1.split()
+        words2 = text2.split()
+
+        max_overlap = min(len(words1), len(words2), 50)  # Limit search
+
+        for overlap_len in range(max_overlap, 0, -1):
+            suffix = words1[-overlap_len:]
+            prefix = words2[:overlap_len]
+            if suffix == prefix:
+                return overlap_len
+
+        return 0
+
+    def _calculate_optimal_overlap(self,
+                                   chunk1: Chunk,
+                                   chunk2: Chunk,
+                                   pair_index: int,
+                                   total_chunks: int,
+                                   pair_embeddings: Optional[List[List[float]]] = None) -> int:
+        """Calculate optimal overlap for a chunk pair with optional pre-calculated embeddings."""
+        if self.method == "fixed":
+            return self.base_overlap
+
+        optimal = self.base_overlap
+        factors = []
+
+        if self.method in ["semantic", "hybrid"]:
+            semantic_factor = self._semantic_overlap_factor(chunk1, chunk2, pair_embeddings)
+            factors.append(semantic_factor)
+
+        if self.method in ["entity", "hybrid"]:
+            entity_factor = self._entity_overlap_factor(chunk1, chunk2)
+            factors.append(entity_factor)
+
+        if self.method in ["sentence", "hybrid"]:
+            sentence_factor = self._sentence_overlap_factor(chunk1, chunk2)
+            factors.append(sentence_factor)
+
+        # Combine factors
+        if factors:
+            avg_factor = float(np.mean(factors))
+            optimal = int(self.base_overlap * avg_factor)
+
+        # Clamp to bounds
+        return max(self.min_overlap, min(self.max_overlap, optimal))
+
+    def _semantic_overlap_factor(self, chunk1: Chunk, chunk2: Chunk, pair_embeddings: Optional[List[List[float]]] = None) -> float:
+        """
+        Calculate overlap factor based on semantic similarity using pre-calculated or on-the-fly embeddings.
+        """
+        try:
+            vec1 = None
+            vec2 = None
+
+            if pair_embeddings and len(pair_embeddings) == 2:
+                vec1, vec2 = np.array(pair_embeddings[0]), np.array(pair_embeddings[1])
+            elif self.embedding_fn:
+                # Fallback to on-the-fly calculation if not batched
+                sentences1 = split_sentences(chunk1.text)
+                sentences2 = split_sentences(chunk2.text)
+                if not sentences1 or not sentences2: return 1.0
+                boundary_texts = [sentences1[-1], sentences2[0]]
+                embeddings = self.embedding_fn(boundary_texts)
+                vec1, vec2 = np.array(embeddings[0]), np.array(embeddings[1])
+
+            if vec1 is None or vec2 is None:
+                return 1.0
+
+            norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
+            if norm1 == 0 or norm2 == 0:
+                return 1.0
+
+            similarity = float(np.dot(vec1, vec2) / (norm1 * norm2))
+
+            # Low similarity = topic shift = more overlap needed
+            # Similarity above 0.85 = same topic = less overlap
+            # Similarity below 0.5 = big shift = more overlap
+            if similarity > 0.85:
+                return 0.6  # Reduce overlap
+            elif similarity < 0.5:
+                return 1.8  # Increase overlap
+            else:
+                return float(1.0 + (0.7 - similarity))  # Linear scaling
+
+        except Exception:
+            return 1.0
+
+    def _entity_overlap_factor(self, chunk1: Chunk, chunk2: Chunk) -> float:
+        """
+        Calculate overlap factor based on entity preservation.
+        If chunk2 starts with references to entities from chunk1, increase overlap.
+        """
+        import re
+
+        # Extract potential entities (capitalized words)
+        entity_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
+
+        entities1 = set(re.findall(entity_pattern, chunk1.text[-500:]))  # Last 500 chars
+
+        # Check if chunk2 references these entities early
+        chunk2_start = chunk2.text[:200]  # First 200 chars
+        entities_in_start = set(re.findall(entity_pattern, chunk2_start))
+
+        shared_entities = entities1 & entities_in_start
+
+        if len(shared_entities) >= 3:
+            return 1.5  # More overlap to include entity context
+        elif len(shared_entities) >= 1:
+            return 1.2
+        else:
+            return 1.0
+
+    def _sentence_overlap_factor(self, chunk1: Chunk, chunk2: Chunk) -> float:
+        """
+        Calculate overlap to ensure sentence integrity at boundaries.
+        """
+        # Check if chunk1 ends mid-sentence
+        text1 = chunk1.text.strip()
+
+        if not text1:
+            return 1.0
+
+        # If it doesn't end with a sentence terminator, increase overlap
+        if text1[-1] not in '.!?':
+            return 1.5
+
+        # Check if chunk2 starts mid-sentence
+        text2 = chunk2.text.strip()
+        if text2 and text2[0].islower():
+            return 1.5
+
+        return 1.0
+
+    def _apply_overlaps(self,
+                        chunks: List[Chunk],
+                        optimal_overlaps: List[int],
+                        original_text: str = None) -> List[Chunk]:
+        """Apply calculated overlaps to create new chunks."""
+        # For now, return chunks with overlap metadata
+        # Full implementation would re-chunk from original text
+
+        optimized = []
+        for i, chunk in enumerate(chunks):
+            meta = {**chunk.meta}
+
+            if i > 0:
+                meta['overlap_from_prev'] = optimal_overlaps[i - 1]
+            if i < len(optimal_overlaps):
+                meta['overlap_to_next'] = optimal_overlaps[i]
+
+            meta['overlap_optimized'] = True
+
+            optimized.append(Chunk(
+                id=chunk.id,
+                doc_id=chunk.doc_id,
+                text=chunk.text,
+                meta=meta
+            ))
+
+        return optimized
+
+    def _generate_improvements(self,
+                               current: List[int],
+                               optimal: List[int]) -> List[str]:
+        """Generate improvement recommendations."""
+        improvements = []
+
+        if not current or not optimal:
+            return improvements
+
+        for i, (curr, opt) in enumerate(zip(current, optimal)):
+            diff = opt - curr
+            if abs(diff) > 20:  # Significant difference
+                if diff > 0:
+                    improvements.append(
+                        f"Pair {i}-{i+1}: Increase overlap by {diff} tokens for better context"
+                    )
+                else:
+                    improvements.append(
+                        f"Pair {i}-{i+1}: Reduce overlap by {-diff} tokens to remove redundancy"
+                    )
+
+        return improvements
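
For reference, a minimal usage sketch of the OverlapOptimizer above (not part of the package diff). It assumes the Chunk constructor accepts the id/doc_id/text/meta keywords used throughout this file, and it omits an embedding function, so only the entity and sentence heuristics contribute to the overlap factor:

    from autochunk.chunkers.base import Chunk
    from autochunk.quality.overlap_optimizer import OverlapOptimizer

    # Two adjacent chunks produced without overlap (illustrative data)
    chunks = [
        Chunk(id="c0", doc_id="doc1", text="Alpha Corp announced a merger. The deal closes in June.", meta={}),
        Chunk(id="c1", doc_id="doc1", text="Alpha Corp shares rose after the announcement.", meta={}),
    ]

    optimizer = OverlapOptimizer(base_overlap=50, method="hybrid")

    # Annotate chunks with recommended overlap sizes and review the suggestions
    result = optimizer.optimize_overlaps(chunks)
    print(result.overlap_stats["avg_optimal"], result.improvements)

    # Or materialize sentence-level overlap text directly
    enhanced = optimizer.add_overlap_to_chunks(chunks, overlap_tokens=30)
    print(enhanced[1].meta["prefix_tokens"])
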
autochunk/quality/post_processor.py
@@ -0,0 +1,245 @@
+"""
+AutoChunks Post-Processing Pipeline
+
+Applies quality optimizations to chunks ONLY for native AutoChunks chunkers.
+Bridge chunkers (LangChain, etc.) get raw output for fair comparison.
+"""
+
+from __future__ import annotations
+from typing import List, Dict, Any, Callable, Optional
+from ..chunkers.base import Chunk
+from ..utils.text import count_tokens, split_sentences
+from ..utils.logger import logger
+import time
+from .scorer import ChunkQualityScorer
+from .deduplicator import ChunkDeduplicator
+from .overlap_optimizer import OverlapOptimizer
+
+# Native AutoChunks chunkers that get post-processing
+NATIVE_CHUNKERS = {
+    "fixed_length",
+    "recursive_character",
+    "sentence_aware",
+    "semantic_local",
+    "hybrid_semantic_stat",
+    "parent_child",
+    "layout_aware",
+    "agentic",
+    "proposition",
+    "contextual_retrieval",
+    "python_ast",
+    "html_section"
+}
+
+# Bridge chunkers that get raw output (fair comparison)
+BRIDGE_CHUNKERS = {
+    "langchain_recursive",
+    "langchain_character",
+    "langchain_markdown",
+    "langchain_token",
+    "langchain_python",
+    "langchain_html",
+    "langchain_json"
+}
+
+
+class ChunkPostProcessor:
+    """
+    Post-processing pipeline for chunk optimization.
+
+    Applied ONLY to native AutoChunks chunkers to ensure:
+    1. Fair comparison with bridge chunkers (they get raw output)
+    2. AutoChunks gets full pipeline benefits
+
+    Pipeline Steps:
+    1. Deduplication (optional) - Remove near-duplicate chunks
+    2. Overlap Optimization (optional) - Add/adjust overlap for context
+    3. Quality Scoring (always) - Add quality metrics to metadata
+    """
+
+    def __init__(self,
+                 enable_dedup: bool = True,
+                 enable_overlap_opt: bool = True,
+                 embedding_fn: Callable[[List[str]], List[List[float]]] = None,
+                 dedup_threshold: float = 0.90,
+                 overlap_tokens: int = 50,
+                 target_chunk_size: int = 512):
+        """
+        Initialize the post-processor.
+
+        Args:
+            enable_dedup: Whether to apply deduplication
+            enable_overlap_opt: Whether to optimize overlaps
+            embedding_fn: Embedding function for semantic operations
+            dedup_threshold: Similarity threshold for deduplication (0.85-0.95)
+            overlap_tokens: Target overlap in tokens
+            target_chunk_size: Target chunk size for quality scoring
+        """
+        self.enable_dedup = enable_dedup
+        self.enable_overlap_opt = enable_overlap_opt
+        self.embedding_fn = embedding_fn
+        self.dedup_threshold = dedup_threshold
+        self.overlap_tokens = overlap_tokens
+        self.target_chunk_size = target_chunk_size
+
+        # Initialize components
+        self.scorer = ChunkQualityScorer(
+            embedding_fn=embedding_fn,
+            target_token_size=target_chunk_size
+        )
+
+        self.deduper = ChunkDeduplicator(
+            embedding_fn=embedding_fn,
+            similarity_threshold=dedup_threshold,
+            method="hybrid",
+            strategy="keep_longest"  # Keep the most complete version
+        )
+
+        self.overlap_optimizer = OverlapOptimizer(
+            embedding_fn=embedding_fn,
+            base_overlap=overlap_tokens,
+            min_overlap=20,
+            max_overlap=100,
+            method="hybrid"
+        )
+
+    def process(self,
+                chunks: List[Chunk],
+                chunker_name: str,
+                return_metrics: bool = True) -> tuple[List[Chunk], Dict[str, Any]]:
+        """
+        Apply post-processing pipeline to chunks.
+
+        Args:
+            chunks: Input chunks from a chunker
+            chunker_name: Name of the chunker that produced these chunks
+            return_metrics: Whether to return quality metrics
+
+        Returns:
+            Tuple of (processed_chunks, quality_metrics)
+        """
+        metrics = {
+            "post_processing_applied": False,
+            "is_native_chunker": chunker_name in NATIVE_CHUNKERS,
+            "original_count": len(chunks)
+        }
+
+        # Only apply optimizations to native chunkers
+        if chunker_name not in NATIVE_CHUNKERS:
+            # For bridges, just score quality but don't modify
+            if return_metrics and chunks:
+                quality_reports = self.scorer.score_chunks(chunks)
+                metrics["avg_quality_score"] = sum(r.overall_score for r in quality_reports) / len(quality_reports)
+                metrics["quality_dimensions"] = self.scorer.get_summary_stats(quality_reports).get("dimension_means", {})
+            return chunks, metrics
+
+        metrics["post_processing_applied"] = True
+        processed_chunks = list(chunks)
+
+        # Step 1: Deduplication
+        if self.enable_dedup and len(processed_chunks) > 1:
+            logger.info(f"[{chunker_name}] Post-processor: Starting deduplication (count={len(processed_chunks)})...")
+            dp_start = time.time()
+            dedup_result = self.deduper.deduplicate(processed_chunks)
+            processed_chunks = dedup_result.kept_chunks
+            metrics["dedup_removed"] = dedup_result.removed_count
+            metrics["dedup_groups"] = len(dedup_result.duplicate_groups)
+            logger.info(f"[{chunker_name}] Post-processor: Deduplication finished in {time.time()-dp_start:.2f}s (removed {dedup_result.removed_count})")
+
+        # Step 2: Overlap Optimization
+        if self.enable_overlap_opt and len(processed_chunks) > 1:
+            logger.info(f"[{chunker_name}] Post-processor: Starting overlap optimization...")
+            ov_start = time.time()
+            # Add overlap context to chunks
+            enhanced = self.overlap_optimizer.add_overlap_to_chunks(
+                processed_chunks,
+                overlap_tokens=self.overlap_tokens
+            )
+            processed_chunks = enhanced
+            metrics["overlap_enhanced"] = True
+            logger.info(f"[{chunker_name}] Post-processor: Overlap optimization finished in {time.time()-ov_start:.2f}s")
+
+        # Step 3: Quality Scoring (always applied, adds to metadata)
+        if processed_chunks:
+            logger.info(f"[{chunker_name}] Post-processor: Starting quality scoring (count={len(processed_chunks)})...")
+            qs_start = time.time()
+            quality_reports = self.scorer.score_chunks(processed_chunks)
+
+            # Add quality scores to chunk metadata (Cast to float for serialization)
+            for chunk, report in zip(processed_chunks, quality_reports):
+                chunk.meta["quality_score"] = float(report.overall_score)
+                chunk.meta["quality_coherence"] = float(report.coherence_score)
+                chunk.meta["quality_completeness"] = float(report.completeness_score)
+                chunk.meta["quality_density"] = float(report.density_score)
+                chunk.meta["quality_boundary"] = float(report.boundary_score)
+                chunk.meta["quality_size"] = float(report.size_score)
+                if report.issues:
+                    chunk.meta["quality_issues"] = report.issues
+
+            # Aggregate metrics
+            metrics["avg_quality_score"] = sum(r.overall_score for r in quality_reports) / len(quality_reports)
+            metrics["quality_dimensions"] = self.scorer.get_summary_stats(quality_reports).get("dimension_means", {})
+            metrics["chunks_with_issues"] = sum(1 for r in quality_reports if r.issues)
+            logger.info(f"[{chunker_name}] Post-processor: Quality scoring finished in {time.time()-qs_start:.2f}s")
+
+        metrics["final_count"] = len(processed_chunks)
+
+        return processed_chunks, metrics
+
+
+def apply_post_processing(chunks: List[Dict],
+                          chunker_name: str,
+                          embedding_fn: Callable = None,
+                          enable_dedup: bool = True,
+                          enable_overlap: bool = True,
+                          dedup_threshold: float = 0.90,
+                          overlap_tokens: int = 50) -> tuple[List[Dict], Dict[str, Any]]:
+    """
+    Convenience function to apply post-processing to dict-format chunks.
+
+    This is the main entry point for the autochunker.py integration.
+
+    Args:
+        chunks: List of chunk dictionaries with id, doc_id, text, meta
+        chunker_name: Name of the chunker
+        embedding_fn: Optional embedding function
+        enable_dedup: Whether to deduplicate
+        enable_overlap: Whether to optimize overlap
+        dedup_threshold: Similarity threshold for deduplication
+        overlap_tokens: Overlap size in tokens
+
+    Returns:
+        Tuple of (processed_chunk_dicts, quality_metrics)
+    """
+    # Convert dicts to Chunk objects
+    chunk_objects = [
+        Chunk(
+            id=c["id"],
+            doc_id=c["doc_id"],
+            text=c["text"],
+            meta=c.get("meta", {})
+        ) for c in chunks
+    ]
+
+    # Create processor and run
+    processor = ChunkPostProcessor(
+        enable_dedup=enable_dedup,
+        enable_overlap_opt=enable_overlap,
+        embedding_fn=embedding_fn,
+        dedup_threshold=dedup_threshold,
+        overlap_tokens=overlap_tokens
+    )
+
+    processed_chunks, metrics = processor.process(chunk_objects, chunker_name)
+
+    # Convert back to dicts
+    result_dicts = [
+        {
+            "id": c.id,
+            "doc_id": c.doc_id,
+            "text": c.text,
+            "meta": c.meta
+        } for c in processed_chunks
+    ]
+
+    return result_dicts, metrics
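
A minimal usage sketch of the convenience entry point above (not part of the package diff). The chunk dicts and keyword arguments mirror the apply_post_processing signature shown in this file; the toy embedding function is an illustrative stand-in only, under the assumption that the scorer and deduplicator accept any callable mapping a list of strings to equal-length float vectors:

    from autochunk.quality.post_processor import apply_post_processing

    def toy_embed(texts):
        # Stand-in embedding: fixed-length character-count vectors (illustrative only)
        return [[float(t.count(ch)) for ch in "etaoinsrh "] for t in texts]

    # Hypothetical dict-format chunks, as produced by a native chunker
    chunks = [
        {"id": "c0", "doc_id": "doc1", "text": "First chunk of text.", "meta": {}},
        {"id": "c1", "doc_id": "doc1", "text": "Second chunk of text.", "meta": {}},
    ]

    processed, metrics = apply_post_processing(
        chunks,
        chunker_name="fixed_length",  # native chunker, so the full pipeline runs
        embedding_fn=toy_embed,
        enable_dedup=True,
        enable_overlap=True,
    )

    print(metrics["post_processing_applied"], metrics.get("avg_quality_score"))
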