autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/quality/deduplicator.py
@@ -0,0 +1,326 @@
+
+from __future__ import annotations
+from typing import List, Dict, Any, Callable, Optional, Tuple, Set
+from dataclasses import dataclass
+import hashlib
+import numpy as np
+from ..chunkers.base import Chunk
+from ..utils.text import count_tokens
+from ..utils.logger import logger
+import time
+
+@dataclass
+class DeduplicationResult:
+    """Result of deduplication operation."""
+    original_count: int
+    deduplicated_count: int
+    removed_count: int
+    duplicate_groups: List[List[str]]  # Groups of chunk IDs that are duplicates
+    kept_chunks: List[Chunk]
+    removed_chunks: List[Chunk]
+
+
+class ChunkDeduplicator:
+    """
+    World-Class Chunk Deduplication System.
+
+    Identifies and removes duplicate or near-duplicate chunks using
+    multiple similarity detection methods.
+
+    DEDUPLICATION METHODS:
+    1. Exact Hash: SHA-256 over normalized text for identical content
+    2. MinHash LSH: Locality-Sensitive Hashing for near-duplicates
+    3. Semantic: Embedding-based similarity for paraphrases
+    4. N-gram Jaccard: Character/word n-gram overlap
+
+    STRATEGIES:
+    - keep_first: Keep the first occurrence
+    - keep_longest: Keep the longest version
+    - keep_best: Keep the highest quality (requires scorer)
+    - merge: Merge duplicates into a single enhanced chunk
+    """
+
+    def __init__(self,
+                 embedding_fn: Optional[Callable[[List[str]], List[List[float]]]] = None,
+                 similarity_threshold: float = 0.85,
+                 method: str = "hybrid",
+                 strategy: str = "keep_first",
+                 minhash_permutations: int = 128,
+                 ngram_size: int = 3):
+        """
+        Initialize the deduplicator.
+
+        Args:
+            embedding_fn: Function for semantic similarity (optional).
+            similarity_threshold: Threshold for considering chunks as duplicates (0-1).
+            method: "exact", "minhash", "semantic", "ngram", or "hybrid" (all methods).
+            strategy: "keep_first", "keep_longest", "keep_best", "merge".
+            minhash_permutations: Number of permutations for MinHash.
+            ngram_size: Size of n-grams for Jaccard similarity.
+        """
+        self.embedding_fn = embedding_fn
+        self.similarity_threshold = similarity_threshold
+        self.method = method
+        self.strategy = strategy
+        self.minhash_permutations = minhash_permutations
+        self.ngram_size = ngram_size
+
+    def deduplicate(self,
+                    chunks: List[Chunk],
+                    quality_scorer=None) -> DeduplicationResult:
+        """
+        Remove duplicate chunks from a list with optimized batch embedding.
+        """
+        if not chunks:
+            return DeduplicationResult(0, 0, 0, [], [], [])
+
+        # Optimization: Pre-calculate all embeddings for semantic similarity if needed
+        all_embeddings = None
+        if self.embedding_fn and (self.method == "semantic" or self.method == "hybrid"):
+            all_embeddings = np.array(self.embedding_fn([c.text for c in chunks]))
+
+        # Find all duplicate groups (passing pre-calculated embeddings)
+        duplicate_groups = self._find_duplicate_groups(chunks, all_embeddings)
+
+        # Decide which to keep from each group
+        kept_ids: Set[str] = set()
+        removed_ids: Set[str] = set()
+
+        for group in duplicate_groups:
+            if len(group) <= 1:
+                kept_ids.update(group)
+                continue
+
+            group_chunks = [c for c in chunks if c.id in group]
+
+            if self.strategy == "keep_first":
+                # Keep the one with lowest index
+                chunk_indices = {c.id: idx for idx, c in enumerate(chunks)}
+                sorted_group = sorted(group_chunks, key=lambda c: chunk_indices[c.id])
+                kept_ids.add(sorted_group[0].id)
+                removed_ids.update(c.id for c in sorted_group[1:])
+
+            elif self.strategy == "keep_longest":
+                # Keep the longest
+                sorted_group = sorted(group_chunks, key=lambda c: len(c.text), reverse=True)
+                kept_ids.add(sorted_group[0].id)
+                removed_ids.update(c.id for c in sorted_group[1:])
+
+            elif self.strategy == "keep_best":
+                if quality_scorer:
+                    # Note: quality_scorer.score_chunks is already optimized for batching
+                    reports = quality_scorer.score_chunks(group_chunks)
+                    scored = [(c, r.overall_score) for c, r in zip(group_chunks, reports)]
+                    sorted_group = sorted(scored, key=lambda x: x[1], reverse=True)
+                    kept_ids.add(sorted_group[0][0].id)
+                    removed_ids.update(c.id for c, _ in sorted_group[1:])
+                else:
+                    # Fallback to keep_longest
+                    sorted_group = sorted(group_chunks, key=lambda c: len(c.text), reverse=True)
+                    kept_ids.add(sorted_group[0].id)
+                    removed_ids.update(c.id for c in sorted_group[1:])
+
+            elif self.strategy == "merge":
+                # Merge: currently keeps the first occurrence; duplicate content is not actually merged here
+                chunk_indices = {c.id: idx for idx, c in enumerate(chunks)}
+                sorted_group = sorted(group_chunks, key=lambda c: chunk_indices[c.id])
+                kept_ids.add(sorted_group[0].id)
+                removed_ids.update(c.id for c in sorted_group[1:])
+
+        # Add non-duplicate chunks to kept
+        all_grouped_ids = set()
+        for group in duplicate_groups:
+            all_grouped_ids.update(group)
+
+        for chunk in chunks:
+            if chunk.id not in all_grouped_ids:
+                kept_ids.add(chunk.id)
+
+        # Build result
+        kept_chunks = [c for c in chunks if c.id in kept_ids]
+        removed_chunks = [c for c in chunks if c.id in removed_ids]
+
+        return DeduplicationResult(
+            original_count=len(chunks),
+            deduplicated_count=len(kept_chunks),
+            removed_count=len(removed_chunks),
+            duplicate_groups=[list(g) for g in duplicate_groups if len(g) > 1],
+            kept_chunks=kept_chunks,
+            removed_chunks=removed_chunks
+        )
+
+    def _find_duplicate_groups(self, chunks: List[Chunk], all_embeddings: Optional[np.ndarray] = None) -> List[Set[str]]:
+        """Find groups of duplicate chunks with high-performance vectorization."""
+        n = len(chunks)
+        if n == 0: return []
+
+        logger.info(f"Deduplicator: Optimizing similarity matrix for {n} chunks...")
+        start_time = time.time()
+
+        # 1. Pre-calculate all non-semantic features once
+        hashes = [self._exact_hash(c.text) for c in chunks]
+        ngrams = [self._get_ngrams(c.text, self.ngram_size) for c in chunks]
+        signatures = None
+        if self.method in ["minhash", "hybrid"]:
+            signatures = [self._minhash_signature(self._get_shingles(c.text)) for c in chunks]
+
+        similarity_matrix = np.zeros((n, n))
+
+        # 2. Vectorized Semantic Similarity (Dot Product)
+        if all_embeddings is not None:
+            logger.info("Deduplicator: Using vectorized matrix multiplication for semantic similarity")
+            # Normalize embeddings to use dot product as cosine similarity
+            norms = np.linalg.norm(all_embeddings, axis=1, keepdims=True)
+            norms[norms == 0] = 1.0
+            norm_embeddings = all_embeddings / norms
+            similarity_matrix = np.dot(norm_embeddings, norm_embeddings.T)
+        else:
+            # Identity diagonal if no semantic match
+            np.fill_diagonal(similarity_matrix, 1.0)
+
+        # 3. Fast-path loop for other methods
+        logger.info(f"Deduplicator: Running hybrid similarity checks for {n} chunks...")
+        for i in range(n):
+            if i > 0 and i % 100 == 0:
+                logger.info(f"Deduplicator: Progress {i}/{n}...")
+
+            for j in range(i + 1, n):
+                # If semantic is already > threshold, skip other expensive checks
+                if similarity_matrix[i, j] >= self.similarity_threshold:
+                    similarity_matrix[j, i] = similarity_matrix[i, j]
+                    continue
+
+                # Fast hash check
+                if hashes[i] == hashes[j]:
+                    similarity_matrix[i, j] = 1.0
+                    similarity_matrix[j, i] = 1.0
+                    continue
+
+                # Other methods (now using pre-calculated features)
+                scores = []
+                if self.method in ["ngram", "hybrid"]:
+                    # Jaccard
+                    u = ngrams[i] | ngrams[j]
+                    scores.append(len(ngrams[i] & ngrams[j]) / len(u) if u else 0.0)
+
+                if self.method in ["minhash", "hybrid"] and signatures:
+                    # Estimate Jaccard from signature overlap
+                    matches = sum(1 for a, b in zip(signatures[i], signatures[j]) if a == b)
+                    scores.append(matches / len(signatures[i]))
+
+                if scores:
+                    sim = max(scores)
+                    if sim > similarity_matrix[i, j]:
+                        similarity_matrix[i, j] = sim
+                        similarity_matrix[j, i] = sim
+
+        logger.info(f"Deduplicator: Vectorized matrix finished in {time.time()-start_time:.2f}s")
+
+        # 4. Find connected components above threshold (the actual duplicates)
+        visited = set()
+        groups = []
+
+        def dfs(idx: int, group: Set[str]):
+            if idx in visited:
+                return
+            visited.add(idx)
+            group.add(chunks[idx].id)
+
+            for j in range(n):
+                if j not in visited and similarity_matrix[idx, j] >= self.similarity_threshold:
+                    dfs(j, group)
+
+        for i in range(n):
+            if i not in visited:
+                group = set()
+                dfs(i, group)
+                if group:
+                    groups.append(group)
+
+        return groups
+
+    def _exact_hash(self, text: str) -> str:
+        """Generate hash for exact matching."""
+        normalized = " ".join(text.lower().split())
+        return hashlib.sha256(normalized.encode()).hexdigest()
+
+    def _minhash_similarity(self, text1: str, text2: str) -> float:
+        """MinHash-based similarity estimation."""
+        # Generate shingles
+        shingles1 = self._get_shingles(text1)
+        shingles2 = self._get_shingles(text2)
+
+        if not shingles1 or not shingles2:
+            return 0.0
+
+        # Generate MinHash signatures
+        sig1 = self._minhash_signature(shingles1)
+        sig2 = self._minhash_signature(shingles2)
+
+        # Estimate Jaccard from signature overlap
+        matches = sum(1 for a, b in zip(sig1, sig2) if a == b)
+        return matches / len(sig1)
+
+    def _get_shingles(self, text: str, k: int = 3) -> Set[str]:
+        """Get character k-shingles from text."""
+        text = text.lower()
+        return {text[i:i+k] for i in range(len(text) - k + 1)}
+
+    def _minhash_signature(self, shingles: Set[str]) -> List[int]:
+        """Generate MinHash signature for a set of shingles."""
+        # Simulate independent hash functions via different seeds.
+        # Note: Python's built-in hash() is salted per process, so signatures
+        # are only comparable within a single run.
+        signature = []
+        for seed in range(self.minhash_permutations):
+            min_hash = min(
+                (hash(shingle + str(seed)) & 0xFFFFFFFF for shingle in shingles),
+                default=0,  # keep the signature integer-typed when there are no shingles
+            )
+            signature.append(min_hash)
+        return signature
+
+    def _semantic_similarity(self,
+                             text1: str,
+                             text2: str,
+                             idx1: int = -1,
+                             idx2: int = -1,
+                             all_embeddings: Optional[np.ndarray] = None) -> float:
+        """Embedding-based semantic similarity with batch optimization support."""
+        if all_embeddings is not None and idx1 >= 0 and idx2 >= 0:
+            # Fast Path: Use pre-calculated embeddings
+            vec1, vec2 = all_embeddings[idx1], all_embeddings[idx2]
+            norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
+            if norm1 == 0 or norm2 == 0: return 0.0
+            return float(np.dot(vec1, vec2) / (norm1 * norm2))
+
+        if not self.embedding_fn:
+            return 0.0
+
+        try:
+            embeddings = self.embedding_fn([text1, text2])
+            vec1, vec2 = np.array(embeddings[0]), np.array(embeddings[1])
+
+            norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
+            if norm1 == 0 or norm2 == 0:
+                return 0.0
+
+            return float(np.dot(vec1, vec2) / (norm1 * norm2))
+        except Exception:
+            return 0.0
+
+    def _ngram_jaccard(self, text1: str, text2: str) -> float:
+        """N-gram Jaccard similarity."""
+        ngrams1 = self._get_ngrams(text1, self.ngram_size)
+        ngrams2 = self._get_ngrams(text2, self.ngram_size)
+
+        if not ngrams1 or not ngrams2:
+            return 0.0
+
+        intersection = ngrams1 & ngrams2
+        union = ngrams1 | ngrams2
+
+        return len(intersection) / len(union) if union else 0.0
+
+    def _get_ngrams(self, text: str, n: int) -> Set[Tuple[str, ...]]:
+        """Get word n-grams from text."""
+        words = text.lower().split()
+        return {tuple(words[i:i+n]) for i in range(len(words) - n + 1)}
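
For reference, a minimal usage sketch of the class above. It is illustrative only and not part of the wheel: it assumes Chunk (from autochunk/chunkers/base.py) can be constructed with id and text keyword arguments, which the attribute access in this module (c.id, c.text) suggests but which the base module may define differently.

    from autochunk.chunkers.base import Chunk
    from autochunk.quality.deduplicator import ChunkDeduplicator

    chunks = [
        Chunk(id="a", text="The quick brown fox jumps over the lazy dog."),
        Chunk(id="b", text="The quick brown fox jumps over the lazy dog."),  # exact duplicate of "a"
        Chunk(id="c", text="An unrelated sentence about chunk quality scoring."),
    ]

    # Without an embedding_fn, only the exact-hash, MinHash and n-gram checks run.
    dedup = ChunkDeduplicator(method="hybrid", strategy="keep_first", similarity_threshold=0.85)
    result = dedup.deduplicate(chunks)

    print(result.original_count, result.deduplicated_count, result.removed_count)  # expected: 3 2 1
    print([c.id for c in result.kept_chunks])  # expected: ["a", "c"]

Supplying an embedding_fn (for example, a callable built on one of the modules under autochunk/embedding/) together with method="semantic" or "hybrid" additionally routes pairs through the vectorized cosine-similarity matrix, which can catch paraphrased near-duplicates that the hash and n-gram checks miss.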