autochunks-0.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/quality/deduplicator.py
@@ -0,0 +1,326 @@

from __future__ import annotations
from typing import List, Dict, Any, Callable, Optional, Tuple, Set
from dataclasses import dataclass
import hashlib
import numpy as np
from ..chunkers.base import Chunk
from ..utils.text import count_tokens
from ..utils.logger import logger
import time

@dataclass
class DeduplicationResult:
    """Result of deduplication operation."""
    original_count: int
    deduplicated_count: int
    removed_count: int
    duplicate_groups: List[List[str]]  # Groups of chunk IDs that are duplicates
    kept_chunks: List[Chunk]
    removed_chunks: List[Chunk]


class ChunkDeduplicator:
    """
    World-Class Chunk Deduplication System.

    Identifies and removes duplicate or near-duplicate chunks using
    multiple similarity detection methods.

    DEDUPLICATION METHODS:
    1. Exact Hash: MD5/SHA256 for identical content
    2. MinHash LSH: Locality-Sensitive Hashing for near-duplicates
    3. Semantic: Embedding-based similarity for paraphrases
    4. N-gram Jaccard: Character/word n-gram overlap

    STRATEGIES:
    - keep_first: Keep the first occurrence
    - keep_longest: Keep the longest version
    - keep_best: Keep the highest quality (requires scorer)
    - merge: Merge duplicates into a single enhanced chunk
    """

    def __init__(self,
                 embedding_fn: Callable[[List[str]], List[List[float]]] = None,
                 similarity_threshold: float = 0.85,
                 method: str = "hybrid",
                 strategy: str = "keep_first",
                 minhash_permutations: int = 128,
                 ngram_size: int = 3):
        """
        Initialize the deduplicator.

        Args:
            embedding_fn: Function for semantic similarity (optional).
            similarity_threshold: Threshold for considering chunks as duplicates (0-1).
            method: "exact", "minhash", "semantic", "ngram", or "hybrid" (all methods).
            strategy: "keep_first", "keep_longest", "keep_best", "merge".
            minhash_permutations: Number of permutations for MinHash.
            ngram_size: Size of n-grams for Jaccard similarity.
        """
        self.embedding_fn = embedding_fn
        self.similarity_threshold = similarity_threshold
        self.method = method
        self.strategy = strategy
        self.minhash_permutations = minhash_permutations
        self.ngram_size = ngram_size

    def deduplicate(self,
                    chunks: List[Chunk],
                    quality_scorer=None) -> DeduplicationResult:
        """
        Remove duplicate chunks from a list with optimized batch embedding.
        """
        if not chunks:
            return DeduplicationResult(0, 0, 0, [], [], [])

        # Optimization: Pre-calculate all embeddings for semantic similarity if needed
        all_embeddings = None
        if self.embedding_fn and (self.method == "semantic" or self.method == "hybrid"):
            all_embeddings = np.array(self.embedding_fn([c.text for c in chunks]))

        # Find all duplicate groups (passing pre-calculated embeddings)
        duplicate_groups = self._find_duplicate_groups(chunks, all_embeddings)

        # Decide which to keep from each group
        kept_ids: Set[str] = set()
        removed_ids: Set[str] = set()

        for group in duplicate_groups:
            if len(group) <= 1:
                kept_ids.update(group)
                continue

            group_chunks = [c for c in chunks if c.id in group]

            if self.strategy == "keep_first":
                # Keep the one with lowest index
                chunk_indices = {c.id: idx for idx, c in enumerate(chunks)}
                sorted_group = sorted(group_chunks, key=lambda c: chunk_indices[c.id])
                kept_ids.add(sorted_group[0].id)
                removed_ids.update(c.id for c in sorted_group[1:])

            elif self.strategy == "keep_longest":
                # Keep the longest
                sorted_group = sorted(group_chunks, key=lambda c: len(c.text), reverse=True)
                kept_ids.add(sorted_group[0].id)
                removed_ids.update(c.id for c in sorted_group[1:])

            elif self.strategy == "keep_best":
                if quality_scorer:
                    # Note: quality_scorer.score_chunks is already optimized for batching
                    reports = quality_scorer.score_chunks(group_chunks)
                    scored = [(c, r.overall_score) for c, r in zip(group_chunks, reports)]
                    sorted_group = sorted(scored, key=lambda x: x[1], reverse=True)
                    kept_ids.add(sorted_group[0][0].id)
                    removed_ids.update(c.id for c, _ in sorted_group[1:])
                else:
                    # Fallback to keep_longest
                    sorted_group = sorted(group_chunks, key=lambda c: len(c.text), reverse=True)
                    kept_ids.add(sorted_group[0].id)
                    removed_ids.update(c.id for c in sorted_group[1:])

            elif self.strategy == "merge":
                # Keep first but enhance with info from others
                chunk_indices = {c.id: idx for idx, c in enumerate(chunks)}
                sorted_group = sorted(group_chunks, key=lambda c: chunk_indices[c.id])
                kept_ids.add(sorted_group[0].id)
                removed_ids.update(c.id for c in sorted_group[1:])

        # Add non-duplicate chunks to kept
        all_grouped_ids = set()
        for group in duplicate_groups:
            all_grouped_ids.update(group)

        for chunk in chunks:
            if chunk.id not in all_grouped_ids:
                kept_ids.add(chunk.id)

        # Build result
        kept_chunks = [c for c in chunks if c.id in kept_ids]
        removed_chunks = [c for c in chunks if c.id in removed_ids]

        return DeduplicationResult(
            original_count=len(chunks),
            deduplicated_count=len(kept_chunks),
            removed_count=len(removed_chunks),
            duplicate_groups=[list(g) for g in duplicate_groups if len(g) > 1],
            kept_chunks=kept_chunks,
            removed_chunks=removed_chunks
        )

    def _find_duplicate_groups(self, chunks: List[Chunk], all_embeddings: Optional[np.ndarray] = None) -> List[Set[str]]:
        """Find groups of duplicate chunks with high-performance vectorization."""
        n = len(chunks)
        if n == 0: return []

        logger.info(f"Deduplicator: Optimizing similarity matrix for {n} chunks...")
        start_time = time.time()

        # 1. Pre-calculate all non-semantic features once
        hashes = [self._exact_hash(c.text) for c in chunks]
        ngrams = [self._get_ngrams(c.text, self.ngram_size) for c in chunks]
        signatures = None
        if self.method in ["minhash", "hybrid"]:
            signatures = [self._minhash_signature(self._get_shingles(c.text)) for c in chunks]

        similarity_matrix = np.zeros((n, n))

        # 2. Vectorized Semantic Similarity (Dot Product)
        if all_embeddings is not None:
            logger.info("Deduplicator: Using vectorized matrix multiplication for semantic similarity")
            # Normalize embeddings to use dot product as cosine similarity
            norms = np.linalg.norm(all_embeddings, axis=1, keepdims=True)
            norms[norms == 0] = 1.0
            norm_embeddings = all_embeddings / norms
            similarity_matrix = np.dot(norm_embeddings, norm_embeddings.T)
        else:
            # Identity diagonal if no semantic match
            np.fill_diagonal(similarity_matrix, 1.0)

        # 3. Fast-path loop for other methods
        logger.info(f"Deduplicator: Running hybrid similarity checks for {n} chunks...")
        for i in range(n):
            if i > 0 and i % 100 == 0:
                logger.info(f"Deduplicator: Progress {i}/{n}...")

            for j in range(i + 1, n):
                # If semantic is already > threshold, skip other expensive checks
                if similarity_matrix[i, j] >= self.similarity_threshold:
                    similarity_matrix[j, i] = similarity_matrix[i, j]
                    continue

                # Fast hash check
                if hashes[i] == hashes[j]:
                    similarity_matrix[i, j] = 1.0
                    similarity_matrix[j, i] = 1.0
                    continue

                # Other methods (now using pre-calculated features)
                scores = []
                if self.method in ["ngram", "hybrid"]:
                    # Jaccard
                    u = ngrams[i] | ngrams[j]
                    scores.append(len(ngrams[i] & ngrams[j]) / len(u) if u else 0.0)

                if self.method in ["minhash", "hybrid"] and signatures:
                    # Estimate Jaccard from signature overlap
                    matches = sum(1 for a, b in zip(signatures[i], signatures[j]) if a == b)
                    scores.append(matches / len(signatures[i]))

                if scores:
                    sim = max(scores)
                    if sim > similarity_matrix[i, j]:
                        similarity_matrix[i, j] = sim
                        similarity_matrix[j, i] = sim

        logger.info(f"Deduplicator: Vectorized matrix finished in {time.time()-start_time:.2f}s")

        # 4. Find connected components above threshold (the actual duplicates)
        visited = set()
        groups = []

        def dfs(idx: int, group: Set[str]):
            if idx in visited:
                return
            visited.add(idx)
            group.add(chunks[idx].id)

            for j in range(n):
                if j not in visited and similarity_matrix[idx, j] >= self.similarity_threshold:
                    dfs(j, group)

        for i in range(n):
            if i not in visited:
                group = set()
                dfs(i, group)
                if group:
                    groups.append(group)

        return groups

    def _exact_hash(self, text: str) -> str:
        """Generate hash for exact matching."""
        normalized = " ".join(text.lower().split())
        return hashlib.sha256(normalized.encode()).hexdigest()

    def _minhash_similarity(self, text1: str, text2: str) -> float:
        """MinHash-based similarity estimation."""
        # Generate shingles
        shingles1 = self._get_shingles(text1)
        shingles2 = self._get_shingles(text2)

        if not shingles1 or not shingles2:
            return 0.0

        # Generate MinHash signatures
        sig1 = self._minhash_signature(shingles1)
        sig2 = self._minhash_signature(shingles2)

        # Estimate Jaccard from signature overlap
        matches = sum(1 for a, b in zip(sig1, sig2) if a == b)
        return matches / len(sig1)

    def _get_shingles(self, text: str, k: int = 3) -> Set[str]:
        """Get character k-shingles from text."""
        text = text.lower()
        return {text[i:i+k] for i in range(len(text) - k + 1)}

    def _minhash_signature(self, shingles: Set[str]) -> List[int]:
        """Generate MinHash signature for a set of shingles."""
        # Use hash functions via different seeds
        signature = []
        for seed in range(self.minhash_permutations):
            min_hash = float('inf')
            for shingle in shingles:
                h = hash(shingle + str(seed)) & 0xFFFFFFFF
                min_hash = min(min_hash, h)
            signature.append(min_hash)
        return signature

    def _semantic_similarity(self,
                             text1: str,
                             text2: str,
                             idx1: int = -1,
                             idx2: int = -1,
                             all_embeddings: Optional[np.ndarray] = None) -> float:
        """Embedding-based semantic similarity with batch optimization support."""
        if all_embeddings is not None and idx1 >= 0 and idx2 >= 0:
            # Fast Path: Use pre-calculated embeddings
            vec1, vec2 = all_embeddings[idx1], all_embeddings[idx2]
            norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
            if norm1 == 0 or norm2 == 0: return 0.0
            return float(np.dot(vec1, vec2) / (norm1 * norm2))

        if not self.embedding_fn:
            return 0.0

        try:
            embeddings = self.embedding_fn([text1, text2])
            vec1, vec2 = np.array(embeddings[0]), np.array(embeddings[1])

            norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
            if norm1 == 0 or norm2 == 0:
                return 0.0

            return float(np.dot(vec1, vec2) / (norm1 * norm2))
        except:
            return 0.0

    def _ngram_jaccard(self, text1: str, text2: str) -> float:
        """N-gram Jaccard similarity."""
        ngrams1 = self._get_ngrams(text1, self.ngram_size)
        ngrams2 = self._get_ngrams(text2, self.ngram_size)

        if not ngrams1 or not ngrams2:
            return 0.0

        intersection = ngrams1 & ngrams2
        union = ngrams1 | ngrams2

        return len(intersection) / len(union) if union else 0.0

    def _get_ngrams(self, text: str, n: int) -> Set[Tuple[str, ...]]:
        """Get word n-grams from text."""
        words = text.lower().split()
        return {tuple(words[i:i+n]) for i in range(len(words) - n + 1)}