autochunks 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import List, Dict, Any, Callable, Optional
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
import numpy as np
|
|
6
|
+
from ..chunkers.base import Chunk
|
|
7
|
+
from ..utils.text import count_tokens, split_sentences
|
|
8
|
+
from ..utils.logger import logger
|
|
9
|
+
import time
|
|
10
|
+
|
|
11
|
+
@dataclass
class ChunkQualityReport:
    """Comprehensive quality report for a chunk.

    Produced by ChunkQualityScorer. All dimension scores are floats in
    [0, 1]; higher is better.
    """
    chunk_id: str
    overall_score: float  # 0-1, higher is better; weighted mix of the dimension scores below

    # Individual dimension scores (0-1)
    coherence_score: float  # Internal semantic consistency
    completeness_score: float  # Self-containedness
    density_score: float  # Information density
    boundary_score: float  # Quality of start/end boundaries
    size_score: float  # Optimal size relative to target

    # Detailed metrics
    token_count: int  # total tokens in the chunk
    sentence_count: int  # number of detected sentences
    avg_sentence_length: float  # mean tokens per sentence (0 when no sentences)

    # Flags
    issues: List[str] = field(default_factory=list)  # human-readable problems found
    recommendations: List[str] = field(default_factory=list)  # suggested remediations, parallel to issues
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ChunkQualityScorer:
    """
    World-Class Chunk Quality Scoring System.

    Evaluates chunks across multiple quality dimensions to identify
    problematic chunks and guide optimization.

    QUALITY DIMENSIONS:
    1. Coherence: Internal semantic consistency (embedding similarity)
    2. Completeness: Self-containedness (no dangling references)
    3. Density: Information richness vs fluff
    4. Boundaries: Clean starts and ends (no mid-sentence cuts)
    5. Size: Optimal length for the target use case

    SCORING METHODOLOGY:
    - Each dimension is scored 0-1
    - Weighted combination for overall score
    - Issue detection with actionable recommendations
    """

    # Patterns indicating incomplete boundaries.
    # NOTE(review): r'^[a-z]' is only meaningful when matched WITHOUT
    # re.IGNORECASE — confirm callers do not pass that flag for it.
    INCOMPLETE_START_PATTERNS = [
        r'^[a-z]',  # Starts with lowercase
        r'^(and|but|or|so|then|however|therefore|thus|hence)\b',  # Starts with conjunction
        r'^(this|that|these|those|it|they|he|she)\b',  # Starts with pronoun
        r'^\.',  # Starts with period
        r'^\,',  # Starts with comma
    ]

    INCOMPLETE_END_PATTERNS = [
        r'[,;:]\s*$',  # Ends with comma/semicolon
        r'\b(and|or|but|the|a|an)\s*$',  # Ends with article/conjunction
        r'[^.!?\"\')\]]\s*$',  # Doesn't end with terminal punctuation
    ]

    # Filler words that reduce density.
    # NOTE(review): some entries are multi-word phrases; they only count if
    # the density scorer matches phrases in the text rather than testing
    # single whitespace-split tokens for membership.
    FILLER_WORDS = {
        'very', 'really', 'quite', 'rather', 'somewhat', 'basically',
        'actually', 'literally', 'just', 'simply', 'obviously', 'clearly',
        'in order to', 'due to the fact that', 'at this point in time'
    }

    def __init__(self,
                 embedding_fn: Optional[Callable[[List[str]], List[List[float]]]] = None,
                 target_token_size: int = 512,
                 weights: Optional[Dict[str, float]] = None):
        """
        Initialize the quality scorer.

        Args:
            embedding_fn: Function to generate embeddings for coherence scoring.
                When None, coherence falls back to lexical word overlap.
            target_token_size: Ideal chunk size for size scoring.
            weights: Custom weights for each dimension. Must contain the keys
                'coherence', 'completeness', 'density', 'boundary', 'size'
                if provided; defaults sum to 1.0.
        """
        self.embedding_fn = embedding_fn
        self.target_token_size = target_token_size
        self.weights = weights or {
            'coherence': 0.25,
            'completeness': 0.20,
            'density': 0.15,
            'boundary': 0.20,
            'size': 0.20
        }
|
|
97
|
+
|
|
98
|
+
def score_chunk(self, chunk: Chunk) -> ChunkQualityReport:
|
|
99
|
+
"""
|
|
100
|
+
Generate comprehensive quality report for a single chunk.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
chunk: The chunk to evaluate
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
ChunkQualityReport with all metrics
|
|
107
|
+
"""
|
|
108
|
+
text = chunk.text
|
|
109
|
+
issues = []
|
|
110
|
+
recommendations = []
|
|
111
|
+
|
|
112
|
+
# Basic metrics
|
|
113
|
+
token_count = count_tokens(text)
|
|
114
|
+
sentences = split_sentences(text)
|
|
115
|
+
sentence_count = len(sentences)
|
|
116
|
+
avg_sentence_length = np.mean([count_tokens(s) for s in sentences]) if sentences else 0
|
|
117
|
+
|
|
118
|
+
# 1. Coherence Score
|
|
119
|
+
coherence_score = self._score_coherence(sentences)
|
|
120
|
+
if coherence_score < 0.6:
|
|
121
|
+
issues.append("Low internal coherence - chunk may contain unrelated content")
|
|
122
|
+
recommendations.append("Consider splitting at topic boundaries")
|
|
123
|
+
|
|
124
|
+
# 2. Completeness Score
|
|
125
|
+
completeness_score = self._score_completeness(text)
|
|
126
|
+
if completeness_score < 0.7:
|
|
127
|
+
issues.append("Chunk may not be self-contained")
|
|
128
|
+
recommendations.append("Resolve pronouns and add context")
|
|
129
|
+
|
|
130
|
+
# 3. Density Score
|
|
131
|
+
density_score = self._score_density(text, sentences)
|
|
132
|
+
if density_score < 0.5:
|
|
133
|
+
issues.append("Low information density")
|
|
134
|
+
recommendations.append("Remove filler words and redundant phrases")
|
|
135
|
+
|
|
136
|
+
# 4. Boundary Score
|
|
137
|
+
boundary_score = self._score_boundaries(text)
|
|
138
|
+
if boundary_score < 0.7:
|
|
139
|
+
issues.append("Incomplete boundaries detected")
|
|
140
|
+
recommendations.append("Adjust split points to sentence boundaries")
|
|
141
|
+
|
|
142
|
+
# 5. Size Score
|
|
143
|
+
size_score = self._score_size(token_count)
|
|
144
|
+
if size_score < 0.6:
|
|
145
|
+
if token_count < self.target_token_size * 0.3:
|
|
146
|
+
issues.append("Chunk is too small")
|
|
147
|
+
recommendations.append("Merge with adjacent chunks")
|
|
148
|
+
else:
|
|
149
|
+
issues.append("Chunk is too large")
|
|
150
|
+
recommendations.append("Split into smaller chunks")
|
|
151
|
+
|
|
152
|
+
# Calculate overall score
|
|
153
|
+
overall_score = (
|
|
154
|
+
self.weights['coherence'] * coherence_score +
|
|
155
|
+
self.weights['completeness'] * completeness_score +
|
|
156
|
+
self.weights['density'] * density_score +
|
|
157
|
+
self.weights['boundary'] * boundary_score +
|
|
158
|
+
self.weights['size'] * size_score
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
return ChunkQualityReport(
|
|
162
|
+
chunk_id=chunk.id,
|
|
163
|
+
overall_score=overall_score,
|
|
164
|
+
coherence_score=coherence_score,
|
|
165
|
+
completeness_score=completeness_score,
|
|
166
|
+
density_score=density_score,
|
|
167
|
+
boundary_score=boundary_score,
|
|
168
|
+
size_score=size_score,
|
|
169
|
+
token_count=token_count,
|
|
170
|
+
sentence_count=sentence_count,
|
|
171
|
+
avg_sentence_length=avg_sentence_length,
|
|
172
|
+
issues=issues,
|
|
173
|
+
recommendations=recommendations
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
def score_chunks(self, chunks: List[Chunk]) -> List[ChunkQualityReport]:
|
|
177
|
+
"""
|
|
178
|
+
Score multiple chunks with optimized batch embedding and progress logging.
|
|
179
|
+
"""
|
|
180
|
+
if not chunks:
|
|
181
|
+
return []
|
|
182
|
+
|
|
183
|
+
logger.info(f"QualityScorer: Scoring {len(chunks)} chunks...")
|
|
184
|
+
start_time = time.time()
|
|
185
|
+
|
|
186
|
+
# Optimization: Pre-collect and batch embed all sentences for coherence scoring
|
|
187
|
+
all_sentences = []
|
|
188
|
+
chunk_sentence_maps = []
|
|
189
|
+
|
|
190
|
+
if self.embedding_fn:
|
|
191
|
+
logger.info(f"QualityScorer: Splitting sentences for batch embedding...")
|
|
192
|
+
for chunk in chunks:
|
|
193
|
+
sentences = split_sentences(chunk.text)
|
|
194
|
+
start_idx = len(all_sentences)
|
|
195
|
+
all_sentences.extend(sentences)
|
|
196
|
+
end_idx = len(all_sentences)
|
|
197
|
+
chunk_sentence_maps.append((sentences, list(range(start_idx, end_idx))))
|
|
198
|
+
|
|
199
|
+
logger.info(f"QualityScorer: Batch embedding {len(all_sentences)} sentences...")
|
|
200
|
+
embed_start = time.time()
|
|
201
|
+
all_embeddings = self.embedding_fn(all_sentences)
|
|
202
|
+
logger.info(f"QualityScorer: Embedding finished in {time.time()-embed_start:.2f}s")
|
|
203
|
+
|
|
204
|
+
reports = []
|
|
205
|
+
for i, chunk in enumerate(chunks):
|
|
206
|
+
if i > 0 and i % 50 == 0:
|
|
207
|
+
logger.info(f"QualityScorer: Scoring progress {i}/{len(chunks)}...")
|
|
208
|
+
|
|
209
|
+
sentences, indices = chunk_sentence_maps[i]
|
|
210
|
+
embeddings = [all_embeddings[idx] for idx in indices]
|
|
211
|
+
# Pass pre-calculated embeddings to a specialized internal method
|
|
212
|
+
reports.append(self._score_chunk_with_cached_embeddings(chunk, sentences, embeddings))
|
|
213
|
+
|
|
214
|
+
logger.info(f"QualityScorer: Total scoring finished in {time.time()-start_time:.2f}s")
|
|
215
|
+
return reports
|
|
216
|
+
else:
|
|
217
|
+
# Fallback to serial scoring if no embedding function
|
|
218
|
+
logger.info(f"QualityScorer: Using non-embedding fallback scoring...")
|
|
219
|
+
results = [self.score_chunk(chunk) for chunk in chunks]
|
|
220
|
+
logger.info(f"QualityScorer: Fallback scoring finished in {time.time()-start_time:.2f}s")
|
|
221
|
+
return results
|
|
222
|
+
|
|
223
|
+
def _score_chunk_with_cached_embeddings(self,
|
|
224
|
+
chunk: Chunk,
|
|
225
|
+
sentences: List[str],
|
|
226
|
+
sentence_embeddings: List[List[float]]) -> ChunkQualityReport:
|
|
227
|
+
"""Internal version of score_chunk that uses pre-calculated embeddings."""
|
|
228
|
+
text = chunk.text
|
|
229
|
+
issues = []
|
|
230
|
+
recommendations = []
|
|
231
|
+
|
|
232
|
+
# Basic metrics
|
|
233
|
+
token_count = count_tokens(text)
|
|
234
|
+
sentence_count = len(sentences)
|
|
235
|
+
avg_sentence_length = np.mean([count_tokens(s) for s in sentences]) if sentences else 0
|
|
236
|
+
|
|
237
|
+
# 1. Coherence Score (using pre-calculated embeddings)
|
|
238
|
+
if self.embedding_fn:
|
|
239
|
+
coherence_score = float(self._score_coherence_cached(sentences, sentence_embeddings))
|
|
240
|
+
else:
|
|
241
|
+
coherence_score = float(self._lexical_coherence(sentences))
|
|
242
|
+
|
|
243
|
+
if coherence_score < 0.6:
|
|
244
|
+
issues.append("Low internal coherence")
|
|
245
|
+
recommendations.append("Consider splitting at topic boundaries")
|
|
246
|
+
|
|
247
|
+
completeness_score = float(self._score_completeness(text))
|
|
248
|
+
density_score = float(self._score_density(text, sentences))
|
|
249
|
+
boundary_score = float(self._score_boundaries(text))
|
|
250
|
+
size_score = float(self._score_size(token_count))
|
|
251
|
+
|
|
252
|
+
overall_score = float(
|
|
253
|
+
self.weights['coherence'] * coherence_score +
|
|
254
|
+
self.weights['completeness'] * completeness_score +
|
|
255
|
+
self.weights['density'] * density_score +
|
|
256
|
+
self.weights['boundary'] * boundary_score +
|
|
257
|
+
self.weights['size'] * size_score
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
return ChunkQualityReport(
|
|
261
|
+
chunk_id=chunk.id, overall_score=overall_score,
|
|
262
|
+
coherence_score=coherence_score, completeness_score=completeness_score,
|
|
263
|
+
density_score=density_score, boundary_score=boundary_score,
|
|
264
|
+
size_score=size_score, token_count=token_count,
|
|
265
|
+
sentence_count=sentence_count, avg_sentence_length=float(avg_sentence_length),
|
|
266
|
+
issues=issues, recommendations=recommendations
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
def _score_coherence_cached(self, sentences: List[str], embeddings: List[List[float]]) -> float:
|
|
270
|
+
"""Score coherence using provided embeddings."""
|
|
271
|
+
if len(sentences) <= 1: return 1.0
|
|
272
|
+
if not embeddings: return self._lexical_coherence(sentences)
|
|
273
|
+
|
|
274
|
+
emb_array = np.array(embeddings)
|
|
275
|
+
similarities = []
|
|
276
|
+
for i in range(len(emb_array) - 1):
|
|
277
|
+
for j in range(i + 1, len(emb_array)):
|
|
278
|
+
norm_i = np.linalg.norm(emb_array[i])
|
|
279
|
+
norm_j = np.linalg.norm(emb_array[j])
|
|
280
|
+
if norm_i > 0 and norm_j > 0:
|
|
281
|
+
sim = np.dot(emb_array[i], emb_array[j]) / (norm_i * norm_j)
|
|
282
|
+
similarities.append(sim)
|
|
283
|
+
return np.mean(similarities) if similarities else 0.5
|
|
284
|
+
|
|
285
|
+
def get_summary_stats(self, reports: List[ChunkQualityReport]) -> Dict[str, Any]:
|
|
286
|
+
"""Get aggregate statistics from multiple reports with plain Python types for serialization."""
|
|
287
|
+
if not reports:
|
|
288
|
+
return {}
|
|
289
|
+
|
|
290
|
+
scores = [r.overall_score for r in reports]
|
|
291
|
+
return {
|
|
292
|
+
'count': len(reports),
|
|
293
|
+
'mean_score': float(np.mean(scores)),
|
|
294
|
+
'std_score': float(np.std(scores)),
|
|
295
|
+
'min_score': float(np.min(scores)),
|
|
296
|
+
'max_score': float(np.max(scores)),
|
|
297
|
+
'below_threshold': int(sum(1 for s in scores if s < 0.6)),
|
|
298
|
+
'dimension_means': {
|
|
299
|
+
'coherence': float(np.mean([r.coherence_score for r in reports])),
|
|
300
|
+
'completeness': float(np.mean([r.completeness_score for r in reports])),
|
|
301
|
+
'density': float(np.mean([r.density_score for r in reports])),
|
|
302
|
+
'boundary': float(np.mean([r.boundary_score for r in reports])),
|
|
303
|
+
'size': float(np.mean([r.size_score for r in reports]))
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
def _score_coherence(self, sentences: List[str]) -> float:
|
|
308
|
+
"""Score internal semantic consistency."""
|
|
309
|
+
if len(sentences) <= 1:
|
|
310
|
+
return 1.0 # Single sentence is coherent by definition
|
|
311
|
+
|
|
312
|
+
if self.embedding_fn is None:
|
|
313
|
+
# Fallback: use lexical overlap
|
|
314
|
+
return self._lexical_coherence(sentences)
|
|
315
|
+
|
|
316
|
+
try:
|
|
317
|
+
embeddings = np.array(self.embedding_fn(sentences))
|
|
318
|
+
n = len(embeddings)
|
|
319
|
+
if n <= 1: return 1.0
|
|
320
|
+
|
|
321
|
+
# Vectorized pairwise cosine similarity
|
|
322
|
+
# Normalize embeddings first
|
|
323
|
+
norms = np.linalg.norm(embeddings, axis=1)
|
|
324
|
+
norms[norms < 1e-9] = 1.0
|
|
325
|
+
norm_embeddings = embeddings / norms[:, np.newaxis]
|
|
326
|
+
|
|
327
|
+
# Similarity matrix (N x N)
|
|
328
|
+
sim_matrix = norm_embeddings @ norm_embeddings.T
|
|
329
|
+
|
|
330
|
+
# Extract upper triangle (excluding diagonal) for pairwise mean
|
|
331
|
+
tri_indices = np.triu_indices(n, k=1)
|
|
332
|
+
pair_similarities = sim_matrix[tri_indices]
|
|
333
|
+
|
|
334
|
+
return float(np.mean(pair_similarities)) if pair_similarities.size > 0 else 0.5
|
|
335
|
+
except Exception as e:
|
|
336
|
+
logger.debug(f"Vectorized coherence scoring failed: {e}")
|
|
337
|
+
return self._lexical_coherence(sentences)
|
|
338
|
+
|
|
339
|
+
def _lexical_coherence(self, sentences: List[str]) -> float:
|
|
340
|
+
"""Fallback coherence using word overlap."""
|
|
341
|
+
if len(sentences) <= 1:
|
|
342
|
+
return 1.0
|
|
343
|
+
|
|
344
|
+
word_sets = [set(s.lower().split()) for s in sentences]
|
|
345
|
+
overlaps = []
|
|
346
|
+
|
|
347
|
+
for i in range(len(word_sets) - 1):
|
|
348
|
+
intersection = word_sets[i] & word_sets[i + 1]
|
|
349
|
+
union = word_sets[i] | word_sets[i + 1]
|
|
350
|
+
if union:
|
|
351
|
+
overlaps.append(len(intersection) / len(union))
|
|
352
|
+
|
|
353
|
+
return np.mean(overlaps) if overlaps else 0.5
|
|
354
|
+
|
|
355
|
+
def _score_completeness(self, text: str) -> float:
|
|
356
|
+
"""Score self-containedness."""
|
|
357
|
+
import re
|
|
358
|
+
|
|
359
|
+
score = 1.0
|
|
360
|
+
penalties = []
|
|
361
|
+
|
|
362
|
+
# Check for unresolved pronouns at start
|
|
363
|
+
first_sentence = split_sentences(text)[0] if split_sentences(text) else text[:100]
|
|
364
|
+
pronoun_pattern = r'\b(this|that|these|those|it|they|he|she|him|her|them)\b'
|
|
365
|
+
if re.search(pronoun_pattern, first_sentence.lower()):
|
|
366
|
+
penalties.append(0.15)
|
|
367
|
+
|
|
368
|
+
# Check for references to external context
|
|
369
|
+
external_refs = [
|
|
370
|
+
r'\babove\b', r'\bbelow\b', r'\bpreviously\b', r'\bfollowing\b',
|
|
371
|
+
r'\bas mentioned\b', r'\bas discussed\b', r'\bsee \w+\b'
|
|
372
|
+
]
|
|
373
|
+
for pattern in external_refs:
|
|
374
|
+
if re.search(pattern, text.lower()):
|
|
375
|
+
penalties.append(0.1)
|
|
376
|
+
|
|
377
|
+
# Check for incomplete lists/enumerations
|
|
378
|
+
if re.search(r'\b(firstly|first|1\.)\b', text.lower()):
|
|
379
|
+
if not re.search(r'\b(secondly|second|2\.)\b', text.lower()):
|
|
380
|
+
penalties.append(0.1) # Started enumeration but incomplete
|
|
381
|
+
|
|
382
|
+
return max(0.0, score - sum(penalties))
|
|
383
|
+
|
|
384
|
+
def _score_density(self, text: str, sentences: List[str]) -> float:
|
|
385
|
+
"""Score information density."""
|
|
386
|
+
if not text:
|
|
387
|
+
return 0.0
|
|
388
|
+
|
|
389
|
+
words = text.lower().split()
|
|
390
|
+
word_count = len(words)
|
|
391
|
+
|
|
392
|
+
if word_count == 0:
|
|
393
|
+
return 0.0
|
|
394
|
+
|
|
395
|
+
# Count filler words
|
|
396
|
+
filler_count = sum(1 for w in words if w in self.FILLER_WORDS)
|
|
397
|
+
filler_ratio = filler_count / word_count
|
|
398
|
+
|
|
399
|
+
# Check for repetition
|
|
400
|
+
unique_words = set(words)
|
|
401
|
+
uniqueness_ratio = len(unique_words) / word_count
|
|
402
|
+
|
|
403
|
+
# Sentence length variance (too uniform = formulaic)
|
|
404
|
+
if len(sentences) > 1:
|
|
405
|
+
lengths = [len(s.split()) for s in sentences]
|
|
406
|
+
length_variance = np.std(lengths) / (np.mean(lengths) + 1)
|
|
407
|
+
variance_score = min(1.0, length_variance * 2) # Some variance is good
|
|
408
|
+
else:
|
|
409
|
+
variance_score = 0.5
|
|
410
|
+
|
|
411
|
+
density = (
|
|
412
|
+
0.4 * (1 - filler_ratio * 5) + # Penalize fillers
|
|
413
|
+
0.4 * uniqueness_ratio +
|
|
414
|
+
0.2 * variance_score
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
return max(0.0, min(1.0, density))
|
|
418
|
+
|
|
419
|
+
def _score_boundaries(self, text: str) -> float:
|
|
420
|
+
"""Score quality of chunk boundaries."""
|
|
421
|
+
import re
|
|
422
|
+
|
|
423
|
+
score = 1.0
|
|
424
|
+
|
|
425
|
+
# Check start
|
|
426
|
+
for pattern in self.INCOMPLETE_START_PATTERNS:
|
|
427
|
+
if re.match(pattern, text.strip(), re.IGNORECASE):
|
|
428
|
+
score -= 0.15
|
|
429
|
+
break
|
|
430
|
+
|
|
431
|
+
# Check end
|
|
432
|
+
for pattern in self.INCOMPLETE_END_PATTERNS:
|
|
433
|
+
if re.search(pattern, text.strip()):
|
|
434
|
+
score -= 0.15
|
|
435
|
+
break
|
|
436
|
+
|
|
437
|
+
# Bonus for clean sentence boundaries
|
|
438
|
+
text = text.strip()
|
|
439
|
+
if text and text[-1] in '.!?"\'':
|
|
440
|
+
score += 0.1
|
|
441
|
+
|
|
442
|
+
return max(0.0, min(1.0, score))
|
|
443
|
+
|
|
444
|
+
def _score_size(self, token_count: int) -> float:
|
|
445
|
+
"""Score chunk size relative to target."""
|
|
446
|
+
if self.target_token_size == 0:
|
|
447
|
+
return 1.0
|
|
448
|
+
|
|
449
|
+
ratio = token_count / self.target_token_size
|
|
450
|
+
|
|
451
|
+
# Optimal range: 0.7 - 1.3 of target
|
|
452
|
+
if 0.7 <= ratio <= 1.3:
|
|
453
|
+
return 1.0
|
|
454
|
+
elif 0.5 <= ratio <= 1.5:
|
|
455
|
+
return 0.8
|
|
456
|
+
elif 0.3 <= ratio <= 2.0:
|
|
457
|
+
return 0.5
|
|
458
|
+
else:
|
|
459
|
+
return 0.2
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import List, Dict, Any, Tuple
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class InMemoryIndex:
    """Brute-force in-memory vector index.

    Similarity is a plain dot product, so callers should supply
    L2-normalized vectors if cosine similarity is intended. The numpy
    matrix is built lazily and cached until the next add().
    """

    def __init__(self, dim: int):
        self.dim = dim
        self.vecs: List[List[float]] = []
        self._vec_array = None  # Cache for numpy representation
        self.meta: List[Dict[str, Any]] = []

    def add(self, vectors: List[List[float]], metas: List[Dict[str, Any]]):
        """Append vectors with parallel metadata; invalidates the cache."""
        self.vecs.extend(vectors)
        self.meta.extend(metas)
        self._vec_array = None  # Invalidate cache on add

    def _rank(self, sims, top_k: int) -> List[Tuple[int, float]]:
        """Top-k (index, score) pairs sorted best-first via partial selection."""
        idxs = np.argpartition(-sims, top_k - 1)[:top_k]
        return sorted(((int(i), float(sims[i])) for i in idxs), key=lambda x: -x[1])

    def search(self, query_vec: List[float], top_k: int = 10):
        """Search one query (returns a ranked list of (index, score) pairs)
        or a batch of queries (returns one ranked list per query row)."""
        if self._vec_array is None:
            if not self.vecs:
                return []
            self._vec_array = np.array(self.vecs, dtype=np.float32)

        V = self._vec_array
        q = np.array(query_vec, dtype=np.float32)

        top_k = min(top_k, V.shape[0])
        if top_k <= 0:
            # Explicit empty result; previously this relied on
            # argpartition(..., kth=-1)[:0] happening to be empty.
            return [] if q.ndim == 1 else [[] for _ in range(q.shape[0])]

        if q.ndim == 1:
            # Single query vector.
            return self._rank(V @ q, top_k)

        # Batch mode search support.
        sims = (V @ q.T).T
        return [self._rank(row, top_k) for row in sims]
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
|
|
2
|
+
import os, json
|
|
3
|
+
from . import plan as plan_mod
|
|
4
|
+
from ..utils.hashing import sha256_hex
|
|
5
|
+
|
|
6
|
+
class Cache:
    """Tiny filesystem-backed key/value cache rooted at a directory."""

    def __init__(self, root: str):
        self.root = root
        os.makedirs(self.root, exist_ok=True)

    def path_for(self, *parts: str) -> str:
        """Absolute path of a key inside the cache root."""
        return os.path.join(self.root, *parts)

    def get_json(self, key: str):
        """Return the parsed JSON stored under key, or None if absent.

        FIX: uses EAFP (open + FileNotFoundError) instead of the previous
        exists()-then-open pair, which was racy between check and read.
        """
        try:
            with open(self.path_for(key + ".json"), 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return None

    def set_json(self, key: str, value):
        """Serialize value as pretty-printed UTF-8 JSON under key."""
        p = self.path_for(key + ".json")
        os.makedirs(os.path.dirname(p), exist_ok=True)
        with open(p, 'w', encoding='utf-8') as f:
            json.dump(value, f, ensure_ascii=False, indent=2)

    def put_bytes(self, key: str, data: bytes):
        """Write raw bytes under key (no extension is appended)."""
        p = self.path_for(key)
        os.makedirs(os.path.dirname(p), exist_ok=True)
        with open(p, 'wb') as f:
            f.write(data)

    def has(self, key: str) -> bool:
        """True if the exact key path exists under the root."""
        return os.path.exists(self.path_for(key))
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import os, json, time
|
|
4
|
+
from dataclasses import dataclass, asdict, field
|
|
5
|
+
from typing import Dict, Any
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
@dataclass
class Plan:
    """Serializable record of a chosen chunking pipeline for a corpus."""
    id: str
    corpus_hash: str
    generator_pipeline: Dict[str, Any]
    metrics: Dict[str, Any]
    embedding: Dict[str, Any]
    created_at: str = field(default_factory=lambda: time.strftime("%Y-%m-%d %H:%M:%S"))

    def apply(self, documents, chunker) -> list:
        """Delegate application to chunker with the saved generator params."""
        pipeline = self.generator_pipeline
        return chunker.apply_with_generator(
            documents,
            pipeline.get("name"),
            pipeline.get("params", {}),
        )

    def to_langchain(self) -> 'AutoChunkLangChainAdapter':
        # Imported lazily so LangChain stays an optional dependency.
        from ..adapters.langchain import AutoChunkLangChainAdapter
        return AutoChunkLangChainAdapter(plan=self)

    def to_llamaindex(self) -> 'AutoChunkLlamaIndexAdapter':
        # Imported lazily so LlamaIndex stays an optional dependency.
        from ..adapters.llamaindex import AutoChunkLlamaIndexAdapter
        return AutoChunkLlamaIndexAdapter(plan=self)

    @staticmethod
    def write(path: str, plan: 'Plan'):
        """Persist plan to path as YAML (field order preserved)."""
        with open(path, 'w', encoding='utf-8') as f:
            yaml.safe_dump(asdict(plan), f, sort_keys=False, allow_unicode=True)

    @staticmethod
    def read(path: str) -> 'Plan':
        """Load a Plan previously written with Plan.write."""
        with open(path, 'r', encoding='utf-8') as f:
            return Plan(**yaml.safe_load(f))
|
|
File without changes
|