autochunks 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,459 @@
1
+
2
+ from __future__ import annotations
3
+ from typing import List, Dict, Any, Callable, Optional
4
+ from dataclasses import dataclass, field
5
+ import numpy as np
6
+ from ..chunkers.base import Chunk
7
+ from ..utils.text import count_tokens, split_sentences
8
+ from ..utils.logger import logger
9
+ import time
10
+
11
@dataclass
class ChunkQualityReport:
    """Comprehensive quality report for a chunk.

    Produced by ChunkQualityScorer; every dimension score is in [0, 1],
    higher is better.
    """
    chunk_id: str
    overall_score: float  # 0-1, higher is better; weighted blend of the five dimensions

    # Individual dimension scores (0-1)
    coherence_score: float  # Internal semantic consistency
    completeness_score: float  # Self-containedness
    density_score: float  # Information density
    boundary_score: float  # Quality of start/end boundaries
    size_score: float  # Optimal size relative to target

    # Detailed metrics
    token_count: int
    sentence_count: int
    avg_sentence_length: float  # mean tokens per sentence

    # Flags
    issues: List[str] = field(default_factory=list)  # human-readable problems detected
    recommendations: List[str] = field(default_factory=list)  # suggested remediations, parallel to issues
32
+
33
+
34
class ChunkQualityScorer:
    """
    World-Class Chunk Quality Scoring System.

    Evaluates chunks across multiple quality dimensions to identify
    problematic chunks and guide optimization.

    QUALITY DIMENSIONS:
    1. Coherence: Internal semantic consistency (embedding similarity)
    2. Completeness: Self-containedness (no dangling references)
    3. Density: Information richness vs fluff
    4. Boundaries: Clean starts and ends (no mid-sentence cuts)
    5. Size: Optimal length for the target use case

    SCORING METHODOLOGY:
    - Each dimension is scored 0-1
    - Weighted combination for overall score
    - Issue detection with actionable recommendations
    """

    # Regex patterns indicating a chunk that starts mid-thought.
    INCOMPLETE_START_PATTERNS = [
        r'^[a-z]',  # Starts with lowercase
        r'^(and|but|or|so|then|however|therefore|thus|hence)\b',  # Starts with conjunction
        r'^(this|that|these|those|it|they|he|she)\b',  # Starts with pronoun
        r'^\.',  # Starts with period
        r'^\,',  # Starts with comma
    ]

    # Regex patterns indicating a chunk that ends mid-thought.
    INCOMPLETE_END_PATTERNS = [
        r'[,;:]\s*$',  # Ends with comma/semicolon
        r'\b(and|or|but|the|a|an)\s*$',  # Ends with article/conjunction
        r'[^.!?\"\')\]]\s*$',  # Doesn't end with terminal punctuation
    ]

    # Filler words that reduce density.
    # NOTE(review): the multi-word entries only count if the consumer matches
    # full phrases, not single whitespace-split tokens — verify usage.
    FILLER_WORDS = {
        'very', 'really', 'quite', 'rather', 'somewhat', 'basically',
        'actually', 'literally', 'just', 'simply', 'obviously', 'clearly',
        'in order to', 'due to the fact that', 'at this point in time'
    }
75
+
76
+ def __init__(self,
77
+ embedding_fn: Callable[[List[str]], List[List[float]]] = None,
78
+ target_token_size: int = 512,
79
+ weights: Dict[str, float] = None):
80
+ """
81
+ Initialize the quality scorer.
82
+
83
+ Args:
84
+ embedding_fn: Function to generate embeddings for coherence scoring.
85
+ target_token_size: Ideal chunk size for size scoring.
86
+ weights: Custom weights for each dimension.
87
+ """
88
+ self.embedding_fn = embedding_fn
89
+ self.target_token_size = target_token_size
90
+ self.weights = weights or {
91
+ 'coherence': 0.25,
92
+ 'completeness': 0.20,
93
+ 'density': 0.15,
94
+ 'boundary': 0.20,
95
+ 'size': 0.20
96
+ }
97
+
98
    def score_chunk(self, chunk: Chunk) -> ChunkQualityReport:
        """
        Generate comprehensive quality report for a single chunk.

        Coherence may embed sentences on demand via self.embedding_fn;
        see score_chunks() for the batched variant.

        Args:
            chunk: The chunk to evaluate

        Returns:
            ChunkQualityReport with all metrics
        """
        text = chunk.text
        issues = []
        recommendations = []

        # Basic metrics
        token_count = count_tokens(text)
        sentences = split_sentences(text)
        sentence_count = len(sentences)
        avg_sentence_length = np.mean([count_tokens(s) for s in sentences]) if sentences else 0

        # 1. Coherence Score
        coherence_score = self._score_coherence(sentences)
        if coherence_score < 0.6:
            issues.append("Low internal coherence - chunk may contain unrelated content")
            recommendations.append("Consider splitting at topic boundaries")

        # 2. Completeness Score
        completeness_score = self._score_completeness(text)
        if completeness_score < 0.7:
            issues.append("Chunk may not be self-contained")
            recommendations.append("Resolve pronouns and add context")

        # 3. Density Score
        density_score = self._score_density(text, sentences)
        if density_score < 0.5:
            issues.append("Low information density")
            recommendations.append("Remove filler words and redundant phrases")

        # 4. Boundary Score
        boundary_score = self._score_boundaries(text)
        if boundary_score < 0.7:
            issues.append("Incomplete boundaries detected")
            recommendations.append("Adjust split points to sentence boundaries")

        # 5. Size Score
        size_score = self._score_size(token_count)
        if size_score < 0.6:
            # Distinguish too-small from too-large to give a useful remedy.
            if token_count < self.target_token_size * 0.3:
                issues.append("Chunk is too small")
                recommendations.append("Merge with adjacent chunks")
            else:
                issues.append("Chunk is too large")
                recommendations.append("Split into smaller chunks")

        # Calculate overall score as the weighted blend of the five dimensions.
        overall_score = (
            self.weights['coherence'] * coherence_score +
            self.weights['completeness'] * completeness_score +
            self.weights['density'] * density_score +
            self.weights['boundary'] * boundary_score +
            self.weights['size'] * size_score
        )

        return ChunkQualityReport(
            chunk_id=chunk.id,
            overall_score=overall_score,
            coherence_score=coherence_score,
            completeness_score=completeness_score,
            density_score=density_score,
            boundary_score=boundary_score,
            size_score=size_score,
            token_count=token_count,
            sentence_count=sentence_count,
            avg_sentence_length=avg_sentence_length,
            issues=issues,
            recommendations=recommendations
        )
175
+
176
    def score_chunks(self, chunks: List[Chunk]) -> List[ChunkQualityReport]:
        """
        Score multiple chunks with optimized batch embedding and progress logging.

        When an embedding function is configured, all sentences from all
        chunks are embedded in a single batch call and the per-chunk scoring
        reuses those embeddings; otherwise each chunk is scored serially via
        score_chunk().

        Args:
            chunks: Chunks to evaluate (may be empty).

        Returns:
            One ChunkQualityReport per input chunk, in input order.
        """
        if not chunks:
            return []

        logger.info(f"QualityScorer: Scoring {len(chunks)} chunks...")
        start_time = time.time()

        # Optimization: Pre-collect and batch embed all sentences for coherence scoring
        all_sentences = []
        # For each chunk: (its sentences, their indices into all_sentences).
        chunk_sentence_maps = []

        if self.embedding_fn:
            logger.info(f"QualityScorer: Splitting sentences for batch embedding...")
            for chunk in chunks:
                sentences = split_sentences(chunk.text)
                start_idx = len(all_sentences)
                all_sentences.extend(sentences)
                end_idx = len(all_sentences)
                chunk_sentence_maps.append((sentences, list(range(start_idx, end_idx))))

            logger.info(f"QualityScorer: Batch embedding {len(all_sentences)} sentences...")
            embed_start = time.time()
            all_embeddings = self.embedding_fn(all_sentences)
            logger.info(f"QualityScorer: Embedding finished in {time.time()-embed_start:.2f}s")

            reports = []
            for i, chunk in enumerate(chunks):
                # Periodic progress logging for large corpora.
                if i > 0 and i % 50 == 0:
                    logger.info(f"QualityScorer: Scoring progress {i}/{len(chunks)}...")

                sentences, indices = chunk_sentence_maps[i]
                embeddings = [all_embeddings[idx] for idx in indices]
                # Pass pre-calculated embeddings to a specialized internal method
                reports.append(self._score_chunk_with_cached_embeddings(chunk, sentences, embeddings))

            logger.info(f"QualityScorer: Total scoring finished in {time.time()-start_time:.2f}s")
            return reports
        else:
            # Fallback to serial scoring if no embedding function
            logger.info(f"QualityScorer: Using non-embedding fallback scoring...")
            results = [self.score_chunk(chunk) for chunk in chunks]
            logger.info(f"QualityScorer: Fallback scoring finished in {time.time()-start_time:.2f}s")
            return results
222
+
223
    def _score_chunk_with_cached_embeddings(self,
                                            chunk: Chunk,
                                            sentences: List[str],
                                            sentence_embeddings: List[List[float]]) -> ChunkQualityReport:
        """Internal version of score_chunk that uses pre-calculated embeddings.

        Args:
            chunk: The chunk being scored.
            sentences: The chunk's text pre-split into sentences.
            sentence_embeddings: One embedding vector per entry in *sentences*.

        Returns:
            ChunkQualityReport with all numeric fields coerced to plain floats
            (keeps the report JSON-serializable).
        """
        text = chunk.text
        issues = []
        recommendations = []

        # Basic metrics
        token_count = count_tokens(text)
        sentence_count = len(sentences)
        avg_sentence_length = np.mean([count_tokens(s) for s in sentences]) if sentences else 0

        # 1. Coherence Score (using pre-calculated embeddings)
        if self.embedding_fn:
            coherence_score = float(self._score_coherence_cached(sentences, sentence_embeddings))
        else:
            coherence_score = float(self._lexical_coherence(sentences))

        if coherence_score < 0.6:
            issues.append("Low internal coherence")
            recommendations.append("Consider splitting at topic boundaries")

        completeness_score = float(self._score_completeness(text))
        density_score = float(self._score_density(text, sentences))
        boundary_score = float(self._score_boundaries(text))
        size_score = float(self._score_size(token_count))

        # Weighted blend of the five dimensions.
        overall_score = float(
            self.weights['coherence'] * coherence_score +
            self.weights['completeness'] * completeness_score +
            self.weights['density'] * density_score +
            self.weights['boundary'] * boundary_score +
            self.weights['size'] * size_score
        )

        return ChunkQualityReport(
            chunk_id=chunk.id, overall_score=overall_score,
            coherence_score=coherence_score, completeness_score=completeness_score,
            density_score=density_score, boundary_score=boundary_score,
            size_score=size_score, token_count=token_count,
            sentence_count=sentence_count, avg_sentence_length=float(avg_sentence_length),
            issues=issues, recommendations=recommendations
        )
268
+
269
+ def _score_coherence_cached(self, sentences: List[str], embeddings: List[List[float]]) -> float:
270
+ """Score coherence using provided embeddings."""
271
+ if len(sentences) <= 1: return 1.0
272
+ if not embeddings: return self._lexical_coherence(sentences)
273
+
274
+ emb_array = np.array(embeddings)
275
+ similarities = []
276
+ for i in range(len(emb_array) - 1):
277
+ for j in range(i + 1, len(emb_array)):
278
+ norm_i = np.linalg.norm(emb_array[i])
279
+ norm_j = np.linalg.norm(emb_array[j])
280
+ if norm_i > 0 and norm_j > 0:
281
+ sim = np.dot(emb_array[i], emb_array[j]) / (norm_i * norm_j)
282
+ similarities.append(sim)
283
+ return np.mean(similarities) if similarities else 0.5
284
+
285
+ def get_summary_stats(self, reports: List[ChunkQualityReport]) -> Dict[str, Any]:
286
+ """Get aggregate statistics from multiple reports with plain Python types for serialization."""
287
+ if not reports:
288
+ return {}
289
+
290
+ scores = [r.overall_score for r in reports]
291
+ return {
292
+ 'count': len(reports),
293
+ 'mean_score': float(np.mean(scores)),
294
+ 'std_score': float(np.std(scores)),
295
+ 'min_score': float(np.min(scores)),
296
+ 'max_score': float(np.max(scores)),
297
+ 'below_threshold': int(sum(1 for s in scores if s < 0.6)),
298
+ 'dimension_means': {
299
+ 'coherence': float(np.mean([r.coherence_score for r in reports])),
300
+ 'completeness': float(np.mean([r.completeness_score for r in reports])),
301
+ 'density': float(np.mean([r.density_score for r in reports])),
302
+ 'boundary': float(np.mean([r.boundary_score for r in reports])),
303
+ 'size': float(np.mean([r.size_score for r in reports]))
304
+ }
305
+ }
306
+
307
+ def _score_coherence(self, sentences: List[str]) -> float:
308
+ """Score internal semantic consistency."""
309
+ if len(sentences) <= 1:
310
+ return 1.0 # Single sentence is coherent by definition
311
+
312
+ if self.embedding_fn is None:
313
+ # Fallback: use lexical overlap
314
+ return self._lexical_coherence(sentences)
315
+
316
+ try:
317
+ embeddings = np.array(self.embedding_fn(sentences))
318
+ n = len(embeddings)
319
+ if n <= 1: return 1.0
320
+
321
+ # Vectorized pairwise cosine similarity
322
+ # Normalize embeddings first
323
+ norms = np.linalg.norm(embeddings, axis=1)
324
+ norms[norms < 1e-9] = 1.0
325
+ norm_embeddings = embeddings / norms[:, np.newaxis]
326
+
327
+ # Similarity matrix (N x N)
328
+ sim_matrix = norm_embeddings @ norm_embeddings.T
329
+
330
+ # Extract upper triangle (excluding diagonal) for pairwise mean
331
+ tri_indices = np.triu_indices(n, k=1)
332
+ pair_similarities = sim_matrix[tri_indices]
333
+
334
+ return float(np.mean(pair_similarities)) if pair_similarities.size > 0 else 0.5
335
+ except Exception as e:
336
+ logger.debug(f"Vectorized coherence scoring failed: {e}")
337
+ return self._lexical_coherence(sentences)
338
+
339
+ def _lexical_coherence(self, sentences: List[str]) -> float:
340
+ """Fallback coherence using word overlap."""
341
+ if len(sentences) <= 1:
342
+ return 1.0
343
+
344
+ word_sets = [set(s.lower().split()) for s in sentences]
345
+ overlaps = []
346
+
347
+ for i in range(len(word_sets) - 1):
348
+ intersection = word_sets[i] & word_sets[i + 1]
349
+ union = word_sets[i] | word_sets[i + 1]
350
+ if union:
351
+ overlaps.append(len(intersection) / len(union))
352
+
353
+ return np.mean(overlaps) if overlaps else 0.5
354
+
355
+ def _score_completeness(self, text: str) -> float:
356
+ """Score self-containedness."""
357
+ import re
358
+
359
+ score = 1.0
360
+ penalties = []
361
+
362
+ # Check for unresolved pronouns at start
363
+ first_sentence = split_sentences(text)[0] if split_sentences(text) else text[:100]
364
+ pronoun_pattern = r'\b(this|that|these|those|it|they|he|she|him|her|them)\b'
365
+ if re.search(pronoun_pattern, first_sentence.lower()):
366
+ penalties.append(0.15)
367
+
368
+ # Check for references to external context
369
+ external_refs = [
370
+ r'\babove\b', r'\bbelow\b', r'\bpreviously\b', r'\bfollowing\b',
371
+ r'\bas mentioned\b', r'\bas discussed\b', r'\bsee \w+\b'
372
+ ]
373
+ for pattern in external_refs:
374
+ if re.search(pattern, text.lower()):
375
+ penalties.append(0.1)
376
+
377
+ # Check for incomplete lists/enumerations
378
+ if re.search(r'\b(firstly|first|1\.)\b', text.lower()):
379
+ if not re.search(r'\b(secondly|second|2\.)\b', text.lower()):
380
+ penalties.append(0.1) # Started enumeration but incomplete
381
+
382
+ return max(0.0, score - sum(penalties))
383
+
384
+ def _score_density(self, text: str, sentences: List[str]) -> float:
385
+ """Score information density."""
386
+ if not text:
387
+ return 0.0
388
+
389
+ words = text.lower().split()
390
+ word_count = len(words)
391
+
392
+ if word_count == 0:
393
+ return 0.0
394
+
395
+ # Count filler words
396
+ filler_count = sum(1 for w in words if w in self.FILLER_WORDS)
397
+ filler_ratio = filler_count / word_count
398
+
399
+ # Check for repetition
400
+ unique_words = set(words)
401
+ uniqueness_ratio = len(unique_words) / word_count
402
+
403
+ # Sentence length variance (too uniform = formulaic)
404
+ if len(sentences) > 1:
405
+ lengths = [len(s.split()) for s in sentences]
406
+ length_variance = np.std(lengths) / (np.mean(lengths) + 1)
407
+ variance_score = min(1.0, length_variance * 2) # Some variance is good
408
+ else:
409
+ variance_score = 0.5
410
+
411
+ density = (
412
+ 0.4 * (1 - filler_ratio * 5) + # Penalize fillers
413
+ 0.4 * uniqueness_ratio +
414
+ 0.2 * variance_score
415
+ )
416
+
417
+ return max(0.0, min(1.0, density))
418
+
419
+ def _score_boundaries(self, text: str) -> float:
420
+ """Score quality of chunk boundaries."""
421
+ import re
422
+
423
+ score = 1.0
424
+
425
+ # Check start
426
+ for pattern in self.INCOMPLETE_START_PATTERNS:
427
+ if re.match(pattern, text.strip(), re.IGNORECASE):
428
+ score -= 0.15
429
+ break
430
+
431
+ # Check end
432
+ for pattern in self.INCOMPLETE_END_PATTERNS:
433
+ if re.search(pattern, text.strip()):
434
+ score -= 0.15
435
+ break
436
+
437
+ # Bonus for clean sentence boundaries
438
+ text = text.strip()
439
+ if text and text[-1] in '.!?"\'':
440
+ score += 0.1
441
+
442
+ return max(0.0, min(1.0, score))
443
+
444
+ def _score_size(self, token_count: int) -> float:
445
+ """Score chunk size relative to target."""
446
+ if self.target_token_size == 0:
447
+ return 1.0
448
+
449
+ ratio = token_count / self.target_token_size
450
+
451
+ # Optimal range: 0.7 - 1.3 of target
452
+ if 0.7 <= ratio <= 1.3:
453
+ return 1.0
454
+ elif 0.5 <= ratio <= 1.5:
455
+ return 0.8
456
+ elif 0.3 <= ratio <= 2.0:
457
+ return 0.5
458
+ else:
459
+ return 0.2
File without changes
@@ -0,0 +1,47 @@
1
+
2
+ from __future__ import annotations
3
+ from typing import List, Dict, Any, Tuple
4
+ import numpy as np
5
+
6
class InMemoryIndex:
    """Minimal in-memory vector index ranked by dot-product similarity.

    Vectors are cached as a float32 numpy matrix; the cache is invalidated
    whenever new vectors are added.
    """

    def __init__(self, dim: int):
        self.dim = dim
        self.vecs = []
        self._vec_array = None  # Lazily built numpy cache of self.vecs
        self.meta = []

    def add(self, vectors: List[List[float]], metas: List[Dict[str, Any]]):
        """Append vectors with parallel metadata; invalidates the cache."""
        self.vecs.extend(vectors)
        self.meta.extend(metas)
        self._vec_array = None  # Force rebuild on next search

    def search(self, query_vec: List[float], top_k: int = 10) -> List[Tuple[int, float]]:
        """Return the top_k (index, score) pairs by descending dot product.

        Accepts either a single query vector or a batch (2-D input), in
        which case a list of ranked lists is returned.
        """
        if self._vec_array is None:
            if not self.vecs:
                return []
            self._vec_array = np.array(self.vecs, dtype=np.float32)

        matrix = self._vec_array
        query = np.array(query_vec, dtype=np.float32)
        single = query.ndim == 1

        # Dot-product scores (cosine similarity when inputs are normalized).
        sims = matrix @ query if single else (matrix @ query.T).T

        k = min(top_k, matrix.shape[0])

        def _rank(scores):
            # argpartition finds the k best in O(n); sort only those k.
            best = np.argpartition(-scores, k - 1)[:k]
            return sorted(((int(i), float(scores[i])) for i in best),
                          key=lambda pair: -pair[1])

        if single:
            return _rank(sims)
        # Batch mode: rank each query's score row independently.
        return [_rank(row) for row in sims]
@@ -0,0 +1,4 @@
1
+
2
class ParentChildRetriever:
    """Placeholder for parent-child retrieval; not implemented in this starter."""

    def __init__(self):
        # Fail fast so callers don't silently get a non-functional retriever.
        raise NotImplementedError("Parent-Child retrieval is a placeholder in this starter.")
File without changes
@@ -0,0 +1,34 @@
1
+
2
+ import os, json
3
+ from . import plan as plan_mod
4
+ from ..utils.hashing import sha256_hex
5
+
6
class Cache:
    """Filesystem cache storing JSON documents and raw bytes under a root directory."""

    def __init__(self, root: str):
        """Create the cache rooted at *root* (directory is created if missing)."""
        self.root = root
        os.makedirs(self.root, exist_ok=True)

    def path_for(self, *parts: str) -> str:
        """Join path components onto the cache root."""
        return os.path.join(self.root, *parts)

    def get_json(self, key: str):
        """Return the JSON value stored under *key*, or None if absent.

        FIX: uses EAFP (try/open) instead of the original exists()-then-open,
        which had a time-of-check/time-of-use race.
        """
        p = self.path_for(key + ".json")
        try:
            with open(p, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return None

    def set_json(self, key: str, value):
        """Store *value* as pretty-printed UTF-8 JSON under *key* (adds .json)."""
        p = self.path_for(key + ".json")
        os.makedirs(os.path.dirname(p), exist_ok=True)
        with open(p, 'w', encoding='utf-8') as f:
            json.dump(value, f, ensure_ascii=False, indent=2)

    def put_bytes(self, key: str, data: bytes):
        """Store raw bytes under *key* (no extension is added)."""
        p = self.path_for(key)
        os.makedirs(os.path.dirname(p), exist_ok=True)
        with open(p, 'wb') as f:
            f.write(data)

    def has(self, key: str) -> bool:
        """True if a file exists for the raw *key* (note: no .json suffix is added)."""
        return os.path.exists(self.path_for(key))
@@ -0,0 +1,40 @@
1
+
2
+ from __future__ import annotations
3
+ import os, json, time
4
+ from dataclasses import dataclass, asdict, field
5
+ from typing import Dict, Any
6
+ import yaml
7
+
8
@dataclass
class Plan:
    """Serializable chunking plan: generator pipeline config plus metrics,
    persisted to YAML via write()/read()."""
    id: str
    corpus_hash: str
    generator_pipeline: Dict[str, Any]
    metrics: Dict[str, Any]
    embedding: Dict[str, Any]
    created_at: str = field(default_factory=lambda: time.strftime("%Y-%m-%d %H:%M:%S"))

    def apply(self, documents, chunker) -> list:
        """Delegate application to chunker with the saved generator params."""
        pipeline = self.generator_pipeline
        return chunker.apply_with_generator(
            documents,
            pipeline.get("name"),
            pipeline.get("params", {}),
        )

    def to_langchain(self) -> 'AutoChunkLangChainAdapter':
        """Wrap this plan in a LangChain adapter (lazy import avoids a cycle)."""
        from ..adapters.langchain import AutoChunkLangChainAdapter
        return AutoChunkLangChainAdapter(plan=self)

    def to_llamaindex(self) -> 'AutoChunkLlamaIndexAdapter':
        """Wrap this plan in a LlamaIndex adapter (lazy import avoids a cycle)."""
        from ..adapters.llamaindex import AutoChunkLlamaIndexAdapter
        return AutoChunkLlamaIndexAdapter(plan=self)

    @staticmethod
    def write(path: str, plan: 'Plan'):
        """Serialize *plan* to YAML at *path*."""
        with open(path, 'w', encoding='utf-8') as f:
            yaml.safe_dump(asdict(plan), f, sort_keys=False, allow_unicode=True)

    @staticmethod
    def read(path: str) -> 'Plan':
        """Load a Plan previously written with Plan.write()."""
        with open(path, 'r', encoding='utf-8') as f:
            return Plan(**yaml.safe_load(f))
File without changes
@@ -0,0 +1,8 @@
1
+
2
+ import hashlib
3
+
4
def sha256_hex(data: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *data*."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()
6
+
7
def content_hash(text: str) -> str:
    """SHA-256 hex digest of *text* encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return sha256_hex(encoded)