autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/chunkers/recursive_character.py ADDED
@@ -0,0 +1,215 @@
from __future__ import annotations
from typing import List, Optional
import re
from .base import BaseChunker, Chunk
from ..utils.text import count_tokens, get_tokens, decode_tokens


class RecursiveCharacterChunker(BaseChunker):
    """
    Recursive Character Chunker with Tiered Separators.

    BEST-OF-BREED FEATURES:
    1. Regex Separator Support: Use regex patterns via `is_separator_regex=True`.
    2. Keep Separator Mode: Preserves delimiters at chunk boundaries.
    3. Start Index Tracking: Records character offset for citation purposes.
    4. Adaptive Fallback: Falls back to token-split when separators are exhausted.
    5. Code Block Awareness: Avoids splitting inside fenced code blocks.
    """
    name = "recursive_character"

    # Default separator hierarchy (most significant first)
    DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""]

    def __init__(self,
                 separators: Optional[List[str]] = None,
                 is_separator_regex: bool = False,
                 keep_separator: bool = True,
                 tokenizer: str = "auto"):
        """
        Initialize the chunker.

        Args:
            separators: List of separators in priority order
            is_separator_regex: If True, treat separators as regex patterns
            keep_separator: If True, include the separator in the chunk that precedes it
            tokenizer: Backend for tokenization
        """
        self.separators = separators or self.DEFAULT_SEPARATORS
        self.is_separator_regex = is_separator_regex
        self.keep_separator = keep_separator
        self.tokenizer = tokenizer

    def chunk(self,
              doc_id: str,
              text: str,
              base_token_size: int = 512,
              overlap: int = 64,
              add_start_index: bool = False,
              respect_code_blocks: bool = True,
              **params) -> List[Chunk]:
        """
        Recursively split text using the separator hierarchy.

        Args:
            doc_id: Document identifier
            text: Input text
            base_token_size: Target chunk size in tokens
            overlap: Number of tokens to overlap between chunks
            add_start_index: If True, record character start position
            respect_code_blocks: If True, avoid splitting inside ``` blocks

        Returns:
            List of Chunk objects
        """
        separators = params.get("separators", self.separators)

        # Extract code blocks if needed
        code_block_ranges = []
        if respect_code_blocks:
            from ..utils.text import extract_code_blocks
            code_blocks = extract_code_blocks(text)
            code_block_ranges = [(b["start"], b["end"]) for b in code_blocks]

        def _is_in_code_block(pos: int) -> bool:
            # NOTE: currently informational only; split boundaries are not yet
            # adjusted around fenced code blocks.
            for start, end in code_block_ranges:
                if start <= pos < end:
                    return True
            return False

        def _split_with_separator(input_text: str, separator: str, is_regex: bool) -> List[str]:
            """Split text while optionally keeping the separator."""
            if is_regex:
                pattern = separator
            else:
                pattern = re.escape(separator)

            if self.keep_separator:
                # Use a capturing group to keep the separator
                raw_splits = re.split(f"({pattern})", input_text)
                # Merge each separator with the text that precedes it
                splits = []
                for i in range(0, len(raw_splits) - 1, 2):
                    combined = raw_splits[i] + raw_splits[i + 1]
                    if combined:
                        splits.append(combined)
                if len(raw_splits) % 2 == 1 and raw_splits[-1]:
                    splits.append(raw_splits[-1])
                return splits
            else:
                return [s for s in re.split(pattern, input_text) if s]

        def _recursive_split(input_text: str, seps: List[str], char_offset: int = 0) -> List[tuple]:
            """
            Recursively split text.
            Returns a list of (text, start_char_index) tuples.
            """
            token_count = count_tokens(input_text, tokenizer=self.tokenizer)

            # Base case: fits in one chunk
            if token_count <= base_token_size:
                return [(input_text, char_offset)]

            # No more separators: fall back to token splitting
            if not seps:
                all_tokens = get_tokens(input_text, tokenizer=self.tokenizer)
                results = []
                step = max(1, base_token_size - overlap)
                pos = 0
                curr_char = char_offset

                while pos < len(all_tokens):
                    window = all_tokens[pos : pos + base_token_size]
                    chunk_text = decode_tokens(window)
                    results.append((chunk_text, curr_char))

                    stepped = all_tokens[pos : pos + step]
                    curr_char += len(decode_tokens(stepped))
                    pos += step

                    if pos >= len(all_tokens):
                        break

                return results

            # Try the current separator
            curr_sep = seps[0]
            remaining_seps = seps[1:]

            splits = _split_with_separator(input_text, curr_sep, self.is_separator_regex)

            # If the separator didn't help, try the next one
            if len(splits) <= 1:
                return _recursive_split(input_text, remaining_seps, char_offset)

            # Merge splits into chunks
            final_chunks = []
            buffer = []
            buffer_tokens = 0
            buffer_start = char_offset
            current_char = char_offset

            for split_text in splits:
                split_tokens = count_tokens(split_text, tokenizer=self.tokenizer)

                # If a single split is too big, recurse deeper
                if split_tokens > base_token_size:
                    # Flush the buffer first
                    if buffer:
                        final_chunks.append(("".join(buffer), buffer_start))
                        buffer = []
                        buffer_tokens = 0

                    # Recurse on the large split
                    sub_chunks = _recursive_split(split_text, remaining_seps, current_char)
                    final_chunks.extend(sub_chunks)
                    current_char += len(split_text)
                    buffer_start = current_char
                    continue

                # Would this overflow the buffer?
                if buffer_tokens + split_tokens > base_token_size and buffer:
                    final_chunks.append(("".join(buffer), buffer_start))

                    # Handle overlap: keep the last N tokens' worth of text
                    overlap_buffer = []
                    overlap_tokens = 0
                    for prev in reversed(buffer):
                        prev_tokens = count_tokens(prev, tokenizer=self.tokenizer)
                        if overlap_tokens + prev_tokens <= overlap:
                            overlap_buffer.insert(0, prev)
                            overlap_tokens += prev_tokens
                        else:
                            break

                    buffer = overlap_buffer
                    buffer_tokens = overlap_tokens
                    # Adjust the start position to account for the overlap
                    buffer_start = current_char - len("".join(overlap_buffer))

                buffer.append(split_text)
                buffer_tokens += split_tokens
                current_char += len(split_text)

            if buffer:
                final_chunks.append(("".join(buffer), buffer_start))

            return final_chunks

        # Execute the recursive pipeline
        raw_chunks = _recursive_split(text, separators, 0)

        # Wrap in Chunk objects
        return [
            Chunk(
                id=f"{doc_id}#rc#{i}",
                doc_id=doc_id,
                text=chunk_text.strip(),
                meta={
                    "chunk_index": i,
                    "strategy": "recursive_character",
                    "token_count": count_tokens(chunk_text, tokenizer=self.tokenizer),
                    **({"start_index": start_idx} if add_start_index else {})
                }
            ) for i, (chunk_text, start_idx) in enumerate(raw_chunks) if chunk_text.strip()
        ]
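For orientation, a minimal usage sketch of the chunker above (not part of the wheel contents; the sample text, `doc_id`, and parameter values are illustrative assumptions, while the method signature and `meta` keys follow the code shown).

# Hypothetical usage sketch for RecursiveCharacterChunker (assumes the package is installed).
from autochunk.chunkers.recursive_character import RecursiveCharacterChunker

sample = "Intro paragraph.\n\nSection one. It has several sentences.\n\nSection two follows here."
chunker = RecursiveCharacterChunker(keep_separator=True)  # paragraph -> sentence -> word tiers
chunks = chunker.chunk(
    doc_id="guide-001",
    text=sample,
    base_token_size=64,     # target chunk size in tokens
    overlap=8,              # tokens carried between adjacent chunks
    add_start_index=True,   # store character offsets for citations
)
for c in chunks:
    print(c.id, c.meta["token_count"], c.meta.get("start_index"))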
autochunk/chunkers/semantic_local.py ADDED
@@ -0,0 +1,140 @@
from __future__ import annotations
import numpy as np
from typing import List, Callable, Optional
from .base import BaseChunker, Chunk
from ..utils.text import split_sentences, count_tokens


class SemanticLocalChunker(BaseChunker):
    """
    Semantic Chunker using Window-based Gradient Similarity.
    Detects topic shifts by comparing sliding windows of sentence embeddings.

    BEST-OF-BREED IMPROVEMENTS:
    1. Windowed Similarity: Instead of comparing adjacent sentences, it compares the
       "semantic momentum" of the previous and following windows, which suppresses
       noise from short or outlier sentences.
    2. Dynamic Percentile: Uses robust percentile-based peak detection for boundaries.
    3. Multi-Factor Safety: Combines semantic drift with strict token caps to keep
       chunks within LLM context windows.
    """
    name = "semantic_local"

    def chunk(self,
              doc_id: str,
              text: str,
              embedding_fn: Optional[Callable[[List[str]], List[List[float]]]] = None,
              threshold_percentile: float = 0.9,
              window_size: int = 3,
              **params) -> List[Chunk]:

        sentences = split_sentences(text)
        if len(sentences) <= 1:
            return [Chunk(id=f"{doc_id}#sl#0", doc_id=doc_id, text=text, meta={"chunk_index": 0})]

        if embedding_fn is None:
            # Fall back to SentenceAware if no embeddings are available
            from .sentence_aware import SentenceAwareChunker
            return SentenceAwareChunker().chunk(doc_id, text, **params)

        # 1. Vectorize all sentences
        embeddings = np.array(embedding_fn(sentences))
        n = len(embeddings)

        # 2. Calculate windowed distances (semantic gradient).
        # A moving-average window captures the "semantic momentum" on each side of a
        # gap between sentences.
        def moving_average(a, n=3):
            # Reference helper for a fully vectorized path (not used by the loop below).
            ret = np.cumsum(a, axis=0)
            ret[n:] = ret[n:] - ret[:-n]
            return ret[n - 1:] / n

        # For each gap i, compare the windows [i - w + 1 : i + 1] and [i + 1 : i + w + 1].
        # Window means could be precomputed once and sliced, but a plain loop is kept
        # here because it handles the edge cases (short documents, truncated windows at
        # the boundaries) robustly; a pure vector path would use np.convolve.
        distances = []

        norms = np.linalg.norm(embeddings, axis=1)
        # Avoid division by zero
        norms[norms == 0] = 1e-9
        # Pre-normalized embeddings (the loop below averages raw embeddings and
        # normalizes the window means instead).
        norm_embeddings = embeddings / norms[:, np.newaxis]

        for i in range(n - 1):
            start_prev = max(0, i - window_size + 1)
            end_prev = i + 1
            start_next = i + 1
            end_next = min(n, i + 1 + window_size)

            # Average the raw embeddings, then normalize the means for a cosine comparison.
            vec_prev = np.mean(embeddings[start_prev:end_prev], axis=0)
            vec_next = np.mean(embeddings[start_next:end_next], axis=0)

            norm_p = np.linalg.norm(vec_prev)
            norm_n = np.linalg.norm(vec_next)

            if norm_p < 1e-9 or norm_n < 1e-9:
                distances.append(0.0)
            else:
                sim = np.dot(vec_prev, vec_next) / (norm_p * norm_n)
                distances.append(float(1 - sim))

        # 3. Peak detection
        if not distances:
            return [Chunk(id=f"{doc_id}#sl#0", doc_id=doc_id, text=text, meta={"chunk_index": 0})]

        breakpoint_threshold = np.percentile(distances, threshold_percentile * 100)

        # 4. Greedy assembly with safety caps
        safety_max_tokens = params.get("base_token_size", 512) * 2  # Usually 2x the target size is a good semantic ceiling
        if "safety_max_tokens" in params:
            safety_max_tokens = params["safety_max_tokens"]

        chunks = []
        curr_buffer = []
        curr_tokens = 0

        for i, sentence in enumerate(sentences):
            sent_tokens = count_tokens(sentence)

            # Decide whether to split BEFORE adding this sentence
            is_semantic_split = False
            if i > 0 and i - 1 < len(distances):
                if distances[i - 1] >= breakpoint_threshold and distances[i - 1] > 0:
                    is_semantic_split = True

            # Safety split: don't let semantic chunks grow into monsters
            is_safety_split = curr_tokens + sent_tokens > safety_max_tokens

            if (is_semantic_split or is_safety_split) and curr_buffer:
                chunks.append(self._make_chunk(doc_id, curr_buffer, len(chunks), "safety" if is_safety_split else "semantic"))
                curr_buffer = []
                curr_tokens = 0

            curr_buffer.append(sentence)
            curr_tokens += sent_tokens

        if curr_buffer:
            chunks.append(self._make_chunk(doc_id, curr_buffer, len(chunks), "final"))

        return chunks

    def _make_chunk(self, doc_id: str, buffer: List[str], index: int, split_reason: str) -> Chunk:
        text = " ".join(buffer).strip()
        return Chunk(
            id=f"{doc_id}#sl#{index}",
            doc_id=doc_id,
            text=text,
            meta={
                "chunk_index": index,
                "strategy": "semantic_local",
                "split_reason": split_reason,
                "token_count": count_tokens(text)
            }
        )
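A sketch of driving the windowed boundary detection above. The `fake_embed` function is a toy stand-in introduced here for illustration only; any `embedding_fn` that maps a list of sentences to a list of vectors (such as the package's own embedding providers) would be used in practice.

# Hypothetical driver for SemanticLocalChunker (illustrative only).
import numpy as np
from autochunk.chunkers.semantic_local import SemanticLocalChunker

def fake_embed(sentences):
    # Toy stand-in embedder: map each sentence to a pseudo-random vector.
    return [np.random.default_rng(abs(hash(s)) % (2**32)).standard_normal(64).tolist()
            for s in sentences]

chunker = SemanticLocalChunker()
chunks = chunker.chunk(
    doc_id="doc-1",
    text="First topic sentence. More on it. Now a new topic. And its detail.",
    embedding_fn=fake_embed,
    threshold_percentile=0.9,  # split at the top 10% of window-to-window drift
    window_size=3,
)
print([c.meta["split_reason"] for c in chunks])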
autochunk/chunkers/sentence_aware.py ADDED
@@ -0,0 +1,102 @@
from __future__ import annotations
from typing import List
from .base import BaseChunker, Chunk
from ..utils.text import split_sentences, count_tokens


class SentenceAwareChunker(BaseChunker):
    """
    Sentence-Aware Chunker with Look-back Overlap.
    Groups sentences while respecting token limits and providing context continuity.

    BEST-OF-BREED IMPROVEMENTS:
    1. Sentence Overlap: Repeats the last N sentences to carry transition context.
    2. Intelligent Oversize Handling: Uses recursive splitting for "monster sentences"
       instead of a crude fixed-length fallback.
    3. NLTK Integration: Leverages the updated sentence splitter.
    """
    name = "sentence_aware"

    def chunk(self, doc_id: str, text: str, base_token_size: int = 512, overlap: int = 64, **params) -> List[Chunk]:
        sentences = split_sentences(text)
        chunks = []
        current_buffer = []
        current_tokens = 0
        idx = 0

        # Overlap is budgeted in tokens but applied at sentence granularity:
        # whole trailing sentences are carried over up to the token budget.

        for s in sentences:
            sent_tokens = count_tokens(s)

            # 1. Handle "monster sentences" (a single sentence larger than the limit)
            if sent_tokens > base_token_size:
                # Flush anything already in the buffer first
                if current_buffer:
                    chunks.append(self._make_chunk(doc_id, current_buffer, idx))
                    idx += 1

                # Use recursive logic for this giant sentence
                from .recursive_character import RecursiveCharacterChunker
                sub_chunker = RecursiveCharacterChunker()
                sub_chunks = sub_chunker.chunk(f"{doc_id}_monster", s, base_token_size=base_token_size, overlap=overlap)

                for sc in sub_chunks:
                    chunks.append(Chunk(
                        id=f"{doc_id}#sa#{idx}",
                        doc_id=doc_id,
                        text=sc.text,
                        meta={"chunk_index": idx, "strategy": "sentence_aware_recursive"}
                    ))
                    idx += 1

                # Reset the buffer after a monster-sentence break
                current_buffer = []
                current_tokens = 0
                continue

            # 2. Regular accumulation
            if current_tokens + sent_tokens > base_token_size:
                # Flush
                chunks.append(self._make_chunk(doc_id, current_buffer, idx))
                idx += 1

                # Context overlap (sentence-level)
                current_buffer, current_tokens = self._get_overlap(current_buffer, overlap)

            current_buffer.append(s)
            current_tokens += sent_tokens

        if current_buffer:
            chunks.append(self._make_chunk(doc_id, current_buffer, idx))

        return chunks

    def _get_overlap(self, buffer: List[str], overlap_limit: int) -> tuple[List[str], int]:
        """Calculates the suffix of sentences to carry over as overlap."""
        overlap_buffer = []
        overlap_tokens = 0
        for s in reversed(buffer):
            t = count_tokens(s)
            if overlap_tokens + t <= overlap_limit:
                overlap_buffer.insert(0, s)
                overlap_tokens += t
            else:
                break
        return overlap_buffer, overlap_tokens

    def _make_chunk(self, doc_id: str, buffer: List[str], index: int) -> Chunk:
        text = " ".join(buffer).strip()
        return Chunk(
            id=f"{doc_id}#sa#{index}",
            doc_id=doc_id,
            text=text,
            meta={
                "chunk_index": index,
                "strategy": "sentence_aware",
                "token_count": count_tokens(text)
            }
        )
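A brief sketch of the look-back overlap described above (not shipped in the wheel; the repeated filler text and parameter values are assumptions for illustration, and token counts assume the default whitespace-style tokenizer).

# Hypothetical illustration of sentence-level overlap.
from autochunk.chunkers.sentence_aware import SentenceAwareChunker

text = "AutoChunks ships several chunkers. " * 40  # ~40 short sentences of filler
chunks = SentenceAwareChunker().chunk(doc_id="faq-7", text=text, base_token_size=64, overlap=16)
# Adjacent chunks repeat their trailing sentences (up to ~16 tokens), so retrieval
# hits keep local context across chunk boundaries.
print(len(chunks), chunks[0].meta["token_count"])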
autochunk/cli.py ADDED
@@ -0,0 +1,135 @@
import argparse
import sys
import os
import json
import yaml
from typing import List, Optional

from .autochunker import AutoChunker
from .config import AutoChunkConfig, EvalConfig, ProxyConfig, NetworkConfig, RagasConfig
from .storage.plan import Plan
from .utils.logger import logger


def cmd_optimize(args):
    """Run the optimization search from the CLI."""
    logger.info(f"Starting CLI Optimization on: {args.docs}")

    cfg = AutoChunkConfig(
        mode=args.mode,
        embedding_provider=args.embedding_provider,
        embedding_model_or_path=args.embedding_model,
        cache_dir=args.cache_dir
    )

    # Optional override for proxy
    if args.no_proxy:
        cfg.proxy_config.enabled = False

    chunker = AutoChunker(
        mode=cfg.mode,
        eval_config=EvalConfig(objective=args.objective),
        embedding_provider=cfg.embedding_provider,
        embedding_model_or_path=cfg.embedding_model_or_path,
        cache_dir=cfg.cache_dir
    )

    # Enable RAGAS if the flag is set
    if args.analyze_ragas:
        chunker.cfg.ragas_config.enabled = True
        if hasattr(args, 'ragas_llm_provider') and args.ragas_llm_provider:
            chunker.cfg.ragas_config.llm_provider = args.ragas_llm_provider
        if hasattr(args, 'ragas_llm_model') and args.ragas_llm_model:
            chunker.cfg.ragas_config.llm_model = args.ragas_llm_model
        logger.info(f"RAGAS enabled. LLM Provider: {chunker.cfg.ragas_config.llm_provider}")

    def on_progress(msg, step=None):
        prefix = f"[Stage {step}] " if step else ""
        logger.info(f"{prefix}{msg}")

    plan, report = chunker.optimize(
        documents=args.docs,
        on_progress=on_progress
    )

    # Save the plan
    Plan.write(args.out, plan)
    logger.success(f"Optimization complete! Winning plan saved to: {args.out}")

    if args.report:
        with open(args.report, 'w') as f:
            json.dump(report, f, indent=2)
        logger.info(f"Full metrics report saved to: {args.report}")


def cmd_apply(args):
    """Apply a winning plan to a corpus."""
    logger.info(f"Applying plan {args.plan} to docs: {args.docs}")

    plan = Plan.read(args.plan)

    # Initialize the chunker with the plan's embedding settings
    chunker = AutoChunker(
        embedding_provider=plan.embedding.get("name"),
        embedding_model_or_path=plan.embedding.get("model")
    )

    chunks = plan.apply(args.docs, chunker)

    with open(args.out, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    logger.success(f"Successfully created {len(chunks)} chunks. Selection saved to: {args.out}")


def cmd_serve(args):
    """Launch the Dashboard."""
    import uvicorn
    from .web.server import app
    logger.info(f"Launching AutoChunks Dashboard on http://{args.host}:{args.port}")
    uvicorn.run(app, host=args.host, port=args.port)


def main():
    parser = argparse.ArgumentParser(
        description="AutoChunks: Autonomous Retrieval Optimization for RAG",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers(dest="command", help="Command to execute")

    # Optimize command
    opt_p = subparsers.add_parser("optimize", help="Search for the best chunking strategy")
    opt_p.add_argument("--docs", required=True, help="Path to document folder")
    opt_p.add_argument("--mode", default="light", choices=["light", "full"], help="Evaluation depth: controls synthetic QA sampling density")
    opt_p.add_argument("--objective", default="balanced", choices=["balanced", "quality", "cost", "latency", "metric_only"], help="Optimization objective")
    opt_p.add_argument("--embedding-provider", default="hashing", help="Embedding provider")
    opt_p.add_argument("--embedding-model", default="BAAI/bge-small-en-v1.5", help="Model name or path")
    opt_p.add_argument("--out", default="best_plan.yaml", help="Output path for the winning plan")
    opt_p.add_argument("--report", help="Output path for the full JSON metrics report")
    opt_p.add_argument("--cache-dir", default=".ac_cache", help="Cache directory")
    opt_p.add_argument("--no-proxy", action="store_true", help="Disable representative sampling (process all docs)")
    opt_p.add_argument("--analyze-ragas", action="store_true", help="Enable RAGAS LLM-based evaluation metrics (Context Precision/Recall)")
    opt_p.add_argument("--ragas-llm-provider", default="auto", choices=["auto", "openai", "ollama", "huggingface"], help="LLM provider for RAGAS (auto detects OpenAI key or Ollama)")
    opt_p.add_argument("--ragas-llm-model", default=None, help="Model name for RAGAS LLM (e.g., gpt-4o-mini, llama3.2)")

    # Apply command
    app_p = subparsers.add_parser("apply", help="Execute a saved plan on a corpus")
    app_p.add_argument("--plan", required=True, help="Path to the .yaml plan file")
    app_p.add_argument("--docs", required=True, help="Path to documents to chunk")
    app_p.add_argument("--out", default="chunks.json", help="Output path for processed chunks")

    # Serve command
    srv_p = subparsers.add_parser("serve", help="Start the Web Dashboard")
    srv_p.add_argument("--host", default="0.0.0.0", help="Host address")
    srv_p.add_argument("--port", type=int, default=8000, help="Port number")

    args = parser.parse_args()

    if args.command == "optimize":
        cmd_optimize(args)
    elif args.command == "apply":
        cmd_apply(args)
    elif args.command == "serve":
        cmd_serve(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
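A sketch of exercising the CLI above end to end. The shell lines assume that autochunk/__main__.py dispatches to cli.main() (plausible given the module exists in this wheel, but not shown here), and the ./corpus path is a placeholder; flag names are taken from the argparse setup above.

# Hypothetical CLI walkthrough:
#   python -m autochunk optimize --docs ./corpus --objective balanced --out best_plan.yaml
#   python -m autochunk apply --plan best_plan.yaml --docs ./corpus --out chunks.json
#   python -m autochunk serve --port 8000
# The same optimize step, driven programmatically through the parser:
import sys
from autochunk.cli import main

sys.argv = ["autochunk", "optimize", "--docs", "./corpus", "--out", "best_plan.yaml"]
main()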
autochunk/config.py ADDED
@@ -0,0 +1,76 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Callable, Optional, List, Dict, Any


@dataclass
class EvalConfig:
    metrics: List[str] = field(default_factory=lambda: ["mrr@10", "ndcg@10", "recall@50"])
    objective: str = "balanced"  # quality|cost|latency|balanced
    k: int = 10
    latency_target_ms: int = 250
    cost_budget_usd: float = 5.0
    parent_child_eval: bool = False


@dataclass
class RetrievalStrategy:
    type: str = "standard"  # or parent_child
    child_token_size: int = 128
    parent_token_size: int = 1024


@dataclass
class ProxyConfig:
    enabled: bool = False
    cluster_k: int = 5
    proxy_percent: int = 10
    verify_percent: int = 20


@dataclass
class RagasConfig:
    enabled: bool = False
    metrics: List[str] = field(default_factory=lambda: ["context_precision", "context_recall"])
    sample_size: int = 20  # Limit RAGAS to a subset to save costs
    llm_provider: str = "openai"  # openai|ollama|huggingface
    llm_model: Optional[str] = None  # Model name (e.g., "gpt-4o-mini", "llama3.2", "microsoft/Phi-3-mini-4k-instruct")
    api_key: Optional[str] = field(default=None, repr=False)  # API Key for OpenAI/Cloud providers


@dataclass
class SafetyConstraints:
    max_chunks_per_doc: int = 5000
    min_avg_chunk_tokens: int = 120
    max_redundant_overlap_ratio: float = 0.35


@dataclass
class ParallelConfig:
    embedding_concurrency: int = 4
    retriever_concurrency: int = 4
    batch_size: int = 32


@dataclass
class NetworkConfig:
    proxy_url: Optional[str] = None
    local_models_path: Optional[str] = None
    trusted_orgs: List[str] = field(default_factory=lambda: ["ds4sd", "RapidAI", "BAAI", "sentence-transformers"])


@dataclass
class TokenizerConfig:
    name: str = "whitespace"
    vocab_source: str = "custom"


@dataclass
class AutoChunkConfig:
    eval_config: EvalConfig = field(default_factory=EvalConfig)
    retrieval_strategy: RetrievalStrategy = field(default_factory=RetrievalStrategy)
    proxy_config: ProxyConfig = field(default_factory=ProxyConfig)
    ragas_config: RagasConfig = field(default_factory=RagasConfig)
    safety: SafetyConstraints = field(default_factory=SafetyConstraints)
    parallel: ParallelConfig = field(default_factory=ParallelConfig)
    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)
    network: NetworkConfig = field(default_factory=NetworkConfig)
    embedding_provider: str = "hashing"  # hashing|local|tei|openai
    embedding_model_or_path: str = "BAAI/bge-small-en-v1.5"
    embedding_api_key: Optional[str] = field(default=None, repr=False)  # API Key for cloud embedding providers
    mode: str = "light"  # light|full|incremental
    cache_dir: str = ".ac_cache"
    telemetry_enabled: bool = False  # Enable Tracing (Arize Phoenix)
    metadata_enrichment: Dict[str, Any] = field(default_factory=dict)
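Finally, a short sketch of composing these dataclasses (illustrative only; field names and defaults are as defined above, while the chosen values are assumptions).

# Hypothetical configuration sketch using the dataclasses defined above.
from autochunk.config import AutoChunkConfig, EvalConfig, RagasConfig

cfg = AutoChunkConfig(
    mode="full",
    embedding_provider="local",
    embedding_model_or_path="BAAI/bge-small-en-v1.5",
    eval_config=EvalConfig(objective="quality", k=10),
    ragas_config=RagasConfig(enabled=True, llm_provider="ollama", llm_model="llama3.2"),
    cache_dir=".ac_cache",
)
print(cfg.eval_config.metrics)  # defaults: ['mrr@10', 'ndcg@10', 'recall@50']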