autochunks 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/utils/io.py ADDED
@@ -0,0 +1,176 @@
1
+ import os, hashlib
2
+ from typing import List, Dict, Optional, Callable
3
+ from .logger import logger
4
+ from .hashing import sha256_hex
5
+
6
+ SUPPORTED_EXTS = {".txt", ".md", ".pdf", ".html", ".htm"}
7
+
8
def read_text_file(path: str) -> str:
    """Return the full contents of *path* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors='ignore'``) instead of
    raising ``UnicodeDecodeError``.
    """
    with open(path, mode='r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return contents
11
+
12
def _load_pdf_markdown(path: str, on_progress: Optional[Callable[[str], None]] = None) -> str:
    """Progressive PDF to Markdown extraction for AutoChunks native chunkers.

    The PDF is converted with ``pymupdf4llm`` in batches of five pages so that
    ``on_progress`` can be invoked between batches. If ``pymupdf4llm`` cannot be
    imported, or the conversion raises, a warning is logged and the result of
    ``_load_pdf_raw`` is returned instead.

    Args:
        path: Path of the PDF file.
        on_progress: Optional callback receiving human-readable status messages.

    Returns:
        The Markdown of all batches joined with newlines, or the raw-text fallback.
    """
    import fitz

    # The fitz handle is only needed for the page count, so release it right
    # away instead of keeping the file open until garbage collection.
    doc = fitz.open(path)
    try:
        total_pages = len(doc)
    finally:
        doc.close()

    if on_progress:
        on_progress(f"Analyzing structure of {total_pages} pages...")
    logger.info(f"Deep Analysis started for {os.path.basename(path)} ({total_pages} pages)")

    full_text = []

    try:
        import pymupdf4llm

        # Process in chunks of 5 pages for high-frequency real-time progress
        chunk_size = 5
        for i in range(0, total_pages, chunk_size):
            end_page = min(i + chunk_size, total_pages)
            if on_progress:
                on_progress(f"Extracting layout (Pages {i+1}-{end_page} of {total_pages})...")

            # Extract specific page range (0-based page indices)
            page_text = pymupdf4llm.to_markdown(path, pages=list(range(i, end_page)))
            full_text.append(page_text)

        return "\n".join(full_text)

    except Exception as e:
        # Deliberate best-effort: any failure degrades to plain text extraction.
        logger.warning(f" extraction failed, falling back to basic text: {e}")
        return _load_pdf_raw(path, on_progress)
42
+
43
def _load_pdf_raw(path: str, on_progress: Optional[Callable[[str], None]] = None) -> str:
    """Raw text extraction from PDF for fair bridge comparison.

    Args:
        path: Path of the PDF file.
        on_progress: Optional callback receiving a single status message.

    Returns:
        The ``get_text()`` output of every page, joined with newlines.
    """
    import fitz

    doc = fitz.open(path)
    try:
        total_pages = len(doc)
        if on_progress:
            on_progress(f"Fast extraction: {total_pages} pages...")
        return "\n".join(page.get_text() for page in doc)
    finally:
        # Always release the file handle, even when a page fails to extract.
        doc.close()
50
+
51
def _load_html_processed(path: str) -> str:
    """Clean HTML to plain text using BeautifulSoup - for AutoChunks native chunkers.

    ``<script>`` and ``<style>`` elements are removed, the remaining text is
    extracted, and blank fragments are dropped. If parsing fails for any reason
    a warning is logged and the unprocessed file contents are returned.

    Args:
        path: Path of the HTML file.

    Returns:
        The cleaned text, one non-empty fragment per line.
    """
    try:
        from bs4 import BeautifulSoup, FeatureNotFound

        content = read_text_file(path)
        try:
            soup = BeautifulSoup(content, "lxml")
        except FeatureNotFound:
            # lxml is an optional parser; fall back to the stdlib-backed one
            # rather than giving up and returning raw markup.
            soup = BeautifulSoup(content, "html.parser")

        # Remove script and style elements
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Get text
        text = soup.get_text(separator=' ')

        # Break into lines and remove leading and trailing whitespace
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each. The delimiter is a DOUBLE
        # space: splitting on a single space would put every word on its own line.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        return '\n'.join(chunk for chunk in chunks if chunk)
    except Exception as e:
        # Deliberate best-effort: fall back to the raw file contents.
        logger.warning(f"HTML parsing failed for {path}: {e}")
        return read_text_file(path)
75
+
76
def _load_html_raw(path: str) -> str:
    """Return the HTML file at *path* unmodified (markup included).

    Used as the raw variant handed to the LangChain HTML splitter.
    """
    raw_markup = read_text_file(path)
    return raw_markup
79
+
80
def load_documents(root: str, on_progress: Optional[Callable[[str], None]] = None, high_fidelity: bool = True) -> List[Dict]:
    """
    Load documents with BOTH raw and processed versions for fair evaluation.

    Walks ``root`` recursively and loads every file whose extension is in
    ``SUPPORTED_EXTS``. Extracted text is cached under
    ``<cwd>/.autochunk/text_cache`` keyed by the SHA-256 of the file bytes, so
    unchanged files are not re-extracted on later runs. A file that fails to
    load is logged and skipped; it does not abort the scan.

    Args:
        root: Directory to scan.
        on_progress: Optional callback receiving human-readable status messages.
        high_fidelity: When True, PDFs are converted to Markdown for the
            processed variant; otherwise plain text extraction is used.

    Returns documents with:
    - text: Processed version (for AutoChunks native chunkers)
    - raw_text: Original/raw version (for LangChain bridges)
    plus ``id``, ``path``, ``ext`` and ``hash`` keys.

    Raises:
        FileNotFoundError: If ``root`` does not exist.
        RuntimeError: If no supported, non-empty document was loaded.
    """
    if on_progress:
        on_progress(f"Scanning directory: {root}")

    # Validate the input before creating anything on disk, so a bad path does
    # not leave an empty cache directory behind.
    if not os.path.exists(root):
        raise FileNotFoundError(f"Path not found: {root}")

    docs = []

    # Text cache directory
    cache_root = os.path.join(os.getcwd(), ".autochunk", "text_cache")
    os.makedirs(cache_root, exist_ok=True)

    for dirpath, _, filenames in os.walk(root):
        for fn in filenames:
            ext = os.path.splitext(fn)[1].lower()
            if ext not in SUPPORTED_EXTS:
                continue

            p = os.path.abspath(os.path.normpath(os.path.join(dirpath, fn)))
            try:
                # 1. Quick Binary Hash
                with open(p, "rb") as bf:
                    b_hash = sha256_hex(bf.read())

                cache_file_proc = os.path.join(cache_root, f"{b_hash}_processed.txt")
                cache_file_raw = os.path.join(cache_root, f"{b_hash}_raw.txt")

                if os.path.exists(cache_file_proc) and os.path.exists(cache_file_raw):
                    # Cache Hit!
                    if on_progress:
                        on_progress(f"⚡ Cache Hit: Loading {fn}...")
                    logger.info(f"CACHE HIT: Reusing text for {fn}")
                    with open(cache_file_proc, "r", encoding="utf-8") as tf:
                        text_processed = tf.read()
                    with open(cache_file_raw, "r", encoding="utf-8") as tf:
                        text_raw = tf.read()
                else:
                    # Cache Miss: Load both versions
                    if on_progress:
                        on_progress(f"Processing ({len(docs)+1}): {fn}...")
                    logger.info(f"Loading document: {fn}")

                    if ext == ".pdf":
                        # Processed: Markdown extraction (AutoChunks advantage)
                        if high_fidelity:
                            text_processed = _load_pdf_markdown(p, on_progress)
                        else:
                            text_processed = _load_pdf_raw(p, on_progress)
                        # Raw: Basic text extraction (fair for bridges)
                        text_raw = _load_pdf_raw(p, None)
                    elif ext in (".html", ".htm"):
                        # Processed: BeautifulSoup cleaned (for text-based chunkers)
                        text_processed = _load_html_processed(p)
                        # Raw: Original HTML (for LangChain HTML splitter)
                        text_raw = _load_html_raw(p)
                    else:
                        # Plain text files: same for both
                        text_processed = read_text_file(p)
                        text_raw = text_processed

                    # Save to cache. A cache hit needs BOTH files, so write both
                    # whenever there is anything worth caching; otherwise a
                    # document with one empty variant would be re-extracted on
                    # every run.
                    if text_processed.strip() or text_raw.strip():
                        with open(cache_file_proc, "w", encoding="utf-8") as tf:
                            tf.write(text_processed)
                        with open(cache_file_raw, "w", encoding="utf-8") as tf:
                            tf.write(text_raw)

                if text_processed.strip() or text_raw.strip():
                    docs.append({
                        "id": p,
                        "path": p,
                        "text": text_processed,  # For AutoChunks native chunkers
                        "raw_text": text_raw,  # For LangChain bridges
                        "ext": ext,
                        "hash": b_hash
                    })
                    logger.success(f"Loaded {fn} (processed: {len(text_processed)} chars, raw: {len(text_raw)} chars)")
                else:
                    logger.warning(f"Skipping empty document: {p}")
            except Exception as e:
                # Deliberate best-effort: one unreadable file must not abort the scan.
                logger.error(f"Failed to load {p}: {e}")

    if not docs:
        raise RuntimeError(f"No supported documents found in {root} (supported: {SUPPORTED_EXTS})")

    logger.info(f"Loaded {len(docs)} documents from {root}")
    return docs
@@ -0,0 +1,64 @@
1
+ import sys, time, logging
2
+ from loguru import logger
3
+ from contextvars import ContextVar
4
+ from typing import List, Dict, Any, Optional
5
+
6
+ # Global storage for jobs
7
+ jobs_data = {}
8
+ # ContextVar to track the job_id in the current task/thread
9
+ current_job_id: ContextVar[Optional[str]] = ContextVar("current_job_id", default=None)
10
+
11
+ # Add a Pretty Console Handler (Keep it)
12
+ logger.remove()
13
+ logger.add(
14
+ sys.stderr,
15
+ format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
16
+ level="INFO"
17
+ )
18
+
19
+ # Custom sink to relay logs to the UI jobs_data
20
def ui_log_sink(message):
    """Loguru sink that copies a log record into the owning job's UI log list.

    The job id is taken from the record's bound ``extra["job_id"]`` first and
    from the ``current_job_id`` context variable otherwise. Records without a
    job id, or for a job id not present in ``jobs_data``, are ignored.
    """
    record = message.record
    # Try extra first (bind), then contextvar
    job_id = record["extra"].get("job_id") or current_job_id.get()
    if not job_id or job_id not in jobs_data:
        return

    level = record["level"].name
    jobs_data[job_id]["logs"].append({
        "time": time.strftime("%H:%M:%S"),
        # Message is prefixed with its level for display in the UI
        "msg": f"[{level}] {record['message']}",
        # Only ERROR / CRITICAL records are flagged as unsuccessful
        "success": level not in ("ERROR", "CRITICAL"),
    })
39
+
40
+ # Redirect standard library logging to Loguru
41
class PropagateHandler(logging.Handler):
    """Standard-library logging handler that forwards every record to Loguru."""

    def emit(self, record):
        """Re-emit *record* through the Loguru ``logger``."""
        # Use the Loguru level with the same name when one exists; otherwise
        # pass the numeric stdlib level through.
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Skip the frames that belong to the logging module itself so the
        # depth handed to Loguru points past them.
        depth = 2
        frame = logging.currentframe()
        while frame and frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1

        forwarder = logger.opt(depth=depth, exception=record.exc_info)
        forwarder.log(level, record.getMessage())
54
+
55
def setup_library_interception():
    """Route standard-library logging through ``PropagateHandler``.

    The root logger is reconfigured (``force=True``) with a single
    ``PropagateHandler`` at level 0. Each of the PDF library loggers then gets
    its own handler and stops propagating to the root logger.
    """
    logging.basicConfig(handlers=[PropagateHandler()], level=0, force=True)
    # Target specific verbose libraries if needed
    for library_name in ("pymupdf", "fitz", "pymupdf4llm"):
        library_logger = logging.getLogger(library_name)
        library_logger.handlers = [PropagateHandler()]
        library_logger.propagate = False
62
+
63
+ setup_library_interception()
64
+ logger.add(ui_log_sink, level="INFO")
@@ -0,0 +1,44 @@
1
+
2
+ from opentelemetry import trace
3
+ from opentelemetry.sdk.trace import TracerProvider
4
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
5
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
6
+ from .logger import logger
7
+
8
+ _tracer_initialized = False
9
+
10
def init_telemetry(endpoint: str = "http://localhost:6006/v1/traces", enabled: bool = True):
    """
    Initialize OpenTelemetry tracing.
    By default, it points to a local Arize Phoenix instance.

    The host is probed with a one-second timeout first. If nothing answers,
    tracing is skipped so that background export threads do not emit
    connection errors. Initialization happens at most once per process; any
    failure is logged at debug level and the application runs without tracing.

    Args:
        endpoint: OTLP/HTTP traces endpoint to export spans to.
        enabled: When False the function returns without doing anything.
    """
    global _tracer_initialized
    if _tracer_initialized or not enabled:
        return

    try:
        # Quick check: Is the telemetry server even there?
        import urllib.error
        import urllib.request

        # We just check the base URL, not the full /v1/traces path necessarily.
        base_url = endpoint.split("/v1/")[0]
        try:
            with urllib.request.urlopen(base_url, timeout=1):
                pass
        except urllib.error.HTTPError as http_err:
            # urlopen raises for 4xx/5xx, but an HTTP status (e.g. 404/405)
            # still proves that a server is listening on the host.
            http_err.close()
        except Exception:
            # If we can't connect in 1 second, it's likely not running.
            # We don't want to spam the user with ConnectionRefusedErrors in background threads.
            logger.info(f"Telemetry server not found at {base_url}. Skipping tracing.")
            return

        provider = TracerProvider()
        processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=endpoint))
        provider.add_span_processor(processor)
        trace.set_tracer_provider(provider)
        _tracer_initialized = True
        logger.info(f"Telemetry initialized, exporting to {endpoint}")
    except Exception as e:
        logger.debug(f"Failed to initialize telemetry: {e}. Running without tracing.")
42
+
43
def get_tracer(name: str = "autochunk"):
    """Return the OpenTelemetry tracer registered under *name*."""
    tracer = trace.get_tracer(name)
    return tracer
@@ -0,0 +1,199 @@
1
+
2
+ import re
3
+ from typing import List, Callable, Optional
4
+ from functools import lru_cache
5
+
6
+ # ============================================================================
7
+ # TOKENIZATION LAYER - Multi-Backend Support
8
+ # ============================================================================
9
+
10
+ # Global tokenizer cache for performance
11
+ _tokenizer_cache = {}
12
+
13
def _get_tiktoken_encoder(model: str = "cl100k_base"):
    """Get or create a tiktoken encoder (cached).

    Returns ``None`` when ``tiktoken`` cannot be imported. Successfully created
    encoders are stored in ``_tokenizer_cache`` keyed by encoding name.
    """
    if model in _tokenizer_cache:
        return _tokenizer_cache.get(model)

    try:
        import tiktoken
        _tokenizer_cache[model] = tiktoken.get_encoding(model)
    except ImportError:
        return None
    return _tokenizer_cache.get(model)
22
+
23
def get_tokens(text: str, tokenizer: str = "auto") -> List[str]:
    """
    Unified tokenization across all chunkers.

    Args:
        text: Input text to tokenize
        tokenizer: One of "auto", "tiktoken", "whitespace", "character"
            "auto" tries tiktoken first, falls back to whitespace

    Returns:
        List of token strings

    Raises:
        ImportError: If ``tokenizer="tiktoken"`` and no encoder is available.
    """
    if tokenizer in ("auto", "tiktoken"):
        encoder = _get_tiktoken_encoder()
        if encoder:
            # Each token id is decoded on its own so callers get strings
            return [encoder.decode([token_id]) for token_id in encoder.encode(text)]
        if tokenizer == "tiktoken":
            raise ImportError("tiktoken not installed. Run: pip install tiktoken")
        # "auto" without tiktoken falls through to whitespace splitting

    if tokenizer == "character":
        return list(text)

    # Default: whitespace + punctuation splitting; separators are kept as tokens
    pieces = re.split(r"(\s+|[.,!?;:'\"\(\)\[\]\{\}])", text)
    return [piece for piece in pieces if piece]
57
+
58
def decode_tokens(tokens: List[str]) -> str:
    """Unified detokenization: concatenate the token strings with no separator."""
    glue = ""
    return glue.join(tokens)
61
+
62
def count_tokens(text: str, tokenizer: str = "auto") -> int:
    """
    Count tokens using the specified tokenizer.

    For production RAG with GPT models, use tokenizer="tiktoken".

    Empty text counts as 0. When "auto" or "tiktoken" is requested but no
    tiktoken encoder is available, the whitespace/punctuation count is
    returned instead. That count excludes whitespace-only pieces.
    """
    if not text:
        return 0

    if tokenizer in ("auto", "tiktoken"):
        encoder = _get_tiktoken_encoder()
        if encoder:
            return len(encoder.encode(text))

    if tokenizer == "character":
        return len(text)

    # Fallback to whitespace tokenization
    pieces = re.split(r"(\s+|[.,!?;:'\"\(\)\[\]\{\}])", text)
    return sum(1 for piece in pieces if piece.strip())
81
+
82
def create_length_function(method: str = "token") -> Callable[[str], int]:
    """
    Factory for length functions (LangChain compatibility).

    Args:
        method: "token" (tiktoken), "char" (character count), "word" (word count).
            Any other value behaves like "token".

    Returns:
        A function that takes text and returns length
    """
    def _token_length(text):
        return count_tokens(text, tokenizer="auto")

    def _word_length(text):
        return len(text.split())

    if method == "char":
        return len
    if method == "word":
        return _word_length
    # "token" and every unrecognised method
    return _token_length
100
+
101
+ # ============================================================================
102
+ # SENTENCE SPLITTING - Multi-Backend Support
103
+ # ============================================================================
104
+
105
def split_sentences(text: str, backend: str = "auto") -> List[str]:
    """
    Sentence Splitting with multiple backends.

    Args:
        text: Input text
        backend: "auto", "nltk", "spacy", "regex".
            "auto" tries NLTK and falls back to the regex splitter on any error.

    Returns:
        List of sentences (empty for empty or whitespace-only text)

    Raises:
        ImportError: If ``backend="spacy"`` and spaCy is not installed.
        Exception: With ``backend="nltk"``, whatever NLTK raised is re-raised.
    """
    if not (text and text.strip()):
        return []

    if backend in ("auto", "nltk"):
        try:
            import nltk

            # Accept either punkt resource; download punkt_tab only when neither exists
            try:
                nltk.data.find('tokenizers/punkt_tab')
            except LookupError:
                try:
                    nltk.data.find('tokenizers/punkt')
                except LookupError:
                    nltk.download('punkt_tab', quiet=True)
            return nltk.sent_tokenize(text)
        except Exception:
            if backend == "nltk":
                raise
            # "auto": fall through to regex

    if backend == "spacy":
        try:
            import spacy

            pipeline = spacy.blank("en")
            pipeline.add_pipe("sentencizer")
            return [span.text.strip() for span in pipeline(text).sents]
        except ImportError:
            raise ImportError("spacy not installed. Run: pip install spacy")

    # Regex fallback: split on whitespace that follows . ! or ?, with negative
    # lookbehinds that skip common abbreviation shapes.
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=[.!?])\s+'
    trimmed = (piece.strip() for piece in re.split(boundary, text.strip()))
    return [piece for piece in trimmed if piece]
150
+
151
def whitespace_tokens(text: str) -> List[str]:
    """Split on whitespace runs, keeping the runs as tokens for reconstruction."""
    pieces = re.split(r"(\s+)", text)
    return list(filter(None, pieces))
154
+
155
+ # ============================================================================
156
+ # TEXT UTILITIES
157
+ # ============================================================================
158
+
159
def get_char_to_token_map(text: str) -> List[int]:
    """
    Create a mapping from character index to token index.
    Useful for tracking start indices in chunks.

    Entry ``k`` of the result is the index of the token (as produced by
    ``get_tokens(text)``) covering position ``k`` in the concatenation of the
    token strings.

    NOTE(review): positions line up with ``text`` only when the token strings
    concatenate back to ``text`` exactly; confirm this for the tiktoken backend
    on non-ASCII input.
    """
    tokens = get_tokens(text)
    # One entry per character of each token, all pointing at that token's index
    return [index for index, token in enumerate(tokens) for _ in range(len(token))]
176
+
177
def extract_code_blocks(text: str) -> List[dict]:
    """
    Extract fenced code blocks from markdown text.
    Returns list of {"start": int, "end": int, "content": str, "language": str}

    ``start``/``end`` are character offsets of the whole fenced block (fences
    included). ``language`` is the first word of the fence's info string, or
    "text" when the info string is empty.
    """
    # The info string may contain non-word characters ("c++", "objective-c",
    # "python title=x"). Matching it with \w* would reject the opening fence and
    # let the regex pair the CLOSING fence with later text instead.
    pattern = r'```([^\n`]*)\n(.*?)```'
    blocks = []
    for match in re.finditer(pattern, text, re.DOTALL):
        info_words = match.group(1).split()
        blocks.append({
            "start": match.start(),
            "end": match.end(),
            "content": match.group(2),
            "language": info_words[0] if info_words else "text"
        })
    return blocks
192
+
193
def is_inside_code_block(text: str, position: int) -> bool:
    """Check if a character position is inside a fenced code block.

    A position counts as inside when ``start <= position < end`` for any block
    returned by ``extract_code_blocks(text)``.
    """
    return any(
        span["start"] <= position < span["end"]
        for span in extract_code_blocks(text)
    )
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: autochunks
3
+ Version: 0.0.8
4
+ Summary: Autonomous Retrieval Optimization for RAG
5
+ Author: Sumit Joshi
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/s8ilabs/AutoChunks
8
+ Project-URL: Documentation, https://autochunks.readthedocs.io/
9
+ Project-URL: Repository, https://github.com/s8ilabs/AutoChunks
10
+ Project-URL: Issues, https://github.com/s8ilabs/AutoChunks/issues
11
+ Keywords: rag,chunking,retrieval,nlp
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: numpy>=1.24
22
+ Requires-Dist: pyyaml>=6.0
23
+ Requires-Dist: loguru
24
+ Requires-Dist: arize-phoenix>=4.3.0
25
+ Requires-Dist: opentelemetry-api
26
+ Requires-Dist: opentelemetry-sdk
27
+ Requires-Dist: opentelemetry-exporter-otlp
28
+ Requires-Dist: nltk
29
+ Requires-Dist: pymupdf4llm
30
+ Requires-Dist: sentence-transformers
31
+ Requires-Dist: torch
32
+ Requires-Dist: fastapi
33
+ Requires-Dist: uvicorn
34
+ Requires-Dist: python-multipart
35
+ Dynamic: license-file
36
+
37
+ # AutoChunks
38
+ ### The Intelligent Data Optimization Layer for RAG Engineering
39
+
40
+ [![Version](https://img.shields.io/badge/version-0.0.8--alpha-blue)](https://github.com/s8ilabs/AutoChunks)
41
+ [![Documentation](https://img.shields.io/badge/docs-read--the--docs-teal)](https://autochunks.readthedocs.io/)
42
+ [![License](https://img.shields.io/badge/license-Apache--2.0-green)](LICENSE)
43
+
44
+ ![AutoChunks Hero](docs/assets/hero_banner.png)
45
+
46
+ AutoChunks is a specialized engine designed to eliminate the guesswork from Retrieval-Augmented Generation (RAG). By treating chunking as an optimization problem rather than a set of heuristics, it empirically discovers the most performant data structures for your specific documents and retrieval models.
47
+
48
+ ---
49
+
50
+ ## From Heuristics to Evidence
51
+
52
+ Most RAG pipelines today rely on arbitrary settings—like a 512-token chunk size with a 10% overlap. These values are often chosen without validation, leading to:
53
+
54
+ * **Fragmented Context**: Related information is split across multiple retrieval units.
55
+ * **Semantic Noise**: Poorly defined boundaries dilute the signal-to-noise ratio in LLM prompts.
56
+ * **Retrieval Gaps**: Critical information hidden in "dead zones" between chunks results in recall failure.
57
+
58
+ **AutoChunks replaces trial-and-error with a data-driven tournament.** It generates adversarial synthetic ground truth from your documents and pits 15+ chunking strategies against each other to find the mathematical optimum for your corpus.
59
+
60
+ ---
61
+
62
+ ## Core Pillars
63
+
64
+ ### The Vectorized Tournament
65
+ AutoChunks runs an exhaustive parallel search across multiple strategy families—Recursive, Semantic, Layout-Aware, and Hybrid. Every candidate is evaluated in a high-speed NumPy-accelerated retrieval simulation, measuring performance across hundreds of queries in seconds.
66
+
67
+ ### Adversarial Synthetic QA
68
+ The system performs a structural audit of your documents to generate "needle-in-a-haystack" question-answer pairs. This ensures that your chunking strategy is optimized against real-world search intent, not just random text splits.
69
+
70
+ ### Optimization Goals
71
+ Align your data engineering with your business objectives. Choose from intent-based presets that guide the engine toward specific outcomes:
72
+ * **Balanced Ranking**: Optimizes for general-purpose retrieval quality.
73
+ * **Speed and Precision**: Minimizes LLM reading time by prioritizing Rank #1 hits.
74
+ * **Comprehensive Retrieval**: Prioritizes recall for compliance or legal use cases.
75
+ * **Cost Efficiency**: Minimizes vector storage and inference costs for massive datasets.
76
+
77
+ ---
78
+
79
+ ## Advanced Feature Set
80
+
81
+ * **Hybrid Semantic-Statistical Chunker**: Uses real-time embedding distance analysis to detect topic shifts while maintaining strict token limits.
82
+ * **Framework Bridges**: Native adapters for LangChain, LlamaIndex, and Haystack, allowing you to benchmark and optimize your existing framework code directly.
83
+ * **Layout-Aware Processing**: High-fidelity extraction that respects the nested structures of PDFs, HTML sections, and Markdown hierarchies.
84
+ * **Fidelity Inspector**: A visual debugging dashboard to qualitatively verify how different strategies fragment complex documents.
85
+ * **Enterprise Security**: Air-gap compatible. Supports local model deployment, SHA-256 binary fingerprinting for data privacy, and SecretStr protection for all cloud credentials.
86
+
87
+ ---
88
+
89
+ ## Quick Start
90
+
91
+ ### Installation
92
+ ```bash
93
+ pip install -r requirements.txt
94
+ ```
95
+ *Note: For GPU acceleration with Local Embeddings or Ragas, please refer to the [Getting Started guide](docs/getting_started.md).*
96
+
97
+ ### Launch the Dashboard
98
+ The most effective way to optimize your data is through the visual interactive dashboard.
99
+ ```bash
100
+ python -m autochunk.web.server
101
+ ```
102
+ Navigate to `http://localhost:8000` to begin your first optimization run.
103
+
104
+ ### Python API
105
+ ```python
106
+ from autochunk import AutoChunker
107
+
108
+ # Initialize in Light Mode for rapid iteration
109
+ optimizer = AutoChunker(mode="light")
110
+
111
+ # Discover the optimal plan for your dataset
112
+ plan, report = optimizer.optimize(
113
+ documents_path="./my_data_folder",
114
+ objective="balanced"
115
+ )
116
+
117
+ # Apply the winning strategy
118
+ chunks = plan.apply("./new_documents", optimizer)
119
+ ```
120
+
121
+ ---
122
+
123
+ ## Documentation and Resources
124
+
125
+ * [Getting Started](docs/getting_started.md)
126
+ * [The Optimization Lifecycle](docs/core_concepts/eval_flow.md)
127
+ * [Metric Definitions and Scoring](docs/core_concepts/evaluation.md)
128
+ * [RAGAS Semantic Evaluation](docs/guides/ragas_evaluation.md)
129
+
130
+ ---
131
+
132
+ Developed for the RAG and LLM Community.
133
+ AutoChunks is released under the Apache License 2.0.
@@ -0,0 +1,61 @@
1
+ autochunk/__init__.py,sha256=tu8KBFUMu7eymVuURj7np8T_3dXTvv7JQnRcJ7BoSbM,572
2
+ autochunk/__main__.py,sha256=e_fn8GhFWu_dqslEnOKv13ueXdYWekoBFsslltAP4ZI,67
3
+ autochunk/autochunker.py,sha256=CNwCxqym25xeHAU17T2ZJM0psXpa0MrfzpP19mhX8VQ,33759
4
+ autochunk/cli.py,sha256=LQDf_D94vdtwpOsKYtGA_lF6FwFvfFK6jqU--PSVbQs,5632
5
+ autochunk/config.py,sha256=2O_lPanpp3_uLDpbvsRbrbuHy4p9O3Z2CStfzezgTsw,2891
6
+ autochunk/adapters/__init__.py,sha256=K49nSsmEy7vhlf28SzCEQQIXn6NxX2ir8M4D7XlxOGo,150
7
+ autochunk/adapters/haystack.py,sha256=pJuFOGkdpeD6GY7QafeX39lzPOgIP4McZDoL071aUgw,2230
8
+ autochunk/adapters/langchain.py,sha256=HQwqEeuUw64rl6Nh_UkjlB1vjF72LR4UzpULDBVThfk,3061
9
+ autochunk/adapters/llamaindex.py,sha256=62Yfb1VylhF9Ws86h6l70Bpe3JpHqJlSnH2THRztbEg,3213
10
+ autochunk/chunkers/__init__.py,sha256=Asb2U-lEjZyW5AbLCCPhg1xCpBMw4IXJXs6nDbus_gI,3262
11
+ autochunk/chunkers/agentic.py,sha256=rR2Og5c3zmyNciVw_aU_AXcue4lrEtonzFy46uQEU2Y,7687
12
+ autochunk/chunkers/base.py,sha256=MBjwgZyywHb1e8Oe5J5e1gF8_XltxnV2oKyKuZvPSpc,339
13
+ autochunk/chunkers/contextual_retrieval.py,sha256=bxfqBmM0x2yngnZdj1Qg9oIaNSAHtRHj5RwVjEi1soY,5993
14
+ autochunk/chunkers/fixed_length.py,sha256=PI6qcEg3XXUHnhbBP9MZHjD73meyHquJv-ljmClVgQc,3686
15
+ autochunk/chunkers/html_section.py,sha256=rNXTCjcy3emg2mGLuICqxbGUC1Uhfq6v73WvvyUs16M,9598
16
+ autochunk/chunkers/hybrid_semantic_stat.py,sha256=sZinkrtj08DSP9uFXjYZlJ1H2SUyaqmkYsL7LUnxgik,7779
17
+ autochunk/chunkers/layout_aware.py,sha256=H5lhvfgsYU9-fnost-WoRFLD6O2ByMo26ZD62iN44s8,7704
18
+ autochunk/chunkers/parent_child.py,sha256=pFDFj-b_3g77o9A4JyJ-N69ObkWUOSSVk8So-1aA8cQ,6570
19
+ autochunk/chunkers/proposition.py,sha256=glJsZ_ysd6I6ijtuBDCRv16oWBGu1K7boGmB7Tpx368,6317
20
+ autochunk/chunkers/python_ast.py,sha256=ekPjxhxjgAFoLSlmHqi1sdeeCMjSYnkjWgDkfjQcuFw,9563
21
+ autochunk/chunkers/recursive_character.py,sha256=ayRrjYsTDZvaLZY5V6cwViAIqlZ_TbPrsgBDfoaMidk,8975
22
+ autochunk/chunkers/semantic_local.py,sha256=IpYbs7UDKa-bNM5UkWzRGboSxkfGnPEujS-qdluapBY,5988
23
+ autochunk/chunkers/sentence_aware.py,sha256=EPnhNBS_OY8kN6fCTv3Yx_oFLfHFZaw2X2boSwapjoo,4064
24
+ autochunk/embedding/__init__.py,sha256=Ip8EGbBkbFUbjfhlcZy5jXBaI5uwq8glEuYLmk0nDN4,822
25
+ autochunk/embedding/adapter.py,sha256=jj0v-Oc3LJSWHBgVJ-A-_4jo8Bh03ymDjURPTAwUNJw,320
26
+ autochunk/embedding/base.py,sha256=xmcNLZeiyAYMfsddq1qh_YmmTQAoDnZ-V5phH0kZjJc,789
27
+ autochunk/embedding/hashing.py,sha256=uSB8tYiGcd-2hlayx3NbdL9mQITMXPYW4lO6-apkMfs,1174
28
+ autochunk/embedding/local.py,sha256=U1gj2EwSn0ycEDt1WhcSVc951pJSZehlEzds1h93WYc,7100
29
+ autochunk/embedding/ollama.py,sha256=-POiPUUx6qvCxkXuL2hXLzXoUcs5kZMOtF1jzw-vlLQ,2421
30
+ autochunk/embedding/openai.py,sha256=d-Dlgk1lqfikx7yb9Aig1bTuLhOCI0_eY1EDyp2aImE,2080
31
+ autochunk/embedding/tokenizer.py,sha256=gcdS4rtoW54Y8ANhY00Ctd6wkdQ535eF8MrXQ0qF8Fs,188
32
+ autochunk/enrichment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ autochunk/enrichment/contextual.py,sha256=S6pewHKr8Z6pGsqv8qbQcOsMnb9qpAUA7_fOFxLpu8g,1143
34
+ autochunk/eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
+ autochunk/eval/harness.py,sha256=XP2EldIbNEEeIMcHAKwbSPtjuFrm4Qn4xizFpIqp-e4,7952
36
+ autochunk/eval/metrics.py,sha256=u2MblvFpoLZbAXC0YDOP7mO8r3eJJ9YxDJgMVBXRd0o,731
37
+ autochunk/eval/ragas_eval.py,sha256=dBq-TRPtnpKaUC3BBaiTFMPZodmH7XCzHhCqAv7FPho,10573
38
+ autochunk/eval/synthetic.py,sha256=ecOSh60gPyXYJ8mC_6W2CTePUNdh91zQpYQ3Qfp34-g,4036
39
+ autochunk/quality/__init__.py,sha256=Bsfch7Icm8syUa-g-VR0z7WlUIT4MPw8bddRSDNj93U,823
40
+ autochunk/quality/deduplicator.py,sha256=zYFQE2CJdNN8foZ5SpDJQRtgetHrIJ6D0H33_Vj7qjY,14046
41
+ autochunk/quality/overlap_optimizer.py,sha256=jDX4w4gz57eL-z1tj8cThbT4z6tfXUq0BxprDJTFUuI,15719
42
+ autochunk/quality/post_processor.py,sha256=NSvZ13pqIlR0erY0zs8dZxm2X1luP1njzH-x2N5LLtk,9882
43
+ autochunk/quality/scorer.py,sha256=yK7GkT_ZO1yN6ApMpcH0QMu6vkXteZm5fxiK41rWe-E,18781
44
+ autochunk/retrieval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
+ autochunk/retrieval/in_memory.py,sha256=RGKGc-_C06j4PzKSFj92DhaUmU315P9K1rCGq40eYms,1682
46
+ autochunk/retrieval/parent_child.py,sha256=SVeuVvIqiHdXk78EAqKGvKAiCRIOT0wf84TC1dEWl5w,147
47
+ autochunk/storage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
+ autochunk/storage/cache.py,sha256=O9nOHkV9C8Lsl3bUmNqqRmPLTDQus7B5IlVY1l-0EAc,1047
49
+ autochunk/storage/plan.py,sha256=J3vf3Lqi6Zc-HLNHOINuPoYv-A3bSxbpzUCXaamdRtQ,1434
50
+ autochunk/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
+ autochunk/utils/hashing.py,sha256=lz4I4EPyulfc3ZUIS4X-vhZNsAwC9Ehx32NSsw5Ko6Y,178
52
+ autochunk/utils/io.py,sha256=di2UPiplB1LWpiNpbVCC250K66w2NGPpU_M2rC3kDa4,7943
53
+ autochunk/utils/logger.py,sha256=kzNkxWfTF6ivEE5WQJKa77RNvY2HpYCybzxgH5yan0E,2202
54
+ autochunk/utils/telemetry.py,sha256=EdYylC-rOTF1beL84AQRLN11nn0QC4X_qwNLYVmlud8,1854
55
+ autochunk/utils/text.py,sha256=prp0oeclNPZMHMAROorGIlthwAVjZZ8Ei0O9uywDQ1I,6474
56
+ autochunks-0.0.8.dist-info/licenses/LICENSE,sha256=QWYXD3-1Anc9MANMS_BgZjU3rZpfeSKGwulkQVdUsn4,572
57
+ autochunks-0.0.8.dist-info/METADATA,sha256=RsNKhIJifOI_0SmlkqITX3WAG85nw168uWbGEDKiFns,5865
58
+ autochunks-0.0.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
59
+ autochunks-0.0.8.dist-info/entry_points.txt,sha256=h1jlX_VnUy2h8FV0Xn2lAjOaofA_ogRMWCOTYORN0eo,50
60
+ autochunks-0.0.8.dist-info/top_level.txt,sha256=KHX0veKaI0ukg1x8Yv_5Pmjy8VI0LUH6KT8e-6KakHU,10
61
+ autochunks-0.0.8.dist-info/RECORD,,