autochunks 0.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunks-0.0.8/LICENSE +15 -0
- autochunks-0.0.8/PKG-INFO +133 -0
- autochunks-0.0.8/README.md +97 -0
- autochunks-0.0.8/autochunk/__init__.py +9 -0
- autochunks-0.0.8/autochunk/__main__.py +5 -0
- autochunks-0.0.8/autochunk/adapters/__init__.py +3 -0
- autochunks-0.0.8/autochunk/adapters/haystack.py +68 -0
- autochunks-0.0.8/autochunk/adapters/langchain.py +81 -0
- autochunks-0.0.8/autochunk/adapters/llamaindex.py +94 -0
- autochunks-0.0.8/autochunk/autochunker.py +606 -0
- autochunks-0.0.8/autochunk/chunkers/__init__.py +100 -0
- autochunks-0.0.8/autochunk/chunkers/agentic.py +184 -0
- autochunks-0.0.8/autochunk/chunkers/base.py +16 -0
- autochunks-0.0.8/autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunks-0.0.8/autochunk/chunkers/fixed_length.py +110 -0
- autochunks-0.0.8/autochunk/chunkers/html_section.py +225 -0
- autochunks-0.0.8/autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunks-0.0.8/autochunk/chunkers/layout_aware.py +192 -0
- autochunks-0.0.8/autochunk/chunkers/parent_child.py +172 -0
- autochunks-0.0.8/autochunk/chunkers/proposition.py +175 -0
- autochunks-0.0.8/autochunk/chunkers/python_ast.py +248 -0
- autochunks-0.0.8/autochunk/chunkers/recursive_character.py +215 -0
- autochunks-0.0.8/autochunk/chunkers/semantic_local.py +140 -0
- autochunks-0.0.8/autochunk/chunkers/sentence_aware.py +102 -0
- autochunks-0.0.8/autochunk/cli.py +135 -0
- autochunks-0.0.8/autochunk/config.py +76 -0
- autochunks-0.0.8/autochunk/embedding/__init__.py +22 -0
- autochunks-0.0.8/autochunk/embedding/adapter.py +14 -0
- autochunks-0.0.8/autochunk/embedding/base.py +33 -0
- autochunks-0.0.8/autochunk/embedding/hashing.py +42 -0
- autochunks-0.0.8/autochunk/embedding/local.py +154 -0
- autochunks-0.0.8/autochunk/embedding/ollama.py +66 -0
- autochunks-0.0.8/autochunk/embedding/openai.py +62 -0
- autochunks-0.0.8/autochunk/embedding/tokenizer.py +9 -0
- autochunks-0.0.8/autochunk/enrichment/__init__.py +0 -0
- autochunks-0.0.8/autochunk/enrichment/contextual.py +29 -0
- autochunks-0.0.8/autochunk/eval/__init__.py +0 -0
- autochunks-0.0.8/autochunk/eval/harness.py +177 -0
- autochunks-0.0.8/autochunk/eval/metrics.py +27 -0
- autochunks-0.0.8/autochunk/eval/ragas_eval.py +234 -0
- autochunks-0.0.8/autochunk/eval/synthetic.py +104 -0
- autochunks-0.0.8/autochunk/quality/__init__.py +31 -0
- autochunks-0.0.8/autochunk/quality/deduplicator.py +326 -0
- autochunks-0.0.8/autochunk/quality/overlap_optimizer.py +402 -0
- autochunks-0.0.8/autochunk/quality/post_processor.py +245 -0
- autochunks-0.0.8/autochunk/quality/scorer.py +459 -0
- autochunks-0.0.8/autochunk/retrieval/__init__.py +0 -0
- autochunks-0.0.8/autochunk/retrieval/in_memory.py +47 -0
- autochunks-0.0.8/autochunk/retrieval/parent_child.py +4 -0
- autochunks-0.0.8/autochunk/storage/__init__.py +0 -0
- autochunks-0.0.8/autochunk/storage/cache.py +34 -0
- autochunks-0.0.8/autochunk/storage/plan.py +40 -0
- autochunks-0.0.8/autochunk/utils/__init__.py +0 -0
- autochunks-0.0.8/autochunk/utils/hashing.py +8 -0
- autochunks-0.0.8/autochunk/utils/io.py +176 -0
- autochunks-0.0.8/autochunk/utils/logger.py +64 -0
- autochunks-0.0.8/autochunk/utils/telemetry.py +44 -0
- autochunks-0.0.8/autochunk/utils/text.py +199 -0
- autochunks-0.0.8/autochunks.egg-info/PKG-INFO +133 -0
- autochunks-0.0.8/autochunks.egg-info/SOURCES.txt +65 -0
- autochunks-0.0.8/autochunks.egg-info/dependency_links.txt +1 -0
- autochunks-0.0.8/autochunks.egg-info/entry_points.txt +2 -0
- autochunks-0.0.8/autochunks.egg-info/requires.txt +14 -0
- autochunks-0.0.8/autochunks.egg-info/top_level.txt +1 -0
- autochunks-0.0.8/pyproject.toml +47 -0
- autochunks-0.0.8/setup.cfg +11 -0
autochunks-0.0.8/LICENSE
ADDED
@@ -0,0 +1,15 @@
Apache License 2.0

Copyright 2026 Sumit Joshi

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
autochunks-0.0.8/PKG-INFO
ADDED
@@ -0,0 +1,133 @@
Metadata-Version: 2.4
Name: autochunks
Version: 0.0.8
Summary: Autonomous Retrieval Optimization for RAG
Author: Sumit Joshi
License: Apache-2.0
Project-URL: Homepage, https://github.com/s8ilabs/AutoChunks
Project-URL: Documentation, https://autochunks.readthedocs.io/
Project-URL: Repository, https://github.com/s8ilabs/AutoChunks
Project-URL: Issues, https://github.com/s8ilabs/AutoChunks/issues
Keywords: rag,chunking,retrieval,nlp
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy>=1.24
Requires-Dist: pyyaml>=6.0
Requires-Dist: loguru
Requires-Dist: arize-phoenix>=4.3.0
Requires-Dist: opentelemetry-api
Requires-Dist: opentelemetry-sdk
Requires-Dist: opentelemetry-exporter-otlp
Requires-Dist: nltk
Requires-Dist: pymupdf4llm
Requires-Dist: sentence-transformers
Requires-Dist: torch
Requires-Dist: fastapi
Requires-Dist: uvicorn
Requires-Dist: python-multipart
Dynamic: license-file

# AutoChunks

### The Intelligent Data Optimization Layer for RAG Engineering

[](https://github.com/s8ilabs/AutoChunks)
[](https://autochunks.readthedocs.io/)
[](LICENSE)



AutoChunks is a specialized engine designed to eliminate the guesswork from Retrieval-Augmented Generation (RAG). By treating chunking as an optimization problem rather than a set of heuristics, it empirically discovers the most performant data structures for your specific documents and retrieval models.

---

## From Heuristics to Evidence

Most RAG pipelines today rely on arbitrary settings, such as a 512-token chunk size with a 10% overlap. These values are often chosen without validation, leading to:

* **Fragmented Context**: Related information is split across multiple retrieval units.
* **Semantic Noise**: Poorly defined boundaries dilute the signal-to-noise ratio in LLM prompts.
* **Retrieval Gaps**: Critical information hidden in "dead zones" between chunks results in recall failure.

**AutoChunks replaces trial and error with a data-driven tournament.** It generates adversarial synthetic ground truth from your documents and pits more than 15 chunking strategies against each other to find the mathematical optimum for your corpus.

---

## Core Pillars

### The Vectorized Tournament
AutoChunks runs an exhaustive parallel search across multiple strategy families: Recursive, Semantic, Layout-Aware, and Hybrid. Every candidate is evaluated in a high-speed NumPy-accelerated retrieval simulation, measuring performance across hundreds of queries in seconds.

### Adversarial Synthetic QA
The system performs a structural audit of your documents to generate "needle-in-a-haystack" question-answer pairs. This ensures that your chunking strategy is optimized against real-world search intent, not just random text splits.

### Optimization Goals
Align your data engineering with your business objectives. Choose from intent-based presets that guide the engine toward specific outcomes:
* **Balanced Ranking**: Optimizes for general-purpose retrieval quality.
* **Speed and Precision**: Minimizes LLM reading time by prioritizing Rank #1 hits.
* **Comprehensive Retrieval**: Prioritizes recall for compliance or legal use cases.
* **Cost Efficiency**: Minimizes vector storage and inference costs for massive datasets.

---

## Advanced Feature Set

* **Hybrid Semantic-Statistical Chunker**: Uses real-time embedding distance analysis to detect topic shifts while maintaining strict token limits.
* **Framework Bridges**: Native adapters for LangChain, LlamaIndex, and Haystack, allowing you to benchmark and optimize your existing framework code directly.
* **Layout-Aware Processing**: High-fidelity extraction that respects the nested structures of PDFs, HTML sections, and Markdown hierarchies.
* **Fidelity Inspector**: A visual debugging dashboard to qualitatively verify how different strategies fragment complex documents.
* **Enterprise Security**: Air-gap compatible. Supports local model deployment, SHA-256 binary fingerprinting for data privacy, and SecretStr protection for all cloud credentials.

---

## Quick Start

### Installation
```bash
pip install -r requirements.txt
```
*Note: For GPU acceleration with Local Embeddings or Ragas, please refer to the [Getting Started guide](docs/getting_started.md).*

### Launch the Dashboard
The most effective way to optimize your data is through the visual interactive dashboard.
```bash
python -m autochunk.web.server
```
Navigate to `http://localhost:8000` to begin your first optimization run.

### Python API
```python
from autochunk import AutoChunker

# Initialize in Light Mode for rapid iteration
optimizer = AutoChunker(mode="light")

# Discover the optimal plan for your dataset
plan, report = optimizer.optimize(
    documents_path="./my_data_folder",
    objective="balanced"
)

# Apply the winning strategy
chunks = plan.apply("./new_documents", optimizer)
```

---

## Documentation and Resources

* [Getting Started](docs/getting_started.md)
* [The Optimization Lifecycle](docs/core_concepts/eval_flow.md)
* [Metric Definitions and Scoring](docs/core_concepts/evaluation.md)
* [RAGAS Semantic Evaluation](docs/guides/ragas_evaluation.md)

---

Developed for the RAG and LLM Community.
AutoChunks is released under the Apache License 2.0.
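The Quick Start above returns a `plan` that the framework adapters exported from `autochunk` can consume directly. Below is a minimal sketch of that handoff, not an official recipe: the corpus path and document text are placeholders, and it assumes `optimize` succeeds as shown in the Quick Start.

```python
from autochunk import AutoChunker, AutoChunkLangChainAdapter
from langchain_core.documents import Document

# Run the tournament once and keep the winning plan
# ("./my_data_folder" is a placeholder corpus path).
optimizer = AutoChunker(mode="light")
plan, report = optimizer.optimize(
    documents_path="./my_data_folder",
    objective="balanced",
)

# Hand the winning plan to an existing LangChain pipeline; the adapter
# accepts either a Plan object or a path to a saved plan.
adapter = AutoChunkLangChainAdapter(plan)
chunks = adapter.transform_documents(
    [Document(page_content="Some long source text ...", metadata={"source": "doc-1"})]
)
```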
autochunks-0.0.8/README.md
ADDED
@@ -0,0 +1,97 @@
# AutoChunks

### The Intelligent Data Optimization Layer for RAG Engineering

[](https://github.com/s8ilabs/AutoChunks)
[](https://autochunks.readthedocs.io/)
[](LICENSE)



AutoChunks is a specialized engine designed to eliminate the guesswork from Retrieval-Augmented Generation (RAG). By treating chunking as an optimization problem rather than a set of heuristics, it empirically discovers the most performant data structures for your specific documents and retrieval models.

---

## From Heuristics to Evidence

Most RAG pipelines today rely on arbitrary settings, such as a 512-token chunk size with a 10% overlap. These values are often chosen without validation, leading to:

* **Fragmented Context**: Related information is split across multiple retrieval units.
* **Semantic Noise**: Poorly defined boundaries dilute the signal-to-noise ratio in LLM prompts.
* **Retrieval Gaps**: Critical information hidden in "dead zones" between chunks results in recall failure.

**AutoChunks replaces trial and error with a data-driven tournament.** It generates adversarial synthetic ground truth from your documents and pits more than 15 chunking strategies against each other to find the mathematical optimum for your corpus.

---

## Core Pillars

### The Vectorized Tournament
AutoChunks runs an exhaustive parallel search across multiple strategy families: Recursive, Semantic, Layout-Aware, and Hybrid. Every candidate is evaluated in a high-speed NumPy-accelerated retrieval simulation, measuring performance across hundreds of queries in seconds.

### Adversarial Synthetic QA
The system performs a structural audit of your documents to generate "needle-in-a-haystack" question-answer pairs. This ensures that your chunking strategy is optimized against real-world search intent, not just random text splits.

### Optimization Goals
Align your data engineering with your business objectives. Choose from intent-based presets that guide the engine toward specific outcomes:
* **Balanced Ranking**: Optimizes for general-purpose retrieval quality.
* **Speed and Precision**: Minimizes LLM reading time by prioritizing Rank #1 hits.
* **Comprehensive Retrieval**: Prioritizes recall for compliance or legal use cases.
* **Cost Efficiency**: Minimizes vector storage and inference costs for massive datasets.

---

## Advanced Feature Set

* **Hybrid Semantic-Statistical Chunker**: Uses real-time embedding distance analysis to detect topic shifts while maintaining strict token limits.
* **Framework Bridges**: Native adapters for LangChain, LlamaIndex, and Haystack, allowing you to benchmark and optimize your existing framework code directly.
* **Layout-Aware Processing**: High-fidelity extraction that respects the nested structures of PDFs, HTML sections, and Markdown hierarchies.
* **Fidelity Inspector**: A visual debugging dashboard to qualitatively verify how different strategies fragment complex documents.
* **Enterprise Security**: Air-gap compatible. Supports local model deployment, SHA-256 binary fingerprinting for data privacy, and SecretStr protection for all cloud credentials.

---

## Quick Start

### Installation
```bash
pip install -r requirements.txt
```
*Note: For GPU acceleration with Local Embeddings or Ragas, please refer to the [Getting Started guide](docs/getting_started.md).*

### Launch the Dashboard
The most effective way to optimize your data is through the visual interactive dashboard.
```bash
python -m autochunk.web.server
```
Navigate to `http://localhost:8000` to begin your first optimization run.

### Python API
```python
from autochunk import AutoChunker

# Initialize in Light Mode for rapid iteration
optimizer = AutoChunker(mode="light")

# Discover the optimal plan for your dataset
plan, report = optimizer.optimize(
    documents_path="./my_data_folder",
    objective="balanced"
)

# Apply the winning strategy
chunks = plan.apply("./new_documents", optimizer)
```

---

## Documentation and Resources

* [Getting Started](docs/getting_started.md)
* [The Optimization Lifecycle](docs/core_concepts/eval_flow.md)
* [Metric Definitions and Scoring](docs/core_concepts/evaluation.md)
* [RAGAS Semantic Evaluation](docs/guides/ragas_evaluation.md)

---

Developed for the RAG and LLM Community.
AutoChunks is released under the Apache License 2.0.
autochunks-0.0.8/autochunk/__init__.py
ADDED
@@ -0,0 +1,9 @@
import warnings
# Suppress Pydantic v2 namespace conflicts common in docling models
warnings.filterwarnings("ignore", message='.*conflict with protected namespace "model_".*', category=UserWarning)

from .autochunker import AutoChunker
from .embedding.adapter import EmbeddingFn
from .config import AutoChunkConfig, EvalConfig, ProxyConfig, RetrievalStrategy, SafetyConstraints, ParallelConfig, TokenizerConfig, NetworkConfig, RagasConfig
from .adapters import AutoChunkLangChainAdapter, AutoChunkLlamaIndexAdapter, AutoChunkHaystackAdapter
from .storage.plan import Plan
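Since the package re-exports `Plan` alongside the adapters, a saved plan can be rehydrated without re-running the optimizer. A minimal sketch; the plan file name is hypothetical, and `Plan.read` is the same loader the adapters below call internally when given a string path.

```python
from autochunk import Plan, AutoChunkHaystackAdapter

# "autochunk_plan.json" is a hypothetical path to a previously saved plan.
plan = Plan.read("autochunk_plan.json")

# Equivalent shortcut: adapters accept the path and call Plan.read themselves.
adapter = AutoChunkHaystackAdapter(plan)
```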
autochunks-0.0.8/autochunk/adapters/haystack.py
ADDED
@@ -0,0 +1,68 @@
from __future__ import annotations
from typing import List, Dict, Any, Optional, Union
from ..storage.plan import Plan
from ..autochunker import AutoChunker

try:
    from haystack import component, Document
    HAYSTACK_AVAILABLE = True
except ImportError:
    # Robust fallback for environments without Haystack
    def component(cls): return cls
    def output_types(**kwargs):
        def decorator(func): return func
        return decorator
    component.output_types = output_types
    class Document: pass
    HAYSTACK_AVAILABLE = False


@component
class AutoChunkHaystackAdapter:
    """
    Official AutoChunks Adapter for Haystack 2.0.
    Acts as a Pipeline Component for optimized document splitting.
    """
    def __init__(self, plan: Union[Plan, str]):
        if isinstance(plan, str):
            self.plan = Plan.read(plan)
        else:
            self.plan = plan

        # Initialize the internal engine
        self.chunker = AutoChunker(
            embedding_provider=self.plan.embedding.get("name"),
            embedding_model_or_path=self.plan.embedding.get("model")
        )

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Implementation of the Haystack Component interface.
        """
        if not HAYSTACK_AVAILABLE:
            raise ImportError("Please install haystack-ai: pip install haystack-ai")

        # Convert Haystack docs to AutoChunks format
        ac_docs = []
        for d in documents:
            ac_docs.append({
                "id": str(getattr(d, "id", hash(d.content))),
                "text": d.content,
                "metadata": d.meta
            })

        # Process via the generator pipeline stored in the plan
        gen_name = self.plan.generator_pipeline.get("name")
        params = self.plan.generator_pipeline.get("params", {})
        ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)

        # Re-wrap as Haystack Documents
        return {
            "documents": [
                Document(
                    content=ch["text"],
                    meta={**ch.get("meta", {}), "autochunk_plan_id": self.plan.id}
                ) for ch in ac_chunks
            ]
        }
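For context, here is a sketch of how this component could sit in a Haystack 2.x pipeline. It is illustrative only: it assumes `haystack-ai` is installed, and the plan path is a placeholder.

```python
from haystack import Pipeline, Document
from autochunk import AutoChunkHaystackAdapter

# "plans/best.json" is a placeholder for a saved AutoChunks plan.
splitter = AutoChunkHaystackAdapter("plans/best.json")

# Register the adapter like any other Haystack 2.x component.
pipe = Pipeline()
pipe.add_component("autochunk_splitter", splitter)

result = pipe.run({"autochunk_splitter": {"documents": [
    Document(content="A long policy document ...", meta={"source": "policy.pdf"})
]}})
print(len(result["autochunk_splitter"]["documents"]))
```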
autochunks-0.0.8/autochunk/adapters/langchain.py
ADDED
@@ -0,0 +1,81 @@
from __future__ import annotations
from typing import List, Dict, Any, TYPE_CHECKING, Union
from ..storage.plan import Plan
from ..autochunker import AutoChunker, AutoChunkConfig

if TYPE_CHECKING:
    from langchain_core.documents import Document

try:
    from langchain_core.documents import BaseDocumentTransformer, Document
    LANGCHAIN_AVAILABLE = True
except ImportError:
    class BaseDocumentTransformer: pass
    LANGCHAIN_AVAILABLE = False


class AutoChunkLangChainAdapter(BaseDocumentTransformer):
    """
    Official AutoChunks Adapter for LangChain.
    Inherits from BaseDocumentTransformer for seamless integration
    into LangChain Indexing and LCEL pipelines.
    """
    def __init__(self, plan: Union[Plan, str], config: AutoChunkConfig = None):
        if isinstance(plan, str):
            self.plan = Plan.read(plan)
        else:
            self.plan = plan

        # Use a configured AutoChunker to execute the plan
        self.chunker = AutoChunker(
            embedding_provider=self.plan.embedding.get("name"),
            embedding_model_or_path=self.plan.embedding.get("model")
        )

    def transform_documents(self, documents: List[Document], **kwargs: Any) -> List[Document]:
        """
        Apply the optimized AutoChunks plan to a list of LangChain documents.
        This processes ALL documents provided.
        """
        try:
            from langchain_core.documents import Document
        except ImportError:
            raise ImportError("Please install langchain-core: pip install langchain-core")

        # Convert LangChain docs to AutoChunks format
        ac_docs = []
        for d in documents:
            # Use metadata.get('source', id(d)) as a unique doc_id
            doc_id = str(d.metadata.get("source", id(d)))
            ac_docs.append({
                "id": doc_id,
                "text": d.page_content,
                "metadata": d.metadata
            })

        # Run the execution pipeline
        gen_name = self.plan.generator_pipeline.get("name")
        params = self.plan.generator_pipeline.get("params", {})

        ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)

        # Convert back to LangChain docs
        lc_docs = []
        for ch in ac_chunks:
            # Preserve the original metadata and add chunking metadata.
            # If the original metadata was passed through, it may be nested or flat;
            # for now, assume a simple merge.
            meta = ch.get("meta", {}).copy()
            lc_docs.append(Document(
                page_content=ch["text"],
                metadata={**meta, "autochunk_plan_id": self.plan.id}
            ))

        return lc_docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Alias for transform_documents to match the TextSplitter interface."""
        return self.transform_documents(documents)

    def __call__(self, documents: List[Document]) -> List[Document]:
        return self.transform_documents(documents)
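A usage sketch under the same assumptions (a placeholder plan path, `langchain-core` installed); both entry points shown are defined in the adapter above.

```python
from langchain_core.documents import Document
from autochunk import AutoChunkLangChainAdapter

# "plans/best.json" is a placeholder for a saved AutoChunks plan.
splitter = AutoChunkLangChainAdapter("plans/best.json")

docs = [Document(page_content="A long source text ...", metadata={"source": "report.md"})]

# transform_documents and split_documents are equivalent here;
# split_documents mirrors the familiar TextSplitter interface.
chunks = splitter.split_documents(docs)
for c in chunks:
    print(c.metadata.get("autochunk_plan_id"), len(c.page_content))
```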
autochunks-0.0.8/autochunk/adapters/llamaindex.py
ADDED
@@ -0,0 +1,94 @@
from __future__ import annotations
from typing import List, Dict, Any, TYPE_CHECKING, Union
from ..storage.plan import Plan
from ..autochunker import AutoChunker

if TYPE_CHECKING:
    from llama_index.core.schema import BaseNode, Document

try:
    from llama_index.core.node_parser import NodeParser, BaseNodeParser
    from llama_index.core.schema import TextNode, BaseNode, Document
    LLAMA_INDEX_AVAILABLE = True
except ImportError:
    class BaseNodeParser: pass
    LLAMA_INDEX_AVAILABLE = False


class AutoChunkLlamaIndexAdapter(BaseNodeParser):
    """
    Official AutoChunks Adapter for LlamaIndex.
    Acts as a native NodeParser for seamless integration into IngestionPipelines.
    """
    def __init__(self, plan: Union[Plan, str]):
        if isinstance(plan, str):
            self.plan = Plan.read(plan)
        else:
            self.plan = plan

        self.chunker = AutoChunker(
            embedding_provider=self.plan.embedding.get("name"),
            embedding_model_or_path=self.plan.embedding.get("model")
        )

    def _parse_nodes(self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any) -> List[BaseNode]:
        """
        Internal implementation for the LlamaIndex BaseNodeParser interface.
        """
        # Convert Nodes to AutoChunks format
        ac_docs = []
        for n in nodes:
            ac_docs.append({
                "id": n.node_id,
                "text": n.get_content(),
                "metadata": n.metadata
            })

        # Run the execution pipeline
        gen_name = self.plan.generator_pipeline.get("name")
        params = self.plan.generator_pipeline.get("params", {})

        ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)

        # Convert back to LlamaIndex Nodes
        final_nodes = []
        for ch in ac_chunks:
            node = TextNode(
                text=ch["text"],
                metadata={**ch.get("meta", {}), "autochunk_plan_id": self.plan.id}
            )
            final_nodes.append(node)

        return final_nodes

    def get_nodes_from_documents(self, documents: List[Document], **kwargs: Any) -> List[BaseNode]:
        try:
            from llama_index.core.schema import TextNode
        except ImportError:
            raise ImportError("Please install llama-index-core: pip install llama-index-core")

        # Convert LlamaIndex docs to AutoChunks format
        ac_docs = []
        for d in documents:
            ac_docs.append({
                "id": d.doc_id,
                "text": d.get_content(),
                "metadata": d.metadata
            })

        # Run the execution pipeline
        gen_name = self.plan.generator_pipeline.get("name")
        params = self.plan.generator_pipeline.get("params", {})

        ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)

        # Convert back to LlamaIndex Nodes
        nodes = []
        for ch in ac_chunks:
            node = TextNode(
                text=ch["text"],
                metadata={**ch.get("meta", {}), "autochunk_plan_id": self.plan.id}
            )
            nodes.append(node)

        return nodes
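And a matching sketch for LlamaIndex, assuming `llama-index-core` is installed and using the same placeholder plan path; `get_nodes_from_documents` is the entry point defined above.

```python
from llama_index.core import Document
from autochunk import AutoChunkLlamaIndexAdapter

# "plans/best.json" is a placeholder for a saved AutoChunks plan.
parser = AutoChunkLlamaIndexAdapter("plans/best.json")

docs = [Document(text="A long source text ...", metadata={"source": "spec.html"})]
nodes = parser.get_nodes_from_documents(docs)

for n in nodes:
    print(n.metadata.get("autochunk_plan_id"), len(n.get_content()))
```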