autochunks 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61):
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/__init__.py ADDED
@@ -0,0 +1,9 @@
"""AutoChunks package root: configures warning filters and re-exports the public API."""
import warnings

# Silence the Pydantic v2 "protected namespace" warnings that docling models trigger.
warnings.filterwarnings("ignore", message='.*conflict with protected namespace "model_".*', category=UserWarning)

from .autochunker import AutoChunker
from .embedding.adapter import EmbeddingFn
from .config import (
    AutoChunkConfig,
    EvalConfig,
    ProxyConfig,
    RetrievalStrategy,
    SafetyConstraints,
    ParallelConfig,
    TokenizerConfig,
    NetworkConfig,
    RagasConfig,
)
from .adapters import AutoChunkLangChainAdapter, AutoChunkLlamaIndexAdapter, AutoChunkHaystackAdapter
from .storage.plan import Plan
autochunk/__main__.py ADDED
@@ -0,0 +1,5 @@
"""Module entry point so that ``python -m autochunk`` launches the CLI."""

from .cli import main

if __name__ == "__main__":
    main()
@@ -0,0 +1,3 @@
1
+ from .langchain import AutoChunkLangChainAdapter
2
+ from .llamaindex import AutoChunkLlamaIndexAdapter
3
+ from .haystack import AutoChunkHaystackAdapter
@@ -0,0 +1,68 @@
1
+
2
+ from __future__ import annotations
3
+ from typing import List, Dict, Any, Optional, Union
4
+ from ..storage.plan import Plan
5
+ from ..autochunker import AutoChunker
6
+
7
+ try:
8
+ from haystack import component, Document
9
+ HAYSTACK_AVAILABLE = True
10
+ except ImportError:
11
+ # Robust fallback for environment without Haystack
12
+ def component(cls): return cls
13
+ def output_types(**kwargs):
14
+ def decorator(func): return func
15
+ return decorator
16
+ component.output_types = output_types
17
+ class Document: pass
18
+ HAYSTACK_AVAILABLE = False
19
+
20
+ @component
21
+ class AutoChunkHaystackAdapter:
22
+ """
23
+ Official AutoChunks Adapter for Haystack 2.0.
24
+ Acts as a Pipeline Component for optimized document splitting.
25
+ """
26
+ def __init__(self, plan: Union[Plan, str]):
27
+ if isinstance(plan, str):
28
+ self.plan = Plan.read(plan)
29
+ else:
30
+ self.plan = plan
31
+
32
+ # Initialize internal engine
33
+ self.chunker = AutoChunker(
34
+ embedding_provider=self.plan.embedding.get("name"),
35
+ embedding_model_or_path=self.plan.embedding.get("model")
36
+ )
37
+
38
+ @component.output_types(documents=List[Document])
39
+ def run(self, documents: List[Document]):
40
+ """
41
+ Implementation of the Haystack Component interface.
42
+ """
43
+ if not HAYSTACK_AVAILABLE:
44
+ raise ImportError("Please install haystack-ai: pip install haystack-ai")
45
+
46
+ # Convert Haystack docs to AutoChunks format
47
+ ac_docs = []
48
+ for d in documents:
49
+ ac_docs.append({
50
+ "id": str(getattr(d, "id", hash(d.content))),
51
+ "text": d.content,
52
+ "metadata": d.meta
53
+ })
54
+
55
+ # Process via pipeline
56
+ gen_name = self.plan.generator_pipeline.get("name")
57
+ params = self.plan.generator_pipeline.get("params", {})
58
+ ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)
59
+
60
+ # Re-wrap as Haystack Documents
61
+ return {
62
+ "documents": [
63
+ Document(
64
+ content=ch["text"],
65
+ meta={**ch.get("meta", {}), "autochunk_plan_id": self.plan.id}
66
+ ) for ch in ac_chunks
67
+ ]
68
+ }
@@ -0,0 +1,81 @@
1
+
2
+ from __future__ import annotations
3
+ from typing import List, Dict, Any, TYPE_CHECKING, Union
4
+ from ..storage.plan import Plan
5
+ from ..autochunker import AutoChunker, AutoChunkConfig
6
+
7
+ if TYPE_CHECKING:
8
+ from langchain_core.documents import Document
9
+
10
+ try:
11
+ from langchain_core.documents import BaseDocumentTransformer, Document
12
+ LANGCHAIN_AVAILABLE = True
13
+ except ImportError:
14
+ class BaseDocumentTransformer: pass
15
+ LANGCHAIN_AVAILABLE = False
16
+
17
+ class AutoChunkLangChainAdapter(BaseDocumentTransformer):
18
+ """
19
+ Official AutoChunks Adapter for LangChain.
20
+ Inherits from BaseDocumentTransformer for seamless integration
21
+ into LangChain Indexing and LCEL pipelines.
22
+ """
23
+ def __init__(self, plan: Union[Plan, str], config: AutoChunkConfig = None):
24
+ if isinstance(plan, str):
25
+ self.plan = Plan.read(plan)
26
+ else:
27
+ self.plan = plan
28
+
29
+ # We use a configured AutoChunker to execute the plan
30
+ self.chunker = AutoChunker(
31
+ embedding_provider=self.plan.embedding.get("name"),
32
+ embedding_model_or_path=self.plan.embedding.get("model")
33
+ )
34
+
35
+ def transform_documents(self, documents: List[Document], **kwargs: Any) -> List[Document]:
36
+ """
37
+ Apply the optimized AutoChunks plan to a list of LangChain documents.
38
+ This processes ALL documents provided.
39
+ """
40
+ try:
41
+ from langchain_core.documents import Document
42
+ except ImportError:
43
+ raise ImportError("Please install langchain-core: pip install langchain-core")
44
+
45
+ # Convert LangChain docs to AutoChunks format
46
+ ac_docs = []
47
+ for d in documents:
48
+ # We use metadata.get('source', id(d)) as a unique doc_id
49
+ doc_id = str(d.metadata.get("source", id(d)))
50
+ ac_docs.append({
51
+ "id": doc_id,
52
+ "text": d.page_content,
53
+ "metadata": d.metadata
54
+ })
55
+
56
+ # Run the execution pipeline
57
+ gen_name = self.plan.generator_pipeline.get("name")
58
+ params = self.plan.generator_pipeline.get("params", {})
59
+
60
+ ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)
61
+
62
+ # Convert back to LangChain docs
63
+ lc_docs = []
64
+ for ch in ac_chunks:
65
+ # Preserve original metadata and add chunking metadata
66
+ meta = ch.get("meta", {}).copy()
67
+ # If original metadata was passed through, it might be nested or direct
68
+ # For now, we assume simple merger
69
+ lc_docs.append(Document(
70
+ page_content=ch["text"],
71
+ metadata={**meta, "autochunk_plan_id": self.plan.id}
72
+ ))
73
+
74
+ return lc_docs
75
+
76
+ def split_documents(self, documents: List[Document]) -> List[Document]:
77
+ """Alias for transform_documents to match TextSplitter interface."""
78
+ return self.transform_documents(documents)
79
+
80
+ def __call__(self, documents: List[Document]) -> List[Document]:
81
+ return self.transform_documents(documents)
@@ -0,0 +1,94 @@
1
+
2
+ from __future__ import annotations
3
+ from typing import List, Dict, Any, TYPE_CHECKING, Union
4
+ from ..storage.plan import Plan
5
+ from ..autochunker import AutoChunker
6
+
7
+ if TYPE_CHECKING:
8
+ from llama_index.core.schema import BaseNode, Document
9
+
10
+ try:
11
+ from llama_index.core.node_parser import NodeParser, BaseNodeParser
12
+ from llama_index.core.schema import TextNode, BaseNode, Document
13
+ LLAMA_INDEX_AVAILABLE = True
14
+ except ImportError:
15
+ class BaseNodeParser: pass
16
+ LLAMA_INDEX_AVAILABLE = False
17
+
18
+ class AutoChunkLlamaIndexAdapter(BaseNodeParser):
19
+ """
20
+ Official AutoChunks Adapter for LlamaIndex.
21
+ Acts as a native NodeParser for seamless integration into IngestionPipelines.
22
+ """
23
+ def __init__(self, plan: Union[Plan, str]):
24
+ if isinstance(plan, str):
25
+ self.plan = Plan.read(plan)
26
+ else:
27
+ self.plan = plan
28
+
29
+ self.chunker = AutoChunker(
30
+ embedding_provider=self.plan.embedding.get("name"),
31
+ embedding_model_or_path=self.plan.embedding.get("model")
32
+ )
33
+
34
+ def _parse_nodes(self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any) -> List[BaseNode]:
35
+ """
36
+ Internal implementation for LlamaIndex BaseNodeParser.
37
+ """
38
+ # Convert Nodes to AutoChunks format
39
+ ac_docs = []
40
+ for n in nodes:
41
+ ac_docs.append({
42
+ "id": n.node_id,
43
+ "text": n.get_content(),
44
+ "metadata": n.metadata
45
+ })
46
+
47
+ # Run the execution pipeline
48
+ gen_name = self.plan.generator_pipeline.get("name")
49
+ params = self.plan.generator_pipeline.get("params", {})
50
+
51
+ ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)
52
+
53
+ # Convert back to LlamaIndex Nodes
54
+ final_nodes = []
55
+ for ch in ac_chunks:
56
+ node = TextNode(
57
+ text=ch["text"],
58
+ metadata={**ch.get("meta", {}), "autochunk_plan_id": self.plan.id}
59
+ )
60
+ final_nodes.append(node)
61
+
62
+ return final_nodes
63
+
64
+ def get_nodes_from_documents(self, documents: List[Document], **kwargs: Any) -> List[BaseNode]:
65
+ try:
66
+ from llama_index.core.schema import TextNode
67
+ except ImportError:
68
+ raise ImportError("Please install llama-index-core: pip install llama-index-core")
69
+
70
+ # Convert LlamaIndex docs to AutoChunks format
71
+ ac_docs = []
72
+ for d in documents:
73
+ ac_docs.append({
74
+ "id": d.doc_id,
75
+ "text": d.get_content(),
76
+ "metadata": d.metadata
77
+ })
78
+
79
+ # Run the execution pipeline
80
+ gen_name = self.plan.generator_pipeline.get("name")
81
+ params = self.plan.generator_pipeline.get("params", {})
82
+
83
+ ac_chunks = self.chunker.apply_with_generator(ac_docs, gen_name, params)
84
+
85
+ # Convert back to LlamaIndex Nodes
86
+ nodes = []
87
+ for ch in ac_chunks:
88
+ node = TextNode(
89
+ text=ch["text"],
90
+ metadata={**ch.get("meta", {}), "autochunk_plan_id": self.plan.id}
91
+ )
92
+ nodes.append(node)
93
+
94
+ return nodes