haiku.rag-slim 0.16.1__tar.gz → 0.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag-slim might be problematic. Click here for more details.
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/PKG-INFO +1 -2
- haiku_rag_slim-0.17.1/haiku/rag/chunkers/__init__.py +31 -0
- haiku_rag_slim-0.17.1/haiku/rag/chunkers/base.py +28 -0
- haiku_rag_slim-0.17.1/haiku/rag/chunkers/docling_local.py +110 -0
- haiku_rag_slim-0.17.1/haiku/rag/chunkers/docling_serve.py +111 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/client.py +22 -26
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/__init__.py +2 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/models.py +32 -0
- haiku_rag_slim-0.17.1/haiku/rag/converters/__init__.py +31 -0
- haiku_rag_slim-0.17.1/haiku/rag/converters/base.py +57 -0
- haiku_rag_slim-0.17.1/haiku/rag/converters/docling_local.py +154 -0
- haiku_rag_slim-0.17.1/haiku/rag/converters/docling_serve.py +199 -0
- haiku_rag_slim-0.17.1/haiku/rag/converters/text_utils.py +117 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/monitor.py +21 -9
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/chunk.py +6 -3
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/utils.py +6 -69
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/pyproject.toml +1 -2
- haiku_rag_slim-0.16.1/haiku/rag/chunker.py +0 -65
- haiku_rag_slim-0.16.1/haiku/rag/reader.py +0 -135
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/.gitignore +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/LICENSE +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/README.md +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/app.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/cli.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/loader.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/base.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/vllm.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/cli_renderer.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/emitter.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/events.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/server.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/state.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/stream.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/models.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/nodes.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/prompts.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/utils.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/dependencies.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/graph.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/models.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/prompts.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/state.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/common.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/dependencies.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/graph.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/models.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/prompts.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/state.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/logging.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/mcp.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/qa/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/qa/agent.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/qa/prompts.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/base.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/cohere.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/mxbai.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/vllm.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/zeroentropy.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/engine.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/models/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/models/chunk.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/models/document.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/document.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/settings.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/upgrades/__init__.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/upgrades/v0_10_1.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/upgrades/v0_9_3.py +0 -0
- {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/test_agui_server.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: haiku.rag-slim
|
|
3
|
-
Version: 0.16.1
|
|
3
|
+
Version: 0.17.1
|
|
4
4
|
Summary: Agentic Retrieval Augmented Generation (RAG) with LanceDB - Minimal dependencies
|
|
5
5
|
Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -26,7 +26,6 @@ Requires-Dist: pydantic>=2.12.3
|
|
|
26
26
|
Requires-Dist: python-dotenv>=1.2.1
|
|
27
27
|
Requires-Dist: pyyaml>=6.0.3
|
|
28
28
|
Requires-Dist: rich>=14.2.0
|
|
29
|
-
Requires-Dist: tiktoken>=0.12.0
|
|
30
29
|
Requires-Dist: typer<0.20.0,>=0.19.2
|
|
31
30
|
Requires-Dist: watchfiles>=1.1.1
|
|
32
31
|
Provides-Extra: anthropic
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Document chunker abstraction for haiku.rag."""
|
|
2
|
+
|
|
3
|
+
from haiku.rag.chunkers.base import DocumentChunker
|
|
4
|
+
from haiku.rag.config import AppConfig, Config
|
|
5
|
+
|
|
6
|
+
__all__ = ["DocumentChunker", "get_chunker"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_chunker(config: AppConfig = Config) -> DocumentChunker:
    """Build the document chunker selected by ``config.processing.chunker``.

    The concrete chunker module is imported only inside the branch that
    selects it, so the other implementation is never imported.

    Args:
        config: Configuration to read the chunker choice from. Defaults to
            the global Config.

    Returns:
        A DocumentChunker constructed with ``config``.

    Raises:
        ValueError: If ``config.processing.chunker`` is neither
            "docling-local" nor "docling-serve".
    """
    provider = config.processing.chunker

    if provider == "docling-local":
        from haiku.rag.chunkers.docling_local import DoclingLocalChunker

        return DoclingLocalChunker(config)
    elif provider == "docling-serve":
        from haiku.rag.chunkers.docling_serve import DoclingServeChunker

        return DoclingServeChunker(config)

    raise ValueError(f"Unsupported chunker: {provider}")
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DocumentChunker(ABC):
    """Interface that every document chunker implements.

    A chunker takes a DoclingDocument and splits it into smaller pieces of
    text suitable for embedding and retrieval, respecting document structure
    and semantic boundaries.
    """

    @abstractmethod
    async def chunk(self, document: "DoclingDocument") -> list[str]:
        """Split ``document`` into text chunks.

        Args:
            document: The DoclingDocument to chunk.

        Returns:
            The text chunks, with semantic boundaries preserved.

        Raises:
            ValueError: If chunking fails.
        """
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
from haiku.rag.chunkers.base import DocumentChunker
|
|
4
|
+
from haiku.rag.config import AppConfig, Config
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _create_markdown_serializer_provider(use_markdown_tables: bool = True):
    """Build a serializer provider whose table rendering is configurable.

    The provider class is defined inside this factory so that docling-core
    is imported only when the factory is called, not at module import time.

    Args:
        use_markdown_tables: When True, tables are serialized with
            MarkdownTableSerializer. When False, ChunkingDocSerializer is
            built without a table serializer and uses its own default
            (TripletTableSerializer, narrative format).

    Returns:
        A ChunkingSerializerProvider subclass instance carrying the flag.
    """
    from docling_core.transforms.chunker.hierarchical_chunker import (
        ChunkingDocSerializer,
        ChunkingSerializerProvider,
    )
    from docling_core.transforms.serializer.markdown import MarkdownTableSerializer

    class MDTableSerializerProvider(ChunkingSerializerProvider):
        """Serializer provider for markdown table output."""

        def __init__(self, use_markdown_tables: bool = True):
            self.use_markdown_tables = use_markdown_tables

        def get_serializer(self, doc):
            # Fall back to the stock serializer when markdown tables are off.
            if not self.use_markdown_tables:
                return ChunkingDocSerializer(doc=doc)

            return ChunkingDocSerializer(
                doc=doc,
                table_serializer=MarkdownTableSerializer(),
            )

    provider = MDTableSerializerProvider(use_markdown_tables=use_markdown_tables)
    return provider
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DoclingLocalChunker(DocumentChunker):
    """Chunker that runs docling's chunkers in-process.

    Two strategies are available, selected by ``config.processing.chunker_type``:
    "hybrid" (token-limited, using the HuggingFace tokenizer named by
    ``config.processing.chunking_tokenizer``) and "hierarchical".

    Args:
        config: Application configuration.

    Raises:
        ValueError: If ``chunker_type`` is neither "hybrid" nor "hierarchical".
    """

    def __init__(self, config: AppConfig = Config):
        from docling_core.transforms.chunker.hierarchical_chunker import (
            HierarchicalChunker,
        )
        from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
        from docling_core.transforms.chunker.tokenizer.huggingface import (
            HuggingFaceTokenizer,
        )
        from transformers import AutoTokenizer

        processing = config.processing

        self.config = config
        self.chunk_size = processing.chunk_size
        self.chunker_type = processing.chunker_type
        self.tokenizer_name = processing.chunking_tokenizer

        # Reject unknown strategies before building anything.
        if self.chunker_type not in ("hybrid", "hierarchical"):
            raise ValueError(
                f"Unsupported chunker_type: {self.chunker_type}. "
                "Must be 'hybrid' or 'hierarchical'."
            )

        # Both strategies share the same table-serialization setting.
        provider = _create_markdown_serializer_provider(
            use_markdown_tables=processing.chunking_use_markdown_tables
        )

        if self.chunker_type == "hierarchical":
            self.chunker = HierarchicalChunker(serializer_provider=provider)
            return

        # Hybrid strategy: wrap the HuggingFace tokenizer with the token limit.
        pretrained = AutoTokenizer.from_pretrained(self.tokenizer_name)
        bounded_tokenizer = HuggingFaceTokenizer(
            tokenizer=pretrained, max_tokens=self.chunk_size
        )
        self.chunker = HybridChunker(
            tokenizer=bounded_tokenizer,
            merge_peers=processing.chunking_merge_peers,
            serializer_provider=provider,
        )

    async def chunk(self, document: "DoclingDocument") -> list[str]:
        """Split ``document`` into chunks with the configured docling chunker.

        Args:
            document: The DoclingDocument to be split into chunks.

        Returns:
            One contextualized text string per chunk; an empty list when
            ``document`` is None.
        """
        if document is None:
            return []

        # Materialize every chunk first, then contextualize each in order.
        pieces = list(self.chunker.chunk(document))
        texts = []
        for piece in pieces:
            texts.append(self.chunker.contextualize(piece))
        return texts
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
from haiku.rag.chunkers.base import DocumentChunker
|
|
7
|
+
from haiku.rag.config import AppConfig, Config
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DoclingServeChunker(DocumentChunker):
    """Remote document chunker using the docling-serve API.

    Sends the DoclingDocument as JSON to docling-serve for chunking. The
    endpoint is chosen from ``config.processing.chunker_type``: "hierarchical"
    uses the hierarchical endpoint, any other value uses the hybrid endpoint.

    Args:
        config: Application configuration containing docling-serve settings
            (``providers.docling_serve``) and chunking parameters
            (``processing``).
    """

    def __init__(self, config: AppConfig = Config):
        self.config = config
        # Strip trailing slashes so endpoint paths can be appended directly.
        self.base_url = config.providers.docling_serve.base_url.rstrip("/")
        self.api_key = config.providers.docling_serve.api_key
        self.timeout = config.providers.docling_serve.timeout
        self.chunker_type = config.processing.chunker_type

    async def chunk(self, document: "DoclingDocument") -> list[str]:
        """Split the document into chunks via docling-serve.

        Exports the DoclingDocument to JSON, uploads it as a multipart file to
        the chunking endpoint, and returns the ``text`` of each entry in the
        response's ``chunks`` list.

        The HTTP request runs in a worker thread so the blocking ``requests``
        call does not stall the event loop while waiting on the service.

        Args:
            document: The DoclingDocument to be split into chunks.

        Returns:
            A list of text chunks; an empty list when ``document`` is None.

        Raises:
            ValueError: If the service is unreachable, times out, rejects the
                request, or returns an unexpected payload. The original
                exception is attached as ``__cause__``.
        """
        # Imported here so this block does not depend on a file-level import.
        import asyncio

        if document is None:
            return []

        try:
            # Determine endpoint based on chunker_type; anything other than
            # "hierarchical" goes to the hybrid endpoint.
            if self.chunker_type == "hierarchical":
                url = f"{self.base_url}/v1/chunk/hierarchical/file"
            else:
                url = f"{self.base_url}/v1/chunk/hybrid/file"

            # Export document to JSON
            doc_json = document.model_dump_json()
            doc_bytes = doc_json.encode("utf-8")

            # Prepare multipart request with DoclingDocument JSON
            files = {"files": ("document.json", BytesIO(doc_bytes), "application/json")}

            # Build form data with chunking parameters; form values are sent
            # as strings, booleans lower-cased ("true"/"false").
            data = {
                "chunking_max_tokens": str(self.config.processing.chunk_size),
                "chunking_tokenizer": self.config.processing.chunking_tokenizer,
                "chunking_merge_peers": str(
                    self.config.processing.chunking_merge_peers
                ).lower(),
                "chunking_use_markdown_tables": str(
                    self.config.processing.chunking_use_markdown_tables
                ).lower(),
            }

            headers = {}
            if self.api_key:
                headers["X-Api-Key"] = self.api_key

            # requests is synchronous: run it off the event loop thread.
            response = await asyncio.to_thread(
                requests.post,
                url,
                files=files,
                data=data,
                headers=headers,
                timeout=self.timeout,
            )

            response.raise_for_status()

            result = response.json()

            # Extract text from chunks
            chunks = result.get("chunks", [])
            return [chunk["text"] for chunk in chunks]

        except requests.exceptions.ConnectionError as e:
            raise ValueError(
                f"Could not connect to docling-serve at {self.base_url}. "
                f"Ensure the service is running and accessible. Error: {e}"
            ) from e
        except requests.exceptions.Timeout as e:
            raise ValueError(
                f"Request to docling-serve timed out after {self.timeout}s. "
                f"Consider increasing the timeout in configuration. Error: {e}"
            ) from e
        except requests.exceptions.HTTPError as e:
            # e.response may be None if the error was raised without one.
            if e.response is not None and e.response.status_code == 401:
                raise ValueError(
                    "Authentication failed. Check your API key configuration."
                ) from e
            raise ValueError(f"HTTP error from docling-serve: {e}") from e
        except Exception as e:
            # Boundary catch: callers are documented to receive ValueError for
            # any chunking failure (e.g. malformed JSON or missing "text").
            raise ValueError(f"Failed to chunk via docling-serve: {e}") from e
|
|
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
|
|
|
9
9
|
import httpx
|
|
10
10
|
|
|
11
11
|
from haiku.rag.config import AppConfig, Config
|
|
12
|
+
from haiku.rag.converters import get_converter
|
|
12
13
|
from haiku.rag.reranking import get_reranker
|
|
13
14
|
from haiku.rag.store.engine import Store
|
|
14
15
|
from haiku.rag.store.models.chunk import Chunk
|
|
@@ -111,10 +112,9 @@ class HaikuRAG:
|
|
|
111
112
|
|
|
112
113
|
# Only create docling_document if we need to generate chunks
|
|
113
114
|
if chunks is None:
|
|
114
|
-
#
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
docling_document = text_to_docling_document(content)
|
|
115
|
+
# Use converter to convert text
|
|
116
|
+
converter = get_converter(self._config)
|
|
117
|
+
docling_document = converter.convert_text(content)
|
|
118
118
|
else:
|
|
119
119
|
# Chunks already provided, no conversion needed
|
|
120
120
|
docling_document = None
|
|
@@ -201,12 +201,10 @@ class HaikuRAG:
|
|
|
201
201
|
Raises:
|
|
202
202
|
ValueError: If the file cannot be parsed or doesn't exist
|
|
203
203
|
"""
|
|
204
|
-
# Lazy import to avoid loading docling
|
|
205
|
-
from haiku.rag.reader import FileReader
|
|
206
|
-
|
|
207
204
|
metadata = metadata or {}
|
|
208
205
|
|
|
209
|
-
|
|
206
|
+
converter = get_converter(self._config)
|
|
207
|
+
if source_path.suffix.lower() not in converter.supported_extensions:
|
|
210
208
|
raise ValueError(f"Unsupported file extension: {source_path.suffix}")
|
|
211
209
|
|
|
212
210
|
if not source_path.exists():
|
|
@@ -242,7 +240,8 @@ class HaikuRAG:
|
|
|
242
240
|
return existing_doc
|
|
243
241
|
|
|
244
242
|
# Parse file only when content changed or new document
|
|
245
|
-
|
|
243
|
+
converter = get_converter(self._config)
|
|
244
|
+
docling_document = converter.convert_file(source_path)
|
|
246
245
|
|
|
247
246
|
if existing_doc:
|
|
248
247
|
# Update existing document
|
|
@@ -283,11 +282,11 @@ class HaikuRAG:
|
|
|
283
282
|
ValueError: If the content cannot be parsed
|
|
284
283
|
httpx.RequestError: If URL request fails
|
|
285
284
|
"""
|
|
286
|
-
# Lazy import to avoid loading docling
|
|
287
|
-
from haiku.rag.reader import FileReader
|
|
288
|
-
|
|
289
285
|
metadata = metadata or {}
|
|
290
286
|
|
|
287
|
+
converter = get_converter(self._config)
|
|
288
|
+
supported_extensions = converter.supported_extensions
|
|
289
|
+
|
|
291
290
|
async with httpx.AsyncClient() as client:
|
|
292
291
|
response = await client.get(url)
|
|
293
292
|
response.raise_for_status()
|
|
@@ -320,7 +319,7 @@ class HaikuRAG:
|
|
|
320
319
|
url, content_type
|
|
321
320
|
)
|
|
322
321
|
|
|
323
|
-
if file_extension not in
|
|
322
|
+
if file_extension not in supported_extensions:
|
|
324
323
|
raise ValueError(
|
|
325
324
|
f"Unsupported content type/extension: {content_type}/{file_extension}"
|
|
326
325
|
)
|
|
@@ -333,8 +332,8 @@ class HaikuRAG:
|
|
|
333
332
|
temp_file.flush() # Ensure content is written to disk
|
|
334
333
|
temp_path = Path(temp_file.name)
|
|
335
334
|
|
|
336
|
-
# Parse the content using
|
|
337
|
-
docling_document =
|
|
335
|
+
# Parse the content using converter
|
|
336
|
+
docling_document = converter.convert_file(temp_path)
|
|
338
337
|
|
|
339
338
|
# Merge metadata with contentType and md5
|
|
340
339
|
metadata.update({"contentType": content_type, "md5": md5_hash})
|
|
@@ -410,11 +409,9 @@ class HaikuRAG:
|
|
|
410
409
|
|
|
411
410
|
async def update_document(self, document: Document) -> Document:
|
|
412
411
|
"""Update an existing document."""
|
|
413
|
-
# Lazy import to avoid loading docling
|
|
414
|
-
from haiku.rag.utils import text_to_docling_document
|
|
415
|
-
|
|
416
412
|
# Convert content to DoclingDocument
|
|
417
|
-
|
|
413
|
+
converter = get_converter(self._config)
|
|
414
|
+
docling_document = converter.convert_text(document.content)
|
|
418
415
|
|
|
419
416
|
return await self.document_repository._update_and_rechunk(
|
|
420
417
|
document, docling_document
|
|
@@ -469,8 +466,8 @@ class HaikuRAG:
|
|
|
469
466
|
# No reranking - return direct search results
|
|
470
467
|
return await self.chunk_repository.search(query, limit, search_type, filter)
|
|
471
468
|
|
|
472
|
-
# Get more initial results (
|
|
473
|
-
search_limit = limit *
|
|
469
|
+
# Get more initial results (10X) for reranking
|
|
470
|
+
search_limit = limit * 10
|
|
474
471
|
search_results = await self.chunk_repository.search(
|
|
475
472
|
query, search_limit, search_type, filter
|
|
476
473
|
)
|
|
@@ -646,12 +643,11 @@ class HaikuRAG:
|
|
|
646
643
|
Yields:
|
|
647
644
|
int: The ID of the document currently being processed
|
|
648
645
|
"""
|
|
649
|
-
# Lazy import to avoid loading docling
|
|
650
|
-
from haiku.rag.utils import text_to_docling_document
|
|
651
|
-
|
|
652
646
|
await self.chunk_repository.delete_all()
|
|
653
647
|
self.store.recreate_embeddings_table()
|
|
654
648
|
|
|
649
|
+
converter = get_converter(self._config)
|
|
650
|
+
|
|
655
651
|
# Update settings to current config
|
|
656
652
|
settings_repo = SettingsRepository(self.store)
|
|
657
653
|
settings_repo.save_current_settings()
|
|
@@ -703,14 +699,14 @@ class HaikuRAG:
|
|
|
703
699
|
logger.warning(
|
|
704
700
|
"Source missing for %s, re-embedding from content", doc.uri
|
|
705
701
|
)
|
|
706
|
-
docling_document =
|
|
702
|
+
docling_document = converter.convert_text(doc.content)
|
|
707
703
|
await self.chunk_repository.create_chunks_for_document(
|
|
708
704
|
doc.id, docling_document
|
|
709
705
|
)
|
|
710
706
|
yield doc.id
|
|
711
707
|
else:
|
|
712
708
|
# Document without URI - re-create chunks from existing content
|
|
713
|
-
docling_document =
|
|
709
|
+
docling_document = converter.convert_text(doc.content)
|
|
714
710
|
await self.chunk_repository.create_chunks_for_document(
|
|
715
711
|
doc.id, docling_document
|
|
716
712
|
)
|
|
@@ -8,6 +8,7 @@ from haiku.rag.config.loader import (
|
|
|
8
8
|
from haiku.rag.config.models import (
|
|
9
9
|
AGUIConfig,
|
|
10
10
|
AppConfig,
|
|
11
|
+
ConversionOptions,
|
|
11
12
|
EmbeddingsConfig,
|
|
12
13
|
LanceDBConfig,
|
|
13
14
|
MonitorConfig,
|
|
@@ -25,6 +26,7 @@ __all__ = [
|
|
|
25
26
|
"Config",
|
|
26
27
|
"AGUIConfig",
|
|
27
28
|
"AppConfig",
|
|
29
|
+
"ConversionOptions",
|
|
28
30
|
"StorageConfig",
|
|
29
31
|
"MonitorConfig",
|
|
30
32
|
"LanceDBConfig",
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
+
from typing import Literal
|
|
2
3
|
|
|
3
4
|
from pydantic import BaseModel, Field
|
|
4
5
|
|
|
@@ -50,10 +51,34 @@ class ResearchConfig(BaseModel):
|
|
|
50
51
|
max_concurrency: int = 1
|
|
51
52
|
|
|
52
53
|
|
|
54
|
+
class ConversionOptions(BaseModel):
|
|
55
|
+
"""Options for document conversion."""
|
|
56
|
+
|
|
57
|
+
# OCR options
|
|
58
|
+
do_ocr: bool = True
|
|
59
|
+
force_ocr: bool = False
|
|
60
|
+
ocr_lang: list[str] = []
|
|
61
|
+
|
|
62
|
+
# Table options
|
|
63
|
+
do_table_structure: bool = True
|
|
64
|
+
table_mode: Literal["fast", "accurate"] = "accurate"
|
|
65
|
+
table_cell_matching: bool = True
|
|
66
|
+
|
|
67
|
+
# Image options
|
|
68
|
+
images_scale: float = 2.0
|
|
69
|
+
|
|
70
|
+
|
|
53
71
|
class ProcessingConfig(BaseModel):
|
|
54
72
|
chunk_size: int = 256
|
|
55
73
|
context_chunk_radius: int = 0
|
|
56
74
|
markdown_preprocessor: str = ""
|
|
75
|
+
converter: str = "docling-local"
|
|
76
|
+
chunker: str = "docling-local"
|
|
77
|
+
chunker_type: str = "hybrid"
|
|
78
|
+
chunking_tokenizer: str = "Qwen/Qwen3-Embedding-0.6B"
|
|
79
|
+
chunking_merge_peers: bool = True
|
|
80
|
+
chunking_use_markdown_tables: bool = False
|
|
81
|
+
conversion_options: ConversionOptions = Field(default_factory=ConversionOptions)
|
|
57
82
|
|
|
58
83
|
|
|
59
84
|
class OllamaConfig(BaseModel):
|
|
@@ -71,9 +96,16 @@ class VLLMConfig(BaseModel):
|
|
|
71
96
|
research_base_url: str = ""
|
|
72
97
|
|
|
73
98
|
|
|
99
|
+
class DoclingServeConfig(BaseModel):
|
|
100
|
+
base_url: str = "http://localhost:5001"
|
|
101
|
+
api_key: str = ""
|
|
102
|
+
timeout: int = 300
|
|
103
|
+
|
|
104
|
+
|
|
74
105
|
class ProvidersConfig(BaseModel):
|
|
75
106
|
ollama: OllamaConfig = Field(default_factory=OllamaConfig)
|
|
76
107
|
vllm: VLLMConfig = Field(default_factory=VLLMConfig)
|
|
108
|
+
docling_serve: DoclingServeConfig = Field(default_factory=DoclingServeConfig)
|
|
77
109
|
|
|
78
110
|
|
|
79
111
|
class AGUIConfig(BaseModel):
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Document converter abstraction for haiku.rag."""
|
|
2
|
+
|
|
3
|
+
from haiku.rag.config import AppConfig, Config
|
|
4
|
+
from haiku.rag.converters.base import DocumentConverter
|
|
5
|
+
|
|
6
|
+
__all__ = ["DocumentConverter", "get_converter"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_converter(config: AppConfig = Config) -> DocumentConverter:
    """Build the document converter selected by ``config.processing.converter``.

    The concrete converter module is imported only inside the branch that
    selects it, so the other implementation is never imported.

    Args:
        config: Configuration to read the converter choice from. Defaults to
            the global Config.

    Returns:
        A DocumentConverter constructed with ``config``.

    Raises:
        ValueError: If ``config.processing.converter`` is neither
            "docling-local" nor "docling-serve".
    """
    provider = config.processing.converter

    if provider == "docling-local":
        from haiku.rag.converters.docling_local import DoclingLocalConverter

        return DoclingLocalConverter(config)
    elif provider == "docling-serve":
        from haiku.rag.converters.docling_serve import DoclingServeConverter

        return DoclingServeConverter(config)

    raise ValueError(f"Unsupported converter provider: {provider}")
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Base class for document converters."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DocumentConverter(ABC):
    """Interface that every document converter implements.

    A converter turns source material in various document formats (PDF,
    DOCX, HTML, etc.), or raw text, into a DoclingDocument for further
    processing.
    """

    @property
    @abstractmethod
    def supported_extensions(self) -> list[str]:
        """File extensions this converter accepts.

        Returns:
            Extensions including the leading dot, e.g. [".pdf", ".docx"].
        """

    @abstractmethod
    def convert_file(self, path: Path) -> "DoclingDocument":
        """Convert the file at ``path`` to a DoclingDocument.

        Args:
            path: Path to the file to convert.

        Returns:
            DoclingDocument representation of the file.

        Raises:
            ValueError: If the file cannot be converted.
        """

    @abstractmethod
    def convert_text(self, text: str, name: str = "content.md") -> "DoclingDocument":
        """Convert text content to a DoclingDocument.

        Args:
            text: The text content to convert.
            name: The name to use for the document (defaults to "content.md").

        Returns:
            DoclingDocument representation of the text.

        Raises:
            ValueError: If the text cannot be converted.
        """
|