haiku.rag-slim 0.16.1__tar.gz → 0.17.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag-slim might be problematic. Click here for more details.

Files changed (81)
  1. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/PKG-INFO +1 -2
  2. haiku_rag_slim-0.17.1/haiku/rag/chunkers/__init__.py +31 -0
  3. haiku_rag_slim-0.17.1/haiku/rag/chunkers/base.py +28 -0
  4. haiku_rag_slim-0.17.1/haiku/rag/chunkers/docling_local.py +110 -0
  5. haiku_rag_slim-0.17.1/haiku/rag/chunkers/docling_serve.py +111 -0
  6. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/client.py +22 -26
  7. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/__init__.py +2 -0
  8. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/models.py +32 -0
  9. haiku_rag_slim-0.17.1/haiku/rag/converters/__init__.py +31 -0
  10. haiku_rag_slim-0.17.1/haiku/rag/converters/base.py +57 -0
  11. haiku_rag_slim-0.17.1/haiku/rag/converters/docling_local.py +154 -0
  12. haiku_rag_slim-0.17.1/haiku/rag/converters/docling_serve.py +199 -0
  13. haiku_rag_slim-0.17.1/haiku/rag/converters/text_utils.py +117 -0
  14. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/monitor.py +21 -9
  15. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/chunk.py +6 -3
  16. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/utils.py +6 -69
  17. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/pyproject.toml +1 -2
  18. haiku_rag_slim-0.16.1/haiku/rag/chunker.py +0 -65
  19. haiku_rag_slim-0.16.1/haiku/rag/reader.py +0 -135
  20. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/.gitignore +0 -0
  21. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/LICENSE +0 -0
  22. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/README.md +0 -0
  23. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/__init__.py +0 -0
  24. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/app.py +0 -0
  25. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/cli.py +0 -0
  26. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/loader.py +0 -0
  27. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/__init__.py +0 -0
  28. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/base.py +0 -0
  29. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/ollama.py +0 -0
  30. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/openai.py +0 -0
  31. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/vllm.py +0 -0
  32. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/embeddings/voyageai.py +0 -0
  33. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/__init__.py +0 -0
  34. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/__init__.py +0 -0
  35. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/cli_renderer.py +0 -0
  36. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/emitter.py +0 -0
  37. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/events.py +0 -0
  38. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/server.py +0 -0
  39. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/state.py +0 -0
  40. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/agui/stream.py +0 -0
  41. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/__init__.py +0 -0
  42. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/models.py +0 -0
  43. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/nodes.py +0 -0
  44. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/prompts.py +0 -0
  45. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/common/utils.py +0 -0
  46. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/__init__.py +0 -0
  47. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/dependencies.py +0 -0
  48. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/graph.py +0 -0
  49. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/models.py +0 -0
  50. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/prompts.py +0 -0
  51. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/deep_qa/state.py +0 -0
  52. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/__init__.py +0 -0
  53. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/common.py +0 -0
  54. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/dependencies.py +0 -0
  55. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/graph.py +0 -0
  56. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/models.py +0 -0
  57. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/prompts.py +0 -0
  58. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/graph/research/state.py +0 -0
  59. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/logging.py +0 -0
  60. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/mcp.py +0 -0
  61. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/qa/__init__.py +0 -0
  62. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/qa/agent.py +0 -0
  63. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/qa/prompts.py +0 -0
  64. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/__init__.py +0 -0
  65. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/base.py +0 -0
  66. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/cohere.py +0 -0
  67. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/mxbai.py +0 -0
  68. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/vllm.py +0 -0
  69. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/reranking/zeroentropy.py +0 -0
  70. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/__init__.py +0 -0
  71. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/engine.py +0 -0
  72. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/models/__init__.py +0 -0
  73. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/models/chunk.py +0 -0
  74. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/models/document.py +0 -0
  75. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/__init__.py +0 -0
  76. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/document.py +0 -0
  77. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/repositories/settings.py +0 -0
  78. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/upgrades/__init__.py +0 -0
  79. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/upgrades/v0_10_1.py +0 -0
  80. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/store/upgrades/v0_9_3.py +0 -0
  81. {haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/test_agui_server.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haiku.rag-slim
3
- Version: 0.16.1
3
+ Version: 0.17.1
4
4
  Summary: Agentic Retrieval Augmented Generation (RAG) with LanceDB - Minimal dependencies
5
5
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
6
6
  License: MIT
@@ -26,7 +26,6 @@ Requires-Dist: pydantic>=2.12.3
26
26
  Requires-Dist: python-dotenv>=1.2.1
27
27
  Requires-Dist: pyyaml>=6.0.3
28
28
  Requires-Dist: rich>=14.2.0
29
- Requires-Dist: tiktoken>=0.12.0
30
29
  Requires-Dist: typer<0.20.0,>=0.19.2
31
30
  Requires-Dist: watchfiles>=1.1.1
32
31
  Provides-Extra: anthropic
@@ -0,0 +1,31 @@
1
+ """Document chunker abstraction for haiku.rag."""
2
+
3
+ from haiku.rag.chunkers.base import DocumentChunker
4
+ from haiku.rag.config import AppConfig, Config
5
+
6
+ __all__ = ["DocumentChunker", "get_chunker"]
7
+
8
+
9
def get_chunker(config: AppConfig = Config) -> DocumentChunker:
    """Build the document chunker selected by ``config.processing.chunker``.

    The concrete chunker module is imported inside the matching branch, so only
    the selected backend's module is loaded.

    Args:
        config: Configuration to read the chunker provider from. Defaults to
            the global Config.

    Returns:
        A DocumentChunker constructed with the given config.

    Raises:
        ValueError: If the configured chunker provider is not recognized.
    """
    provider = config.processing.chunker

    if provider == "docling-local":
        from haiku.rag.chunkers.docling_local import DoclingLocalChunker

        chunker_cls = DoclingLocalChunker
    elif provider == "docling-serve":
        from haiku.rag.chunkers.docling_serve import DoclingServeChunker

        chunker_cls = DoclingServeChunker
    else:
        raise ValueError(f"Unsupported chunker: {config.processing.chunker}")

    return chunker_cls(config)
@@ -0,0 +1,28 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import TYPE_CHECKING
3
+
4
+ if TYPE_CHECKING:
5
+ from docling_core.types.doc.document import DoclingDocument
6
+
7
+
8
class DocumentChunker(ABC):
    """Interface that every document chunker implements.

    A chunker receives a DoclingDocument and breaks it into smaller pieces of
    text intended for embedding and retrieval, keeping document structure and
    semantic boundaries intact.
    """

    @abstractmethod
    async def chunk(self, document: "DoclingDocument") -> list[str]:
        """Break a document into text chunks.

        Args:
            document: The DoclingDocument to chunk.

        Returns:
            The text chunks, with semantic boundaries preserved.

        Raises:
            ValueError: If chunking fails.
        """
@@ -0,0 +1,110 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ from haiku.rag.chunkers.base import DocumentChunker
4
+ from haiku.rag.config import AppConfig, Config
5
+
6
+ if TYPE_CHECKING:
7
+ from docling_core.types.doc.document import DoclingDocument
8
+
9
+
10
def _create_markdown_serializer_provider(use_markdown_tables: bool = True):
    """Return a serializer provider whose table rendering is configurable.

    The provider class subclasses docling-core's ChunkingSerializerProvider and
    is defined inside this factory so that docling-core is only imported when
    the factory is called, not when this module is imported.

    Args:
        use_markdown_tables: When True, tables are serialized with
            MarkdownTableSerializer. When False, ChunkingDocSerializer is
            created without an explicit table serializer, leaving its default
            in place.
    """
    from docling_core.transforms.chunker.hierarchical_chunker import (
        ChunkingDocSerializer,
        ChunkingSerializerProvider,
    )
    from docling_core.transforms.serializer.markdown import MarkdownTableSerializer

    class MDTableSerializerProvider(ChunkingSerializerProvider):
        """Serializer provider for markdown table output."""

        def __init__(self, use_markdown_tables: bool = True):
            self.use_markdown_tables = use_markdown_tables

        def get_serializer(self, doc):
            if not self.use_markdown_tables:
                # Fall back to ChunkingDocSerializer's default table serializer
                # (TripletTableSerializer, per the original author's note).
                return ChunkingDocSerializer(doc=doc)

            return ChunkingDocSerializer(
                doc=doc,
                table_serializer=MarkdownTableSerializer(),
            )

    provider = MDTableSerializerProvider(use_markdown_tables=use_markdown_tables)
    return provider
44
+
45
+
46
class DoclingLocalChunker(DocumentChunker):
    """Chunk documents in-process with docling-core's chunkers.

    Two strategies are available, selected by ``config.processing.chunker_type``:
    "hybrid" (token-aware, using a HuggingFace tokenizer named in the
    configuration) and "hierarchical".

    Args:
        config: Application configuration.
    """

    def __init__(self, config: AppConfig = Config):
        # Imported here rather than at module level so that importing this
        # module does not load docling-core or transformers.
        from docling_core.transforms.chunker.hierarchical_chunker import (
            HierarchicalChunker,
        )
        from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
        from docling_core.transforms.chunker.tokenizer.huggingface import (
            HuggingFaceTokenizer,
        )
        from transformers import AutoTokenizer

        processing = config.processing

        self.config = config
        self.chunk_size = processing.chunk_size
        self.chunker_type = processing.chunker_type
        self.tokenizer_name = processing.chunking_tokenizer

        # Reject unknown strategies up front.
        if self.chunker_type not in ("hybrid", "hierarchical"):
            raise ValueError(
                f"Unsupported chunker_type: {self.chunker_type}. "
                "Must be 'hybrid' or 'hierarchical'."
            )

        if self.chunker_type == "hierarchical":
            # The hierarchical chunker takes no tokenizer; only table
            # serialization is configured.
            self.chunker = HierarchicalChunker(
                serializer_provider=_create_markdown_serializer_provider(
                    use_markdown_tables=processing.chunking_use_markdown_tables
                )
            )
            return

        # Hybrid strategy: the tokenizer caps each chunk at chunk_size tokens.
        pretrained = AutoTokenizer.from_pretrained(self.tokenizer_name)
        wrapped_tokenizer = HuggingFaceTokenizer(
            tokenizer=pretrained, max_tokens=self.chunk_size
        )
        table_provider = _create_markdown_serializer_provider(
            use_markdown_tables=processing.chunking_use_markdown_tables
        )
        self.chunker = HybridChunker(
            tokenizer=wrapped_tokenizer,
            merge_peers=processing.chunking_merge_peers,
            serializer_provider=table_provider,
        )

    async def chunk(self, document: "DoclingDocument") -> list[str]:
        """Split the document into contextualized text chunks.

        Args:
            document: The DoclingDocument to be split into chunks.

        Returns:
            One string per chunk, each produced by the chunker's
            ``contextualize``. An empty list when ``document`` is None.
        """
        if document is None:
            return []

        # Uses whichever chunker __init__ configured (hybrid or hierarchical).
        # Materialize all chunks before contextualizing any of them.
        pieces = [*self.chunker.chunk(document)]
        contextualize = self.chunker.contextualize
        return [contextualize(piece) for piece in pieces]
@@ -0,0 +1,111 @@
1
+ from io import BytesIO
2
+ from typing import TYPE_CHECKING
3
+
4
+ import requests
5
+
6
+ from haiku.rag.chunkers.base import DocumentChunker
7
+ from haiku.rag.config import AppConfig, Config
8
+
9
+ if TYPE_CHECKING:
10
+ from docling_core.types.doc.document import DoclingDocument
11
+
12
+
13
class DoclingServeChunker(DocumentChunker):
    """Remote document chunker using the docling-serve API.

    The DoclingDocument is serialized to JSON and uploaded to docling-serve,
    which performs the chunking. The hierarchical endpoint is used when
    ``chunker_type`` is "hierarchical"; any other value uses the hybrid
    endpoint.

    Args:
        config: Application configuration containing docling-serve settings.
    """

    def __init__(self, config: AppConfig = Config):
        self.config = config
        # Strip trailing slashes so endpoint paths can be appended with "/".
        self.base_url = config.providers.docling_serve.base_url.rstrip("/")
        self.api_key = config.providers.docling_serve.api_key
        self.timeout = config.providers.docling_serve.timeout
        self.chunker_type = config.processing.chunker_type

    async def chunk(self, document: "DoclingDocument") -> list[str]:
        """Split the document into chunks via docling-serve.

        Exports the DoclingDocument to JSON and posts it as a multipart upload
        to docling-serve's chunking endpoint, together with the chunking
        parameters from the configuration. The blocking HTTP call runs in a
        worker thread so the event loop stays responsive while waiting.

        Args:
            document: The DoclingDocument to be split into chunks.

        Returns:
            The ``text`` of every entry in the response's ``chunks`` list, or
            an empty list when ``document`` is None or the response has no
            ``chunks`` key.

        Raises:
            ValueError: If the service is unreachable, times out, rejects the
                request, or returns an unexpected payload. The underlying
                exception is attached as ``__cause__``.
        """
        if document is None:
            return []

        # Imported locally so this method does not rely on a module-level import.
        import asyncio

        try:
            # Determine endpoint based on chunker_type
            endpoint = (
                "hierarchical" if self.chunker_type == "hierarchical" else "hybrid"
            )
            url = f"{self.base_url}/v1/chunk/{endpoint}/file"

            # Export document to JSON
            doc_bytes = document.model_dump_json().encode("utf-8")

            # Prepare multipart request with DoclingDocument JSON
            files = {"files": ("document.json", BytesIO(doc_bytes), "application/json")}

            # Form fields are sent as strings; booleans become "true"/"false".
            processing = self.config.processing
            data = {
                "chunking_max_tokens": str(processing.chunk_size),
                "chunking_tokenizer": processing.chunking_tokenizer,
                "chunking_merge_peers": str(processing.chunking_merge_peers).lower(),
                "chunking_use_markdown_tables": str(
                    processing.chunking_use_markdown_tables
                ).lower(),
            }

            # Only send the API key header when a key is configured.
            headers = {}
            if self.api_key:
                headers["X-Api-Key"] = self.api_key

            # requests.post blocks; run it off the event loop thread.
            response = await asyncio.to_thread(
                requests.post,
                url,
                files=files,
                data=data,
                headers=headers,
                timeout=self.timeout,
            )

            response.raise_for_status()

            result = response.json()

            # Extract text from chunks
            return [chunk["text"] for chunk in result.get("chunks", [])]

        except requests.exceptions.ConnectionError as e:
            raise ValueError(
                f"Could not connect to docling-serve at {self.base_url}. "
                f"Ensure the service is running and accessible. Error: {e}"
            ) from e
        except requests.exceptions.Timeout as e:
            raise ValueError(
                f"Request to docling-serve timed out after {self.timeout}s. "
                f"Consider increasing the timeout in configuration. Error: {e}"
            ) from e
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 401:
                raise ValueError(
                    "Authentication failed. Check your API key configuration."
                ) from e
            raise ValueError(f"HTTP error from docling-serve: {e}") from e
        except Exception as e:
            raise ValueError(f"Failed to chunk via docling-serve: {e}") from e
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
9
9
  import httpx
10
10
 
11
11
  from haiku.rag.config import AppConfig, Config
12
+ from haiku.rag.converters import get_converter
12
13
  from haiku.rag.reranking import get_reranker
13
14
  from haiku.rag.store.engine import Store
14
15
  from haiku.rag.store.models.chunk import Chunk
@@ -111,10 +112,9 @@ class HaikuRAG:
111
112
 
112
113
  # Only create docling_document if we need to generate chunks
113
114
  if chunks is None:
114
- # Lazy import to avoid loading docling
115
- from haiku.rag.utils import text_to_docling_document
116
-
117
- docling_document = text_to_docling_document(content)
115
+ # Use converter to convert text
116
+ converter = get_converter(self._config)
117
+ docling_document = converter.convert_text(content)
118
118
  else:
119
119
  # Chunks already provided, no conversion needed
120
120
  docling_document = None
@@ -201,12 +201,10 @@ class HaikuRAG:
201
201
  Raises:
202
202
  ValueError: If the file cannot be parsed or doesn't exist
203
203
  """
204
- # Lazy import to avoid loading docling
205
- from haiku.rag.reader import FileReader
206
-
207
204
  metadata = metadata or {}
208
205
 
209
- if source_path.suffix.lower() not in FileReader.extensions:
206
+ converter = get_converter(self._config)
207
+ if source_path.suffix.lower() not in converter.supported_extensions:
210
208
  raise ValueError(f"Unsupported file extension: {source_path.suffix}")
211
209
 
212
210
  if not source_path.exists():
@@ -242,7 +240,8 @@ class HaikuRAG:
242
240
  return existing_doc
243
241
 
244
242
  # Parse file only when content changed or new document
245
- docling_document = FileReader.parse_file(source_path)
243
+ converter = get_converter(self._config)
244
+ docling_document = converter.convert_file(source_path)
246
245
 
247
246
  if existing_doc:
248
247
  # Update existing document
@@ -283,11 +282,11 @@ class HaikuRAG:
283
282
  ValueError: If the content cannot be parsed
284
283
  httpx.RequestError: If URL request fails
285
284
  """
286
- # Lazy import to avoid loading docling
287
- from haiku.rag.reader import FileReader
288
-
289
285
  metadata = metadata or {}
290
286
 
287
+ converter = get_converter(self._config)
288
+ supported_extensions = converter.supported_extensions
289
+
291
290
  async with httpx.AsyncClient() as client:
292
291
  response = await client.get(url)
293
292
  response.raise_for_status()
@@ -320,7 +319,7 @@ class HaikuRAG:
320
319
  url, content_type
321
320
  )
322
321
 
323
- if file_extension not in FileReader.extensions:
322
+ if file_extension not in supported_extensions:
324
323
  raise ValueError(
325
324
  f"Unsupported content type/extension: {content_type}/{file_extension}"
326
325
  )
@@ -333,8 +332,8 @@ class HaikuRAG:
333
332
  temp_file.flush() # Ensure content is written to disk
334
333
  temp_path = Path(temp_file.name)
335
334
 
336
- # Parse the content using FileReader
337
- docling_document = FileReader.parse_file(temp_path)
335
+ # Parse the content using converter
336
+ docling_document = converter.convert_file(temp_path)
338
337
 
339
338
  # Merge metadata with contentType and md5
340
339
  metadata.update({"contentType": content_type, "md5": md5_hash})
@@ -410,11 +409,9 @@ class HaikuRAG:
410
409
 
411
410
  async def update_document(self, document: Document) -> Document:
412
411
  """Update an existing document."""
413
- # Lazy import to avoid loading docling
414
- from haiku.rag.utils import text_to_docling_document
415
-
416
412
  # Convert content to DoclingDocument
417
- docling_document = text_to_docling_document(document.content)
413
+ converter = get_converter(self._config)
414
+ docling_document = converter.convert_text(document.content)
418
415
 
419
416
  return await self.document_repository._update_and_rechunk(
420
417
  document, docling_document
@@ -469,8 +466,8 @@ class HaikuRAG:
469
466
  # No reranking - return direct search results
470
467
  return await self.chunk_repository.search(query, limit, search_type, filter)
471
468
 
472
- # Get more initial results (3X) for reranking
473
- search_limit = limit * 3
469
+ # Get more initial results (10X) for reranking
470
+ search_limit = limit * 10
474
471
  search_results = await self.chunk_repository.search(
475
472
  query, search_limit, search_type, filter
476
473
  )
@@ -646,12 +643,11 @@ class HaikuRAG:
646
643
  Yields:
647
644
  int: The ID of the document currently being processed
648
645
  """
649
- # Lazy import to avoid loading docling
650
- from haiku.rag.utils import text_to_docling_document
651
-
652
646
  await self.chunk_repository.delete_all()
653
647
  self.store.recreate_embeddings_table()
654
648
 
649
+ converter = get_converter(self._config)
650
+
655
651
  # Update settings to current config
656
652
  settings_repo = SettingsRepository(self.store)
657
653
  settings_repo.save_current_settings()
@@ -703,14 +699,14 @@ class HaikuRAG:
703
699
  logger.warning(
704
700
  "Source missing for %s, re-embedding from content", doc.uri
705
701
  )
706
- docling_document = text_to_docling_document(doc.content)
702
+ docling_document = converter.convert_text(doc.content)
707
703
  await self.chunk_repository.create_chunks_for_document(
708
704
  doc.id, docling_document
709
705
  )
710
706
  yield doc.id
711
707
  else:
712
708
  # Document without URI - re-create chunks from existing content
713
- docling_document = text_to_docling_document(doc.content)
709
+ docling_document = converter.convert_text(doc.content)
714
710
  await self.chunk_repository.create_chunks_for_document(
715
711
  doc.id, docling_document
716
712
  )
@@ -8,6 +8,7 @@ from haiku.rag.config.loader import (
8
8
  from haiku.rag.config.models import (
9
9
  AGUIConfig,
10
10
  AppConfig,
11
+ ConversionOptions,
11
12
  EmbeddingsConfig,
12
13
  LanceDBConfig,
13
14
  MonitorConfig,
@@ -25,6 +26,7 @@ __all__ = [
25
26
  "Config",
26
27
  "AGUIConfig",
27
28
  "AppConfig",
29
+ "ConversionOptions",
28
30
  "StorageConfig",
29
31
  "MonitorConfig",
30
32
  "LanceDBConfig",
@@ -1,4 +1,5 @@
1
1
  from pathlib import Path
2
+ from typing import Literal
2
3
 
3
4
  from pydantic import BaseModel, Field
4
5
 
@@ -50,10 +51,34 @@ class ResearchConfig(BaseModel):
50
51
  max_concurrency: int = 1
51
52
 
52
53
 
54
class ConversionOptions(BaseModel):
    """Options for document conversion."""

    # --- OCR ---
    # Whether OCR is enabled at all.
    do_ocr: bool = True
    # Whether OCR is forced rather than applied selectively.
    force_ocr: bool = False
    # OCR language codes; empty by default.
    ocr_lang: list[str] = []

    # --- Tables ---
    # Whether table structure is extracted.
    do_table_structure: bool = True
    # Table extraction mode; only "fast" and "accurate" are accepted.
    table_mode: Literal["fast", "accurate"] = "accurate"
    # Whether table cell matching is enabled.
    table_cell_matching: bool = True

    # --- Images ---
    # Scale factor applied to images.
    images_scale: float = 2.0
69
+
70
+
53
71
  class ProcessingConfig(BaseModel):
54
72
  chunk_size: int = 256
55
73
  context_chunk_radius: int = 0
56
74
  markdown_preprocessor: str = ""
75
+ converter: str = "docling-local"
76
+ chunker: str = "docling-local"
77
+ chunker_type: str = "hybrid"
78
+ chunking_tokenizer: str = "Qwen/Qwen3-Embedding-0.6B"
79
+ chunking_merge_peers: bool = True
80
+ chunking_use_markdown_tables: bool = False
81
+ conversion_options: ConversionOptions = Field(default_factory=ConversionOptions)
57
82
 
58
83
 
59
84
  class OllamaConfig(BaseModel):
@@ -71,9 +96,16 @@ class VLLMConfig(BaseModel):
71
96
  research_base_url: str = ""
72
97
 
73
98
 
99
class DoclingServeConfig(BaseModel):
    # Connection settings for a docling-serve instance.

    # Root URL of the service; endpoint paths are appended by the callers.
    base_url: str = "http://localhost:5001"
    # API key; an empty string means no key is configured.
    api_key: str = ""
    # Request timeout in seconds.
    timeout: int = 300
103
+
104
+
74
105
  class ProvidersConfig(BaseModel):
75
106
  ollama: OllamaConfig = Field(default_factory=OllamaConfig)
76
107
  vllm: VLLMConfig = Field(default_factory=VLLMConfig)
108
+ docling_serve: DoclingServeConfig = Field(default_factory=DoclingServeConfig)
77
109
 
78
110
 
79
111
  class AGUIConfig(BaseModel):
@@ -0,0 +1,31 @@
1
+ """Document converter abstraction for haiku.rag."""
2
+
3
+ from haiku.rag.config import AppConfig, Config
4
+ from haiku.rag.converters.base import DocumentConverter
5
+
6
+ __all__ = ["DocumentConverter", "get_converter"]
7
+
8
+
9
def get_converter(config: AppConfig = Config) -> DocumentConverter:
    """Build the document converter selected by ``config.processing.converter``.

    The concrete converter module is imported inside the matching branch, so
    only the selected backend's module is loaded.

    Args:
        config: Configuration to read the converter provider from. Defaults to
            the global Config.

    Returns:
        A DocumentConverter constructed with the given config.

    Raises:
        ValueError: If the configured converter provider is not recognized.
    """
    provider = config.processing.converter

    if provider == "docling-local":
        from haiku.rag.converters.docling_local import DoclingLocalConverter

        converter_cls = DoclingLocalConverter
    elif provider == "docling-serve":
        from haiku.rag.converters.docling_serve import DoclingServeConverter

        converter_cls = DoclingServeConverter
    else:
        raise ValueError(
            f"Unsupported converter provider: {config.processing.converter}"
        )

    return converter_cls(config)
@@ -0,0 +1,57 @@
1
+ """Base class for document converters."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from docling_core.types.doc.document import DoclingDocument
9
+
10
+
11
class DocumentConverter(ABC):
    """Interface that every document converter implements.

    A converter turns source material in various formats (PDF, DOCX, HTML,
    etc.) into a DoclingDocument that later processing stages consume.
    """

    @property
    @abstractmethod
    def supported_extensions(self) -> list[str]:
        """File extensions this converter can handle.

        Returns:
            Extensions including the leading dot, e.g. [".pdf", ".docx"].
        """

    @abstractmethod
    def convert_file(self, path: Path) -> "DoclingDocument":
        """Convert the file at ``path`` into a DoclingDocument.

        Args:
            path: Path to the file to convert.

        Returns:
            DoclingDocument representation of the file.

        Raises:
            ValueError: If the file cannot be converted.
        """

    @abstractmethod
    def convert_text(self, text: str, name: str = "content.md") -> "DoclingDocument":
        """Convert raw text into a DoclingDocument.

        Args:
            text: The text content to convert.
            name: Name to give the document (defaults to "content.md").

        Returns:
            DoclingDocument representation of the text.

        Raises:
            ValueError: If the text cannot be converted.
        """