haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag-slim might be problematic. Click here for more details.
- haiku/rag/app.py +430 -72
- haiku/rag/chunkers/__init__.py +31 -0
- haiku/rag/chunkers/base.py +31 -0
- haiku/rag/chunkers/docling_local.py +164 -0
- haiku/rag/chunkers/docling_serve.py +179 -0
- haiku/rag/cli.py +207 -24
- haiku/rag/cli_chat.py +489 -0
- haiku/rag/client.py +1251 -266
- haiku/rag/config/__init__.py +16 -10
- haiku/rag/config/loader.py +5 -44
- haiku/rag/config/models.py +126 -17
- haiku/rag/converters/__init__.py +31 -0
- haiku/rag/converters/base.py +63 -0
- haiku/rag/converters/docling_local.py +193 -0
- haiku/rag/converters/docling_serve.py +229 -0
- haiku/rag/converters/text_utils.py +237 -0
- haiku/rag/embeddings/__init__.py +123 -24
- haiku/rag/embeddings/voyageai.py +175 -20
- haiku/rag/graph/__init__.py +0 -11
- haiku/rag/graph/agui/__init__.py +8 -2
- haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku/rag/graph/agui/emitter.py +219 -31
- haiku/rag/graph/agui/server.py +20 -62
- haiku/rag/graph/agui/stream.py +1 -2
- haiku/rag/graph/research/__init__.py +5 -2
- haiku/rag/graph/research/dependencies.py +12 -126
- haiku/rag/graph/research/graph.py +390 -135
- haiku/rag/graph/research/models.py +91 -112
- haiku/rag/graph/research/prompts.py +99 -91
- haiku/rag/graph/research/state.py +35 -27
- haiku/rag/inspector/__init__.py +8 -0
- haiku/rag/inspector/app.py +259 -0
- haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku/rag/inspector/widgets/visual_modal.py +126 -0
- haiku/rag/mcp.py +106 -102
- haiku/rag/monitor.py +33 -9
- haiku/rag/providers/__init__.py +5 -0
- haiku/rag/providers/docling_serve.py +108 -0
- haiku/rag/qa/__init__.py +12 -10
- haiku/rag/qa/agent.py +43 -61
- haiku/rag/qa/prompts.py +35 -57
- haiku/rag/reranking/__init__.py +9 -6
- haiku/rag/reranking/base.py +1 -1
- haiku/rag/reranking/cohere.py +5 -4
- haiku/rag/reranking/mxbai.py +5 -2
- haiku/rag/reranking/vllm.py +3 -4
- haiku/rag/reranking/zeroentropy.py +6 -5
- haiku/rag/store/__init__.py +2 -1
- haiku/rag/store/engine.py +242 -42
- haiku/rag/store/exceptions.py +4 -0
- haiku/rag/store/models/__init__.py +8 -2
- haiku/rag/store/models/chunk.py +190 -0
- haiku/rag/store/models/document.py +46 -0
- haiku/rag/store/repositories/chunk.py +141 -121
- haiku/rag/store/repositories/document.py +25 -84
- haiku/rag/store/repositories/settings.py +11 -14
- haiku/rag/store/upgrades/__init__.py +19 -3
- haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku/rag/store/upgrades/v0_20_0.py +68 -0
- haiku/rag/store/upgrades/v0_23_1.py +100 -0
- haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku/rag/utils.py +371 -146
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
- haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
- haiku/rag/chunker.py +0 -65
- haiku/rag/embeddings/base.py +0 -25
- haiku/rag/embeddings/ollama.py +0 -28
- haiku/rag/embeddings/openai.py +0 -26
- haiku/rag/embeddings/vllm.py +0 -29
- haiku/rag/graph/agui/events.py +0 -254
- haiku/rag/graph/common/__init__.py +0 -5
- haiku/rag/graph/common/models.py +0 -42
- haiku/rag/graph/common/nodes.py +0 -265
- haiku/rag/graph/common/prompts.py +0 -46
- haiku/rag/graph/common/utils.py +0 -44
- haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku/rag/graph/deep_qa/models.py +0 -20
- haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku/rag/graph/deep_qa/state.py +0 -56
- haiku/rag/graph/research/common.py +0 -87
- haiku/rag/reader.py +0 -135
- haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/config/__init__.py
CHANGED
|
@@ -8,38 +8,44 @@ from haiku.rag.config.loader import (
|
|
|
8
8
|
from haiku.rag.config.models import (
|
|
9
9
|
AGUIConfig,
|
|
10
10
|
AppConfig,
|
|
11
|
+
ConversionOptions,
|
|
12
|
+
EmbeddingModelConfig,
|
|
11
13
|
EmbeddingsConfig,
|
|
12
14
|
LanceDBConfig,
|
|
15
|
+
ModelConfig,
|
|
13
16
|
MonitorConfig,
|
|
14
17
|
OllamaConfig,
|
|
15
18
|
ProcessingConfig,
|
|
19
|
+
PromptsConfig,
|
|
16
20
|
ProvidersConfig,
|
|
17
21
|
QAConfig,
|
|
18
22
|
RerankingConfig,
|
|
19
23
|
ResearchConfig,
|
|
20
24
|
StorageConfig,
|
|
21
|
-
VLLMConfig,
|
|
22
25
|
)
|
|
23
26
|
|
|
24
27
|
__all__ = [
|
|
25
28
|
"Config",
|
|
26
29
|
"AGUIConfig",
|
|
27
30
|
"AppConfig",
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"LanceDBConfig",
|
|
31
|
+
"ConversionOptions",
|
|
32
|
+
"EmbeddingModelConfig",
|
|
31
33
|
"EmbeddingsConfig",
|
|
32
|
-
"
|
|
33
|
-
"
|
|
34
|
-
"
|
|
35
|
-
"ProcessingConfig",
|
|
34
|
+
"LanceDBConfig",
|
|
35
|
+
"ModelConfig",
|
|
36
|
+
"MonitorConfig",
|
|
36
37
|
"OllamaConfig",
|
|
37
|
-
"
|
|
38
|
+
"ProcessingConfig",
|
|
39
|
+
"PromptsConfig",
|
|
38
40
|
"ProvidersConfig",
|
|
41
|
+
"QAConfig",
|
|
42
|
+
"RerankingConfig",
|
|
43
|
+
"ResearchConfig",
|
|
44
|
+
"StorageConfig",
|
|
39
45
|
"find_config_file",
|
|
40
|
-
"load_yaml_config",
|
|
41
46
|
"generate_default_config",
|
|
42
47
|
"get_config",
|
|
48
|
+
"load_yaml_config",
|
|
43
49
|
"set_config",
|
|
44
50
|
]
|
|
45
51
|
|
haiku/rag/config/loader.py
CHANGED
|
@@ -48,47 +48,8 @@ def load_yaml_config(path: Path) -> dict:
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
def generate_default_config() -> dict:
|
|
51
|
-
"""Generate a default YAML config structure
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
"vacuum_retention_seconds": 86400,
|
|
57
|
-
},
|
|
58
|
-
"monitor": {
|
|
59
|
-
"directories": [],
|
|
60
|
-
"ignore_patterns": [],
|
|
61
|
-
"include_patterns": [],
|
|
62
|
-
},
|
|
63
|
-
"lancedb": {"uri": "", "api_key": "", "region": ""},
|
|
64
|
-
"embeddings": {
|
|
65
|
-
"provider": "ollama",
|
|
66
|
-
"model": "qwen3-embedding",
|
|
67
|
-
"vector_dim": 4096,
|
|
68
|
-
},
|
|
69
|
-
"reranking": {"provider": "", "model": ""},
|
|
70
|
-
"qa": {"provider": "ollama", "model": "gpt-oss"},
|
|
71
|
-
"research": {"provider": "", "model": ""},
|
|
72
|
-
"processing": {
|
|
73
|
-
"chunk_size": 256,
|
|
74
|
-
"context_chunk_radius": 0,
|
|
75
|
-
"markdown_preprocessor": "",
|
|
76
|
-
},
|
|
77
|
-
"providers": {
|
|
78
|
-
"ollama": {"base_url": "http://localhost:11434"},
|
|
79
|
-
"vllm": {
|
|
80
|
-
"embeddings_base_url": "",
|
|
81
|
-
"rerank_base_url": "",
|
|
82
|
-
"qa_base_url": "",
|
|
83
|
-
"research_base_url": "",
|
|
84
|
-
},
|
|
85
|
-
},
|
|
86
|
-
"agui": {
|
|
87
|
-
"host": "0.0.0.0",
|
|
88
|
-
"port": 8000,
|
|
89
|
-
"cors_origins": ["*"],
|
|
90
|
-
"cors_credentials": True,
|
|
91
|
-
"cors_methods": ["GET", "POST", "OPTIONS"],
|
|
92
|
-
"cors_headers": ["*"],
|
|
93
|
-
},
|
|
94
|
-
}
|
|
51
|
+
"""Generate a default YAML config structure from AppConfig defaults."""
|
|
52
|
+
from haiku.rag.config.models import AppConfig
|
|
53
|
+
|
|
54
|
+
default_config = AppConfig()
|
|
55
|
+
return default_config.model_dump(mode="json", exclude_none=False)
|
haiku/rag/config/models.py
CHANGED
|
@@ -1,12 +1,51 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
+
from typing import Literal
|
|
2
3
|
|
|
3
4
|
from pydantic import BaseModel, Field
|
|
4
5
|
|
|
5
6
|
from haiku.rag.utils import get_default_data_dir
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
class ModelConfig(BaseModel):
|
|
10
|
+
"""Configuration for a language model.
|
|
11
|
+
|
|
12
|
+
Attributes:
|
|
13
|
+
provider: Model provider (ollama, openai, anthropic, etc.)
|
|
14
|
+
name: Model name/identifier
|
|
15
|
+
base_url: Optional base URL for OpenAI-compatible servers (vLLM, LM Studio, etc.)
|
|
16
|
+
enable_thinking: Control reasoning behavior (true/false/None for default)
|
|
17
|
+
temperature: Sampling temperature (0.0 to 1.0+)
|
|
18
|
+
max_tokens: Maximum tokens to generate
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
provider: str = "ollama"
|
|
22
|
+
name: str = "gpt-oss"
|
|
23
|
+
base_url: str | None = None
|
|
24
|
+
|
|
25
|
+
enable_thinking: bool | None = None
|
|
26
|
+
temperature: float | None = None
|
|
27
|
+
max_tokens: int | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class EmbeddingModelConfig(BaseModel):
|
|
31
|
+
"""Configuration for an embedding model.
|
|
32
|
+
|
|
33
|
+
Attributes:
|
|
34
|
+
provider: Model provider (ollama, openai, voyageai, cohere, sentence-transformers)
|
|
35
|
+
name: Model name/identifier
|
|
36
|
+
vector_dim: Vector dimensions produced by the model
|
|
37
|
+
base_url: Optional base URL for OpenAI-compatible servers (vLLM, LM Studio, etc.)
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
provider: str = "ollama"
|
|
41
|
+
name: str = "qwen3-embedding:4b"
|
|
42
|
+
vector_dim: int = 2560
|
|
43
|
+
base_url: str | None = None
|
|
44
|
+
|
|
45
|
+
|
|
8
46
|
class StorageConfig(BaseModel):
|
|
9
47
|
data_dir: Path = Field(default_factory=get_default_data_dir)
|
|
48
|
+
auto_vacuum: bool = True
|
|
10
49
|
vacuum_retention_seconds: int = 86400
|
|
11
50
|
|
|
12
51
|
|
|
@@ -24,36 +63,94 @@ class LanceDBConfig(BaseModel):
|
|
|
24
63
|
|
|
25
64
|
|
|
26
65
|
class EmbeddingsConfig(BaseModel):
|
|
27
|
-
|
|
28
|
-
model: str = "qwen3-embedding"
|
|
29
|
-
vector_dim: int = 4096
|
|
66
|
+
model: EmbeddingModelConfig = Field(default_factory=EmbeddingModelConfig)
|
|
30
67
|
|
|
31
68
|
|
|
32
69
|
class RerankingConfig(BaseModel):
|
|
33
|
-
|
|
34
|
-
model: str = ""
|
|
70
|
+
model: ModelConfig | None = None
|
|
35
71
|
|
|
36
72
|
|
|
37
73
|
class QAConfig(BaseModel):
|
|
38
|
-
|
|
39
|
-
|
|
74
|
+
model: ModelConfig = Field(
|
|
75
|
+
default_factory=lambda: ModelConfig(
|
|
76
|
+
provider="ollama",
|
|
77
|
+
name="gpt-oss",
|
|
78
|
+
enable_thinking=False,
|
|
79
|
+
)
|
|
80
|
+
)
|
|
40
81
|
max_sub_questions: int = 3
|
|
41
82
|
max_iterations: int = 2
|
|
42
83
|
max_concurrency: int = 1
|
|
43
84
|
|
|
44
85
|
|
|
45
86
|
class ResearchConfig(BaseModel):
|
|
46
|
-
|
|
47
|
-
|
|
87
|
+
model: ModelConfig = Field(
|
|
88
|
+
default_factory=lambda: ModelConfig(
|
|
89
|
+
provider="ollama",
|
|
90
|
+
name="gpt-oss",
|
|
91
|
+
enable_thinking=False,
|
|
92
|
+
)
|
|
93
|
+
)
|
|
48
94
|
max_iterations: int = 3
|
|
49
95
|
confidence_threshold: float = 0.8
|
|
50
96
|
max_concurrency: int = 1
|
|
51
97
|
|
|
52
98
|
|
|
99
|
+
class PictureDescriptionConfig(BaseModel):
|
|
100
|
+
"""Configuration for VLM-based picture description."""
|
|
101
|
+
|
|
102
|
+
enabled: bool = False
|
|
103
|
+
model: ModelConfig = Field(
|
|
104
|
+
default_factory=lambda: ModelConfig(
|
|
105
|
+
provider="ollama",
|
|
106
|
+
name="ministral-3",
|
|
107
|
+
)
|
|
108
|
+
)
|
|
109
|
+
timeout: int = 90
|
|
110
|
+
max_tokens: int = 200
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class ConversionOptions(BaseModel):
|
|
114
|
+
"""Options for document conversion."""
|
|
115
|
+
|
|
116
|
+
# OCR options
|
|
117
|
+
do_ocr: bool = True
|
|
118
|
+
force_ocr: bool = False
|
|
119
|
+
ocr_lang: list[str] = []
|
|
120
|
+
|
|
121
|
+
# Table options
|
|
122
|
+
do_table_structure: bool = True
|
|
123
|
+
table_mode: Literal["fast", "accurate"] = "accurate"
|
|
124
|
+
table_cell_matching: bool = True
|
|
125
|
+
|
|
126
|
+
# Image options
|
|
127
|
+
images_scale: float = 2.0
|
|
128
|
+
generate_picture_images: bool = False
|
|
129
|
+
|
|
130
|
+
# VLM picture description
|
|
131
|
+
picture_description: PictureDescriptionConfig = Field(
|
|
132
|
+
default_factory=PictureDescriptionConfig
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
|
|
53
136
|
class ProcessingConfig(BaseModel):
|
|
54
137
|
chunk_size: int = 256
|
|
55
|
-
|
|
56
|
-
|
|
138
|
+
converter: str = "docling-local"
|
|
139
|
+
chunker: str = "docling-local"
|
|
140
|
+
chunker_type: str = "hybrid"
|
|
141
|
+
chunking_tokenizer: str = "Qwen/Qwen3-Embedding-0.6B"
|
|
142
|
+
chunking_merge_peers: bool = True
|
|
143
|
+
chunking_use_markdown_tables: bool = False
|
|
144
|
+
conversion_options: ConversionOptions = Field(default_factory=ConversionOptions)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class SearchConfig(BaseModel):
|
|
148
|
+
limit: int = 5
|
|
149
|
+
context_radius: int = 0
|
|
150
|
+
max_context_items: int = 10
|
|
151
|
+
max_context_chars: int = 10000
|
|
152
|
+
vector_index_metric: Literal["cosine", "l2", "dot"] = "cosine"
|
|
153
|
+
vector_refine_factor: int = 30
|
|
57
154
|
|
|
58
155
|
|
|
59
156
|
class OllamaConfig(BaseModel):
|
|
@@ -64,16 +161,14 @@ class OllamaConfig(BaseModel):
|
|
|
64
161
|
)
|
|
65
162
|
|
|
66
163
|
|
|
67
|
-
class VLLMConfig(BaseModel):
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
qa_base_url: str = ""
|
|
71
|
-
research_base_url: str = ""
|
|
164
|
+
class DoclingServeConfig(BaseModel):
|
|
165
|
+
base_url: str = "http://localhost:5001"
|
|
166
|
+
api_key: str = ""
|
|
72
167
|
|
|
73
168
|
|
|
74
169
|
class ProvidersConfig(BaseModel):
|
|
75
170
|
ollama: OllamaConfig = Field(default_factory=OllamaConfig)
|
|
76
|
-
|
|
171
|
+
docling_serve: DoclingServeConfig = Field(default_factory=DoclingServeConfig)
|
|
77
172
|
|
|
78
173
|
|
|
79
174
|
class AGUIConfig(BaseModel):
|
|
@@ -85,6 +180,18 @@ class AGUIConfig(BaseModel):
|
|
|
85
180
|
cors_headers: list[str] = ["*"]
|
|
86
181
|
|
|
87
182
|
|
|
183
|
+
class PromptsConfig(BaseModel):
|
|
184
|
+
domain_preamble: str = ""
|
|
185
|
+
qa: str | None = None
|
|
186
|
+
synthesis: str | None = None
|
|
187
|
+
picture_description: str = (
|
|
188
|
+
"Describe this image for a blind user. "
|
|
189
|
+
"State the image type (screenshot, chart, photo, etc.), "
|
|
190
|
+
"what it depicts, any visible text, and key visual details. "
|
|
191
|
+
"Be concise and accurate."
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
|
|
88
195
|
class AppConfig(BaseModel):
|
|
89
196
|
environment: str = "production"
|
|
90
197
|
storage: StorageConfig = Field(default_factory=StorageConfig)
|
|
@@ -95,5 +202,7 @@ class AppConfig(BaseModel):
|
|
|
95
202
|
qa: QAConfig = Field(default_factory=QAConfig)
|
|
96
203
|
research: ResearchConfig = Field(default_factory=ResearchConfig)
|
|
97
204
|
processing: ProcessingConfig = Field(default_factory=ProcessingConfig)
|
|
205
|
+
search: SearchConfig = Field(default_factory=SearchConfig)
|
|
98
206
|
providers: ProvidersConfig = Field(default_factory=ProvidersConfig)
|
|
99
207
|
agui: AGUIConfig = Field(default_factory=AGUIConfig)
|
|
208
|
+
prompts: PromptsConfig = Field(default_factory=PromptsConfig)
|
haiku/rag/converters/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Document converter abstraction for haiku.rag."""
|
|
2
|
+
|
|
3
|
+
from haiku.rag.config import AppConfig, Config
|
|
4
|
+
from haiku.rag.converters.base import DocumentConverter
|
|
5
|
+
|
|
6
|
+
__all__ = ["DocumentConverter", "get_converter"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_converter(config: AppConfig = Config) -> DocumentConverter:
|
|
10
|
+
"""Get a document converter instance based on configuration.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
config: Configuration to use. Defaults to global Config.
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
DocumentConverter instance configured according to the config.
|
|
17
|
+
|
|
18
|
+
Raises:
|
|
19
|
+
ValueError: If the converter provider is not recognized.
|
|
20
|
+
"""
|
|
21
|
+
if config.processing.converter == "docling-local":
|
|
22
|
+
from haiku.rag.converters.docling_local import DoclingLocalConverter
|
|
23
|
+
|
|
24
|
+
return DoclingLocalConverter(config)
|
|
25
|
+
|
|
26
|
+
if config.processing.converter == "docling-serve":
|
|
27
|
+
from haiku.rag.converters.docling_serve import DoclingServeConverter
|
|
28
|
+
|
|
29
|
+
return DoclingServeConverter(config)
|
|
30
|
+
|
|
31
|
+
raise ValueError(f"Unsupported converter provider: {config.processing.converter}")
|
haiku/rag/converters/base.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Base class for document converters."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DocumentConverter(ABC):
|
|
12
|
+
"""Abstract base class for document converters.
|
|
13
|
+
|
|
14
|
+
Document converters are responsible for converting various document formats
|
|
15
|
+
(PDF, DOCX, HTML, etc.) into DoclingDocument format for further processing.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
@property
|
|
19
|
+
@abstractmethod
|
|
20
|
+
def supported_extensions(self) -> list[str]:
|
|
21
|
+
"""Return list of file extensions supported by this converter.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
List of file extensions (including the dot, e.g., [".pdf", ".docx"]).
|
|
25
|
+
"""
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
@abstractmethod
|
|
29
|
+
async def convert_file(self, path: Path) -> "DoclingDocument":
|
|
30
|
+
"""Convert a file to DoclingDocument format.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
path: Path to the file to convert.
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
DoclingDocument representation of the file.
|
|
37
|
+
|
|
38
|
+
Raises:
|
|
39
|
+
ValueError: If the file cannot be converted.
|
|
40
|
+
"""
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
SUPPORTED_FORMATS = ("md", "html", "plain")
|
|
44
|
+
|
|
45
|
+
@abstractmethod
|
|
46
|
+
async def convert_text(
|
|
47
|
+
self, text: str, name: str = "content.md", format: str = "md"
|
|
48
|
+
) -> "DoclingDocument":
|
|
49
|
+
"""Convert text content to DoclingDocument format.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
text: The text content to convert.
|
|
53
|
+
name: The name to use for the document (defaults to "content.md").
|
|
54
|
+
format: The format of the text content ("md", "html", or "plain").
|
|
55
|
+
Defaults to "md". Use "plain" for plain text without parsing.
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
DoclingDocument representation of the text.
|
|
59
|
+
|
|
60
|
+
Raises:
|
|
61
|
+
ValueError: If the text cannot be converted or format is unsupported.
|
|
62
|
+
"""
|
|
63
|
+
pass
|
haiku/rag/converters/docling_local.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Local docling converter implementation."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, ClassVar, cast
|
|
6
|
+
|
|
7
|
+
from haiku.rag.config import AppConfig
|
|
8
|
+
from haiku.rag.converters.base import DocumentConverter
|
|
9
|
+
from haiku.rag.converters.text_utils import TextFileHandler
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
13
|
+
|
|
14
|
+
from haiku.rag.config.models import ModelConfig
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DoclingLocalConverter(DocumentConverter):
|
|
18
|
+
"""Converter that uses local docling for document conversion.
|
|
19
|
+
|
|
20
|
+
This converter runs docling locally in-process to convert documents.
|
|
21
|
+
It handles various document formats including PDF, DOCX, HTML, and plain text.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
# Extensions supported by docling
|
|
25
|
+
docling_extensions: ClassVar[list[str]] = [
|
|
26
|
+
".adoc",
|
|
27
|
+
".asc",
|
|
28
|
+
".asciidoc",
|
|
29
|
+
".bmp",
|
|
30
|
+
".csv",
|
|
31
|
+
".docx",
|
|
32
|
+
".html",
|
|
33
|
+
".xhtml",
|
|
34
|
+
".jpeg",
|
|
35
|
+
".jpg",
|
|
36
|
+
".md",
|
|
37
|
+
".pdf",
|
|
38
|
+
".png",
|
|
39
|
+
".pptx",
|
|
40
|
+
".tiff",
|
|
41
|
+
".xlsx",
|
|
42
|
+
".xml",
|
|
43
|
+
".webp",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
def __init__(self, config: AppConfig):
|
|
47
|
+
"""Initialize the converter with configuration.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
config: Application configuration containing conversion options.
|
|
51
|
+
"""
|
|
52
|
+
self.config = config
|
|
53
|
+
|
|
54
|
+
@property
|
|
55
|
+
def supported_extensions(self) -> list[str]:
|
|
56
|
+
"""Return list of file extensions supported by this converter."""
|
|
57
|
+
return self.docling_extensions + TextFileHandler.text_extensions
|
|
58
|
+
|
|
59
|
+
def _get_vlm_api_url(self, model: "ModelConfig") -> str:
|
|
60
|
+
"""Construct VLM API URL from model config."""
|
|
61
|
+
if model.base_url:
|
|
62
|
+
base = model.base_url.rstrip("/")
|
|
63
|
+
return f"{base}/v1/chat/completions"
|
|
64
|
+
|
|
65
|
+
if model.provider == "ollama":
|
|
66
|
+
base = self.config.providers.ollama.base_url.rstrip("/")
|
|
67
|
+
return f"{base}/v1/chat/completions"
|
|
68
|
+
|
|
69
|
+
if model.provider == "openai":
|
|
70
|
+
return "https://api.openai.com/v1/chat/completions"
|
|
71
|
+
|
|
72
|
+
raise ValueError(f"Unsupported VLM provider: {model.provider}")
|
|
73
|
+
|
|
74
|
+
def _sync_convert_docling_file(self, path: Path) -> "DoclingDocument":
|
|
75
|
+
"""Synchronous conversion of docling-supported files."""
|
|
76
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
77
|
+
from docling.datamodel.base_models import InputFormat
|
|
78
|
+
from docling.datamodel.pipeline_options import (
|
|
79
|
+
OcrAutoOptions,
|
|
80
|
+
PdfPipelineOptions,
|
|
81
|
+
PictureDescriptionApiOptions,
|
|
82
|
+
TableFormerMode,
|
|
83
|
+
TableStructureOptions,
|
|
84
|
+
)
|
|
85
|
+
from docling.document_converter import (
|
|
86
|
+
DocumentConverter as DoclingDocConverter,
|
|
87
|
+
)
|
|
88
|
+
from docling.document_converter import (
|
|
89
|
+
FormatOption,
|
|
90
|
+
PdfFormatOption,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
opts = self.config.processing.conversion_options
|
|
94
|
+
pic_desc = opts.picture_description
|
|
95
|
+
|
|
96
|
+
pipeline_options = PdfPipelineOptions(
|
|
97
|
+
do_ocr=opts.do_ocr,
|
|
98
|
+
do_table_structure=opts.do_table_structure,
|
|
99
|
+
images_scale=opts.images_scale,
|
|
100
|
+
generate_page_images=True,
|
|
101
|
+
generate_picture_images=opts.generate_picture_images or pic_desc.enabled,
|
|
102
|
+
table_structure_options=TableStructureOptions(
|
|
103
|
+
do_cell_matching=opts.table_cell_matching,
|
|
104
|
+
mode=(
|
|
105
|
+
TableFormerMode.FAST
|
|
106
|
+
if opts.table_mode == "fast"
|
|
107
|
+
else TableFormerMode.ACCURATE
|
|
108
|
+
),
|
|
109
|
+
),
|
|
110
|
+
ocr_options=OcrAutoOptions(
|
|
111
|
+
force_full_page_ocr=opts.force_ocr,
|
|
112
|
+
lang=opts.ocr_lang if opts.ocr_lang else [],
|
|
113
|
+
),
|
|
114
|
+
do_picture_description=pic_desc.enabled,
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
if pic_desc.enabled:
|
|
118
|
+
from pydantic import AnyUrl
|
|
119
|
+
|
|
120
|
+
prompt = self.config.prompts.picture_description
|
|
121
|
+
|
|
122
|
+
pipeline_options.enable_remote_services = True
|
|
123
|
+
pipeline_options.picture_description_options = PictureDescriptionApiOptions(
|
|
124
|
+
url=AnyUrl(self._get_vlm_api_url(pic_desc.model)),
|
|
125
|
+
params=dict(
|
|
126
|
+
model=pic_desc.model.name,
|
|
127
|
+
max_completion_tokens=pic_desc.max_tokens,
|
|
128
|
+
),
|
|
129
|
+
prompt=prompt,
|
|
130
|
+
timeout=pic_desc.timeout,
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
format_options = cast(
|
|
134
|
+
dict[InputFormat, FormatOption],
|
|
135
|
+
{
|
|
136
|
+
InputFormat.PDF: PdfFormatOption(
|
|
137
|
+
pipeline_options=pipeline_options,
|
|
138
|
+
backend=DoclingParseDocumentBackend,
|
|
139
|
+
)
|
|
140
|
+
},
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
converter = DoclingDocConverter(format_options=format_options)
|
|
144
|
+
result = converter.convert(path)
|
|
145
|
+
return result.document
|
|
146
|
+
|
|
147
|
+
async def convert_file(self, path: Path) -> "DoclingDocument":
|
|
148
|
+
"""Convert a file to DoclingDocument using local docling.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
path: Path to the file to convert.
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
DoclingDocument representation of the file.
|
|
155
|
+
|
|
156
|
+
Raises:
|
|
157
|
+
ValueError: If the file cannot be converted.
|
|
158
|
+
"""
|
|
159
|
+
try:
|
|
160
|
+
file_extension = path.suffix.lower()
|
|
161
|
+
|
|
162
|
+
if file_extension in self.docling_extensions:
|
|
163
|
+
return await asyncio.to_thread(self._sync_convert_docling_file, path)
|
|
164
|
+
elif file_extension in TextFileHandler.text_extensions:
|
|
165
|
+
content = await asyncio.to_thread(path.read_text, encoding="utf-8")
|
|
166
|
+
prepared_content = TextFileHandler.prepare_text_content(
|
|
167
|
+
content, file_extension
|
|
168
|
+
)
|
|
169
|
+
return await self.convert_text(prepared_content, name=f"{path.stem}.md")
|
|
170
|
+
else:
|
|
171
|
+
content = await asyncio.to_thread(path.read_text, encoding="utf-8")
|
|
172
|
+
return await self.convert_text(content, name=f"{path.stem}.md")
|
|
173
|
+
except Exception:
|
|
174
|
+
raise ValueError(f"Failed to parse file: {path}")
|
|
175
|
+
|
|
176
|
+
async def convert_text(
|
|
177
|
+
self, text: str, name: str = "content.md", format: str = "md"
|
|
178
|
+
) -> "DoclingDocument":
|
|
179
|
+
"""Convert text content to DoclingDocument using local docling.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
text: The text content to convert.
|
|
183
|
+
name: The name to use for the document (defaults to "content.md").
|
|
184
|
+
format: The format of the text content ("md", "html", or "plain").
|
|
185
|
+
Defaults to "md". Use "plain" for plain text without parsing.
|
|
186
|
+
|
|
187
|
+
Returns:
|
|
188
|
+
DoclingDocument representation of the text.
|
|
189
|
+
|
|
190
|
+
Raises:
|
|
191
|
+
ValueError: If the text cannot be converted or format is unsupported.
|
|
192
|
+
"""
|
|
193
|
+
return await TextFileHandler.text_to_docling_document(text, name, format)
|