haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag-slim might be problematic. Click here for more details.

Files changed (94)
  1. haiku/rag/app.py +430 -72
  2. haiku/rag/chunkers/__init__.py +31 -0
  3. haiku/rag/chunkers/base.py +31 -0
  4. haiku/rag/chunkers/docling_local.py +164 -0
  5. haiku/rag/chunkers/docling_serve.py +179 -0
  6. haiku/rag/cli.py +207 -24
  7. haiku/rag/cli_chat.py +489 -0
  8. haiku/rag/client.py +1251 -266
  9. haiku/rag/config/__init__.py +16 -10
  10. haiku/rag/config/loader.py +5 -44
  11. haiku/rag/config/models.py +126 -17
  12. haiku/rag/converters/__init__.py +31 -0
  13. haiku/rag/converters/base.py +63 -0
  14. haiku/rag/converters/docling_local.py +193 -0
  15. haiku/rag/converters/docling_serve.py +229 -0
  16. haiku/rag/converters/text_utils.py +237 -0
  17. haiku/rag/embeddings/__init__.py +123 -24
  18. haiku/rag/embeddings/voyageai.py +175 -20
  19. haiku/rag/graph/__init__.py +0 -11
  20. haiku/rag/graph/agui/__init__.py +8 -2
  21. haiku/rag/graph/agui/cli_renderer.py +1 -1
  22. haiku/rag/graph/agui/emitter.py +219 -31
  23. haiku/rag/graph/agui/server.py +20 -62
  24. haiku/rag/graph/agui/stream.py +1 -2
  25. haiku/rag/graph/research/__init__.py +5 -2
  26. haiku/rag/graph/research/dependencies.py +12 -126
  27. haiku/rag/graph/research/graph.py +390 -135
  28. haiku/rag/graph/research/models.py +91 -112
  29. haiku/rag/graph/research/prompts.py +99 -91
  30. haiku/rag/graph/research/state.py +35 -27
  31. haiku/rag/inspector/__init__.py +8 -0
  32. haiku/rag/inspector/app.py +259 -0
  33. haiku/rag/inspector/widgets/__init__.py +6 -0
  34. haiku/rag/inspector/widgets/chunk_list.py +100 -0
  35. haiku/rag/inspector/widgets/context_modal.py +89 -0
  36. haiku/rag/inspector/widgets/detail_view.py +130 -0
  37. haiku/rag/inspector/widgets/document_list.py +75 -0
  38. haiku/rag/inspector/widgets/info_modal.py +209 -0
  39. haiku/rag/inspector/widgets/search_modal.py +183 -0
  40. haiku/rag/inspector/widgets/visual_modal.py +126 -0
  41. haiku/rag/mcp.py +106 -102
  42. haiku/rag/monitor.py +33 -9
  43. haiku/rag/providers/__init__.py +5 -0
  44. haiku/rag/providers/docling_serve.py +108 -0
  45. haiku/rag/qa/__init__.py +12 -10
  46. haiku/rag/qa/agent.py +43 -61
  47. haiku/rag/qa/prompts.py +35 -57
  48. haiku/rag/reranking/__init__.py +9 -6
  49. haiku/rag/reranking/base.py +1 -1
  50. haiku/rag/reranking/cohere.py +5 -4
  51. haiku/rag/reranking/mxbai.py +5 -2
  52. haiku/rag/reranking/vllm.py +3 -4
  53. haiku/rag/reranking/zeroentropy.py +6 -5
  54. haiku/rag/store/__init__.py +2 -1
  55. haiku/rag/store/engine.py +242 -42
  56. haiku/rag/store/exceptions.py +4 -0
  57. haiku/rag/store/models/__init__.py +8 -2
  58. haiku/rag/store/models/chunk.py +190 -0
  59. haiku/rag/store/models/document.py +46 -0
  60. haiku/rag/store/repositories/chunk.py +141 -121
  61. haiku/rag/store/repositories/document.py +25 -84
  62. haiku/rag/store/repositories/settings.py +11 -14
  63. haiku/rag/store/upgrades/__init__.py +19 -3
  64. haiku/rag/store/upgrades/v0_10_1.py +1 -1
  65. haiku/rag/store/upgrades/v0_19_6.py +65 -0
  66. haiku/rag/store/upgrades/v0_20_0.py +68 -0
  67. haiku/rag/store/upgrades/v0_23_1.py +100 -0
  68. haiku/rag/store/upgrades/v0_9_3.py +3 -3
  69. haiku/rag/utils.py +371 -146
  70. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
  71. haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
  72. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
  73. haiku/rag/chunker.py +0 -65
  74. haiku/rag/embeddings/base.py +0 -25
  75. haiku/rag/embeddings/ollama.py +0 -28
  76. haiku/rag/embeddings/openai.py +0 -26
  77. haiku/rag/embeddings/vllm.py +0 -29
  78. haiku/rag/graph/agui/events.py +0 -254
  79. haiku/rag/graph/common/__init__.py +0 -5
  80. haiku/rag/graph/common/models.py +0 -42
  81. haiku/rag/graph/common/nodes.py +0 -265
  82. haiku/rag/graph/common/prompts.py +0 -46
  83. haiku/rag/graph/common/utils.py +0 -44
  84. haiku/rag/graph/deep_qa/__init__.py +0 -1
  85. haiku/rag/graph/deep_qa/dependencies.py +0 -27
  86. haiku/rag/graph/deep_qa/graph.py +0 -243
  87. haiku/rag/graph/deep_qa/models.py +0 -20
  88. haiku/rag/graph/deep_qa/prompts.py +0 -59
  89. haiku/rag/graph/deep_qa/state.py +0 -56
  90. haiku/rag/graph/research/common.py +0 -87
  91. haiku/rag/reader.py +0 -135
  92. haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
  93. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
  94. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
@@ -8,38 +8,44 @@ from haiku.rag.config.loader import (
8
8
  from haiku.rag.config.models import (
9
9
  AGUIConfig,
10
10
  AppConfig,
11
+ ConversionOptions,
12
+ EmbeddingModelConfig,
11
13
  EmbeddingsConfig,
12
14
  LanceDBConfig,
15
+ ModelConfig,
13
16
  MonitorConfig,
14
17
  OllamaConfig,
15
18
  ProcessingConfig,
19
+ PromptsConfig,
16
20
  ProvidersConfig,
17
21
  QAConfig,
18
22
  RerankingConfig,
19
23
  ResearchConfig,
20
24
  StorageConfig,
21
- VLLMConfig,
22
25
  )
23
26
 
24
27
  __all__ = [
25
28
  "Config",
26
29
  "AGUIConfig",
27
30
  "AppConfig",
28
- "StorageConfig",
29
- "MonitorConfig",
30
- "LanceDBConfig",
31
+ "ConversionOptions",
32
+ "EmbeddingModelConfig",
31
33
  "EmbeddingsConfig",
32
- "RerankingConfig",
33
- "QAConfig",
34
- "ResearchConfig",
35
- "ProcessingConfig",
34
+ "LanceDBConfig",
35
+ "ModelConfig",
36
+ "MonitorConfig",
36
37
  "OllamaConfig",
37
- "VLLMConfig",
38
+ "ProcessingConfig",
39
+ "PromptsConfig",
38
40
  "ProvidersConfig",
41
+ "QAConfig",
42
+ "RerankingConfig",
43
+ "ResearchConfig",
44
+ "StorageConfig",
39
45
  "find_config_file",
40
- "load_yaml_config",
41
46
  "generate_default_config",
42
47
  "get_config",
48
+ "load_yaml_config",
43
49
  "set_config",
44
50
  ]
45
51
 
@@ -48,47 +48,8 @@ def load_yaml_config(path: Path) -> dict:
48
48
 
49
49
 
50
50
  def generate_default_config() -> dict:
51
- """Generate a default YAML config structure with documentation."""
52
- return {
53
- "environment": "production",
54
- "storage": {
55
- "data_dir": "",
56
- "vacuum_retention_seconds": 86400,
57
- },
58
- "monitor": {
59
- "directories": [],
60
- "ignore_patterns": [],
61
- "include_patterns": [],
62
- },
63
- "lancedb": {"uri": "", "api_key": "", "region": ""},
64
- "embeddings": {
65
- "provider": "ollama",
66
- "model": "qwen3-embedding",
67
- "vector_dim": 4096,
68
- },
69
- "reranking": {"provider": "", "model": ""},
70
- "qa": {"provider": "ollama", "model": "gpt-oss"},
71
- "research": {"provider": "", "model": ""},
72
- "processing": {
73
- "chunk_size": 256,
74
- "context_chunk_radius": 0,
75
- "markdown_preprocessor": "",
76
- },
77
- "providers": {
78
- "ollama": {"base_url": "http://localhost:11434"},
79
- "vllm": {
80
- "embeddings_base_url": "",
81
- "rerank_base_url": "",
82
- "qa_base_url": "",
83
- "research_base_url": "",
84
- },
85
- },
86
- "agui": {
87
- "host": "0.0.0.0",
88
- "port": 8000,
89
- "cors_origins": ["*"],
90
- "cors_credentials": True,
91
- "cors_methods": ["GET", "POST", "OPTIONS"],
92
- "cors_headers": ["*"],
93
- },
94
- }
51
+ """Generate a default YAML config structure from AppConfig defaults."""
52
+ from haiku.rag.config.models import AppConfig
53
+
54
+ default_config = AppConfig()
55
+ return default_config.model_dump(mode="json", exclude_none=False)
@@ -1,12 +1,51 @@
1
1
  from pathlib import Path
2
+ from typing import Literal
2
3
 
3
4
  from pydantic import BaseModel, Field
4
5
 
5
6
  from haiku.rag.utils import get_default_data_dir
6
7
 
7
8
 
9
+ class ModelConfig(BaseModel):
10
+ """Configuration for a language model.
11
+
12
+ Attributes:
13
+ provider: Model provider (ollama, openai, anthropic, etc.)
14
+ name: Model name/identifier
15
+ base_url: Optional base URL for OpenAI-compatible servers (vLLM, LM Studio, etc.)
16
+ enable_thinking: Control reasoning behavior (true/false/None for default)
17
+ temperature: Sampling temperature (0.0 to 1.0+)
18
+ max_tokens: Maximum tokens to generate
19
+ """
20
+
21
+ provider: str = "ollama"
22
+ name: str = "gpt-oss"
23
+ base_url: str | None = None
24
+
25
+ enable_thinking: bool | None = None
26
+ temperature: float | None = None
27
+ max_tokens: int | None = None
28
+
29
+
30
+ class EmbeddingModelConfig(BaseModel):
31
+ """Configuration for an embedding model.
32
+
33
+ Attributes:
34
+ provider: Model provider (ollama, openai, voyageai, cohere, sentence-transformers)
35
+ name: Model name/identifier
36
+ vector_dim: Vector dimensions produced by the model
37
+ base_url: Optional base URL for OpenAI-compatible servers (vLLM, LM Studio, etc.)
38
+ """
39
+
40
+ provider: str = "ollama"
41
+ name: str = "qwen3-embedding:4b"
42
+ vector_dim: int = 2560
43
+ base_url: str | None = None
44
+
45
+
8
46
  class StorageConfig(BaseModel):
9
47
  data_dir: Path = Field(default_factory=get_default_data_dir)
48
+ auto_vacuum: bool = True
10
49
  vacuum_retention_seconds: int = 86400
11
50
 
12
51
 
@@ -24,36 +63,94 @@ class LanceDBConfig(BaseModel):
24
63
 
25
64
 
26
65
  class EmbeddingsConfig(BaseModel):
27
- provider: str = "ollama"
28
- model: str = "qwen3-embedding"
29
- vector_dim: int = 4096
66
+ model: EmbeddingModelConfig = Field(default_factory=EmbeddingModelConfig)
30
67
 
31
68
 
32
69
  class RerankingConfig(BaseModel):
33
- provider: str = ""
34
- model: str = ""
70
+ model: ModelConfig | None = None
35
71
 
36
72
 
37
73
  class QAConfig(BaseModel):
38
- provider: str = "ollama"
39
- model: str = "gpt-oss"
74
+ model: ModelConfig = Field(
75
+ default_factory=lambda: ModelConfig(
76
+ provider="ollama",
77
+ name="gpt-oss",
78
+ enable_thinking=False,
79
+ )
80
+ )
40
81
  max_sub_questions: int = 3
41
82
  max_iterations: int = 2
42
83
  max_concurrency: int = 1
43
84
 
44
85
 
45
86
  class ResearchConfig(BaseModel):
46
- provider: str = "ollama"
47
- model: str = "gpt-oss"
87
+ model: ModelConfig = Field(
88
+ default_factory=lambda: ModelConfig(
89
+ provider="ollama",
90
+ name="gpt-oss",
91
+ enable_thinking=False,
92
+ )
93
+ )
48
94
  max_iterations: int = 3
49
95
  confidence_threshold: float = 0.8
50
96
  max_concurrency: int = 1
51
97
 
52
98
 
99
+ class PictureDescriptionConfig(BaseModel):
100
+ """Configuration for VLM-based picture description."""
101
+
102
+ enabled: bool = False
103
+ model: ModelConfig = Field(
104
+ default_factory=lambda: ModelConfig(
105
+ provider="ollama",
106
+ name="ministral-3",
107
+ )
108
+ )
109
+ timeout: int = 90
110
+ max_tokens: int = 200
111
+
112
+
113
+ class ConversionOptions(BaseModel):
114
+ """Options for document conversion."""
115
+
116
+ # OCR options
117
+ do_ocr: bool = True
118
+ force_ocr: bool = False
119
+ ocr_lang: list[str] = []
120
+
121
+ # Table options
122
+ do_table_structure: bool = True
123
+ table_mode: Literal["fast", "accurate"] = "accurate"
124
+ table_cell_matching: bool = True
125
+
126
+ # Image options
127
+ images_scale: float = 2.0
128
+ generate_picture_images: bool = False
129
+
130
+ # VLM picture description
131
+ picture_description: PictureDescriptionConfig = Field(
132
+ default_factory=PictureDescriptionConfig
133
+ )
134
+
135
+
53
136
  class ProcessingConfig(BaseModel):
54
137
  chunk_size: int = 256
55
- context_chunk_radius: int = 0
56
- markdown_preprocessor: str = ""
138
+ converter: str = "docling-local"
139
+ chunker: str = "docling-local"
140
+ chunker_type: str = "hybrid"
141
+ chunking_tokenizer: str = "Qwen/Qwen3-Embedding-0.6B"
142
+ chunking_merge_peers: bool = True
143
+ chunking_use_markdown_tables: bool = False
144
+ conversion_options: ConversionOptions = Field(default_factory=ConversionOptions)
145
+
146
+
147
+ class SearchConfig(BaseModel):
148
+ limit: int = 5
149
+ context_radius: int = 0
150
+ max_context_items: int = 10
151
+ max_context_chars: int = 10000
152
+ vector_index_metric: Literal["cosine", "l2", "dot"] = "cosine"
153
+ vector_refine_factor: int = 30
57
154
 
58
155
 
59
156
  class OllamaConfig(BaseModel):
@@ -64,16 +161,14 @@ class OllamaConfig(BaseModel):
64
161
  )
65
162
 
66
163
 
67
- class VLLMConfig(BaseModel):
68
- embeddings_base_url: str = ""
69
- rerank_base_url: str = ""
70
- qa_base_url: str = ""
71
- research_base_url: str = ""
164
+ class DoclingServeConfig(BaseModel):
165
+ base_url: str = "http://localhost:5001"
166
+ api_key: str = ""
72
167
 
73
168
 
74
169
  class ProvidersConfig(BaseModel):
75
170
  ollama: OllamaConfig = Field(default_factory=OllamaConfig)
76
- vllm: VLLMConfig = Field(default_factory=VLLMConfig)
171
+ docling_serve: DoclingServeConfig = Field(default_factory=DoclingServeConfig)
77
172
 
78
173
 
79
174
  class AGUIConfig(BaseModel):
@@ -85,6 +180,18 @@ class AGUIConfig(BaseModel):
85
180
  cors_headers: list[str] = ["*"]
86
181
 
87
182
 
183
+ class PromptsConfig(BaseModel):
184
+ domain_preamble: str = ""
185
+ qa: str | None = None
186
+ synthesis: str | None = None
187
+ picture_description: str = (
188
+ "Describe this image for a blind user. "
189
+ "State the image type (screenshot, chart, photo, etc.), "
190
+ "what it depicts, any visible text, and key visual details. "
191
+ "Be concise and accurate."
192
+ )
193
+
194
+
88
195
  class AppConfig(BaseModel):
89
196
  environment: str = "production"
90
197
  storage: StorageConfig = Field(default_factory=StorageConfig)
@@ -95,5 +202,7 @@ class AppConfig(BaseModel):
95
202
  qa: QAConfig = Field(default_factory=QAConfig)
96
203
  research: ResearchConfig = Field(default_factory=ResearchConfig)
97
204
  processing: ProcessingConfig = Field(default_factory=ProcessingConfig)
205
+ search: SearchConfig = Field(default_factory=SearchConfig)
98
206
  providers: ProvidersConfig = Field(default_factory=ProvidersConfig)
99
207
  agui: AGUIConfig = Field(default_factory=AGUIConfig)
208
+ prompts: PromptsConfig = Field(default_factory=PromptsConfig)
@@ -0,0 +1,31 @@
1
+ """Document converter abstraction for haiku.rag."""
2
+
3
+ from haiku.rag.config import AppConfig, Config
4
+ from haiku.rag.converters.base import DocumentConverter
5
+
6
+ __all__ = ["DocumentConverter", "get_converter"]
7
+
8
+
9
+ def get_converter(config: AppConfig = Config) -> DocumentConverter:
10
+ """Get a document converter instance based on configuration.
11
+
12
+ Args:
13
+ config: Configuration to use. Defaults to global Config.
14
+
15
+ Returns:
16
+ DocumentConverter instance configured according to the config.
17
+
18
+ Raises:
19
+ ValueError: If the converter provider is not recognized.
20
+ """
21
+ if config.processing.converter == "docling-local":
22
+ from haiku.rag.converters.docling_local import DoclingLocalConverter
23
+
24
+ return DoclingLocalConverter(config)
25
+
26
+ if config.processing.converter == "docling-serve":
27
+ from haiku.rag.converters.docling_serve import DoclingServeConverter
28
+
29
+ return DoclingServeConverter(config)
30
+
31
+ raise ValueError(f"Unsupported converter provider: {config.processing.converter}")
@@ -0,0 +1,63 @@
1
+ """Base class for document converters."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from docling_core.types.doc.document import DoclingDocument
9
+
10
+
11
+ class DocumentConverter(ABC):
12
+ """Abstract base class for document converters.
13
+
14
+ Document converters are responsible for converting various document formats
15
+ (PDF, DOCX, HTML, etc.) into DoclingDocument format for further processing.
16
+ """
17
+
18
+ @property
19
+ @abstractmethod
20
+ def supported_extensions(self) -> list[str]:
21
+ """Return list of file extensions supported by this converter.
22
+
23
+ Returns:
24
+ List of file extensions (including the dot, e.g., [".pdf", ".docx"]).
25
+ """
26
+ pass
27
+
28
+ @abstractmethod
29
+ async def convert_file(self, path: Path) -> "DoclingDocument":
30
+ """Convert a file to DoclingDocument format.
31
+
32
+ Args:
33
+ path: Path to the file to convert.
34
+
35
+ Returns:
36
+ DoclingDocument representation of the file.
37
+
38
+ Raises:
39
+ ValueError: If the file cannot be converted.
40
+ """
41
+ pass
42
+
43
+ SUPPORTED_FORMATS = ("md", "html", "plain")
44
+
45
+ @abstractmethod
46
+ async def convert_text(
47
+ self, text: str, name: str = "content.md", format: str = "md"
48
+ ) -> "DoclingDocument":
49
+ """Convert text content to DoclingDocument format.
50
+
51
+ Args:
52
+ text: The text content to convert.
53
+ name: The name to use for the document (defaults to "content.md").
54
+ format: The format of the text content ("md", "html", or "plain").
55
+ Defaults to "md". Use "plain" for plain text without parsing.
56
+
57
+ Returns:
58
+ DoclingDocument representation of the text.
59
+
60
+ Raises:
61
+ ValueError: If the text cannot be converted or format is unsupported.
62
+ """
63
+ pass
@@ -0,0 +1,193 @@
1
+ """Local docling converter implementation."""
2
+
3
+ import asyncio
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, ClassVar, cast
6
+
7
+ from haiku.rag.config import AppConfig
8
+ from haiku.rag.converters.base import DocumentConverter
9
+ from haiku.rag.converters.text_utils import TextFileHandler
10
+
11
+ if TYPE_CHECKING:
12
+ from docling_core.types.doc.document import DoclingDocument
13
+
14
+ from haiku.rag.config.models import ModelConfig
15
+
16
+
17
+ class DoclingLocalConverter(DocumentConverter):
18
+ """Converter that uses local docling for document conversion.
19
+
20
+ This converter runs docling locally in-process to convert documents.
21
+ It handles various document formats including PDF, DOCX, HTML, and plain text.
22
+ """
23
+
24
+ # Extensions supported by docling
25
+ docling_extensions: ClassVar[list[str]] = [
26
+ ".adoc",
27
+ ".asc",
28
+ ".asciidoc",
29
+ ".bmp",
30
+ ".csv",
31
+ ".docx",
32
+ ".html",
33
+ ".xhtml",
34
+ ".jpeg",
35
+ ".jpg",
36
+ ".md",
37
+ ".pdf",
38
+ ".png",
39
+ ".pptx",
40
+ ".tiff",
41
+ ".xlsx",
42
+ ".xml",
43
+ ".webp",
44
+ ]
45
+
46
+ def __init__(self, config: AppConfig):
47
+ """Initialize the converter with configuration.
48
+
49
+ Args:
50
+ config: Application configuration containing conversion options.
51
+ """
52
+ self.config = config
53
+
54
+ @property
55
+ def supported_extensions(self) -> list[str]:
56
+ """Return list of file extensions supported by this converter."""
57
+ return self.docling_extensions + TextFileHandler.text_extensions
58
+
59
+ def _get_vlm_api_url(self, model: "ModelConfig") -> str:
60
+ """Construct VLM API URL from model config."""
61
+ if model.base_url:
62
+ base = model.base_url.rstrip("/")
63
+ return f"{base}/v1/chat/completions"
64
+
65
+ if model.provider == "ollama":
66
+ base = self.config.providers.ollama.base_url.rstrip("/")
67
+ return f"{base}/v1/chat/completions"
68
+
69
+ if model.provider == "openai":
70
+ return "https://api.openai.com/v1/chat/completions"
71
+
72
+ raise ValueError(f"Unsupported VLM provider: {model.provider}")
73
+
74
+ def _sync_convert_docling_file(self, path: Path) -> "DoclingDocument":
75
+ """Synchronous conversion of docling-supported files."""
76
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
77
+ from docling.datamodel.base_models import InputFormat
78
+ from docling.datamodel.pipeline_options import (
79
+ OcrAutoOptions,
80
+ PdfPipelineOptions,
81
+ PictureDescriptionApiOptions,
82
+ TableFormerMode,
83
+ TableStructureOptions,
84
+ )
85
+ from docling.document_converter import (
86
+ DocumentConverter as DoclingDocConverter,
87
+ )
88
+ from docling.document_converter import (
89
+ FormatOption,
90
+ PdfFormatOption,
91
+ )
92
+
93
+ opts = self.config.processing.conversion_options
94
+ pic_desc = opts.picture_description
95
+
96
+ pipeline_options = PdfPipelineOptions(
97
+ do_ocr=opts.do_ocr,
98
+ do_table_structure=opts.do_table_structure,
99
+ images_scale=opts.images_scale,
100
+ generate_page_images=True,
101
+ generate_picture_images=opts.generate_picture_images or pic_desc.enabled,
102
+ table_structure_options=TableStructureOptions(
103
+ do_cell_matching=opts.table_cell_matching,
104
+ mode=(
105
+ TableFormerMode.FAST
106
+ if opts.table_mode == "fast"
107
+ else TableFormerMode.ACCURATE
108
+ ),
109
+ ),
110
+ ocr_options=OcrAutoOptions(
111
+ force_full_page_ocr=opts.force_ocr,
112
+ lang=opts.ocr_lang if opts.ocr_lang else [],
113
+ ),
114
+ do_picture_description=pic_desc.enabled,
115
+ )
116
+
117
+ if pic_desc.enabled:
118
+ from pydantic import AnyUrl
119
+
120
+ prompt = self.config.prompts.picture_description
121
+
122
+ pipeline_options.enable_remote_services = True
123
+ pipeline_options.picture_description_options = PictureDescriptionApiOptions(
124
+ url=AnyUrl(self._get_vlm_api_url(pic_desc.model)),
125
+ params=dict(
126
+ model=pic_desc.model.name,
127
+ max_completion_tokens=pic_desc.max_tokens,
128
+ ),
129
+ prompt=prompt,
130
+ timeout=pic_desc.timeout,
131
+ )
132
+
133
+ format_options = cast(
134
+ dict[InputFormat, FormatOption],
135
+ {
136
+ InputFormat.PDF: PdfFormatOption(
137
+ pipeline_options=pipeline_options,
138
+ backend=DoclingParseDocumentBackend,
139
+ )
140
+ },
141
+ )
142
+
143
+ converter = DoclingDocConverter(format_options=format_options)
144
+ result = converter.convert(path)
145
+ return result.document
146
+
147
+ async def convert_file(self, path: Path) -> "DoclingDocument":
148
+ """Convert a file to DoclingDocument using local docling.
149
+
150
+ Args:
151
+ path: Path to the file to convert.
152
+
153
+ Returns:
154
+ DoclingDocument representation of the file.
155
+
156
+ Raises:
157
+ ValueError: If the file cannot be converted.
158
+ """
159
+ try:
160
+ file_extension = path.suffix.lower()
161
+
162
+ if file_extension in self.docling_extensions:
163
+ return await asyncio.to_thread(self._sync_convert_docling_file, path)
164
+ elif file_extension in TextFileHandler.text_extensions:
165
+ content = await asyncio.to_thread(path.read_text, encoding="utf-8")
166
+ prepared_content = TextFileHandler.prepare_text_content(
167
+ content, file_extension
168
+ )
169
+ return await self.convert_text(prepared_content, name=f"{path.stem}.md")
170
+ else:
171
+ content = await asyncio.to_thread(path.read_text, encoding="utf-8")
172
+ return await self.convert_text(content, name=f"{path.stem}.md")
173
+ except Exception:
174
+ raise ValueError(f"Failed to parse file: {path}")
175
+
176
+ async def convert_text(
177
+ self, text: str, name: str = "content.md", format: str = "md"
178
+ ) -> "DoclingDocument":
179
+ """Convert text content to DoclingDocument using local docling.
180
+
181
+ Args:
182
+ text: The text content to convert.
183
+ name: The name to use for the document (defaults to "content.md").
184
+ format: The format of the text content ("md", "html", or "plain").
185
+ Defaults to "md". Use "plain" for plain text without parsing.
186
+
187
+ Returns:
188
+ DoclingDocument representation of the text.
189
+
190
+ Raises:
191
+ ValueError: If the text cannot be converted or format is unsupported.
192
+ """
193
+ return await TextFileHandler.text_to_docling_document(text, name, format)