biblicus 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,188 @@
1
+ """
2
+ Granite Docling VLM-backed document text extraction plugin.
3
+
4
+ This extractor uses the Granite Docling-258M vision-language model for document understanding.
5
+ It supports PDF, Office documents (DOCX, XLSX, PPTX), HTML, and image formats.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from ..corpus import Corpus
15
+ from ..errors import ExtractionRunFatalError
16
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
+ from .base import TextExtractor
18
+
19
# Media types this module explicitly routes to Docling: PDF, Office Open XML
# documents (DOCX, XLSX, PPTX), HTML/XHTML, and common raster image formats.
DOCLING_SUPPORTED_MEDIA_TYPES = frozenset(
    {
        # Paginated documents
        "application/pdf",
        # Office Open XML: Word, Excel, PowerPoint
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # Web markup
        "text/html",
        "application/xhtml+xml",
        # Raster images
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/tiff",
        "image/bmp",
    }
)
35
+
36
+
37
class DoclingGraniteExtractorConfig(BaseModel):
    """
    Settings accepted by the Granite Docling VLM extractor.

    Unknown keys are rejected (``extra="forbid"``), and both fields are constrained
    by a regular expression to their allowed values.

    :ivar output_format: Export format for the converted document; one of
        ``markdown`` (default), ``text``, or ``html``.
    :vartype output_format: str
    :ivar backend: Inference backend; one of ``mlx`` (default) or ``transformers``.
    :vartype backend: str
    """

    model_config = ConfigDict(extra="forbid")

    output_format: str = Field("markdown", pattern="^(markdown|text|html)$")
    backend: str = Field("mlx", pattern="^(mlx|transformers)$")
51
+
52
+
53
class DoclingGraniteExtractor(TextExtractor):
    """
    Text extractor plugin that runs documents through Docling with the Granite Docling-258M VLM.

    Items whose media type is neither listed in ``DOCLING_SUPPORTED_MEDIA_TYPES`` nor an
    ``image/*`` type (for example text/plain or text/markdown) are skipped so that the
    pass-through extractor can handle them.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "docling-granite"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse the configuration and verify that the Docling dependency can be imported.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: DoclingGraniteExtractorConfig
        :raises ExtractionRunFatalError: If Docling cannot be imported, or if the MLX
            backend is selected and the MLX model spec is unavailable.
        """
        validated = DoclingGraniteExtractorConfig.model_validate(config)

        # Probe the optional dependency up front so a missing install fails at
        # validation time rather than in the middle of an extraction run.
        # NOTE(review): upstream Docling documents these names under
        # ``docling.datamodel`` -- confirm these import paths against the installed release.
        try:
            from docling.document_converter import DocumentConverter  # noqa: F401
            from docling.pipeline_options import (  # noqa: F401
                VlmPipelineOptions,
                vlm_model_specs,
            )
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "DoclingGranite extractor requires an optional dependency. "
                'Install it with pip install "biblicus[docling]".'
            ) from import_error

        if validated.backend != "mlx":
            return validated

        # The MLX backend additionally needs the MLX model spec to be present.
        try:
            from docling.pipeline_options import vlm_model_specs

            _ = vlm_model_specs.GRANITE_DOCLING_MLX
        except (ImportError, AttributeError) as exc:
            raise ExtractionRunFatalError(
                "DoclingGranite extractor with MLX backend requires MLX support. "
                'Install it with pip install "biblicus[docling-mlx]".'
            ) from exc

        return validated

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Convert one catalog item to text with Granite Docling.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DoclingGraniteExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline
            (not used by this extractor).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not supported.
        :rtype: ExtractedText or None
        """
        _ = previous_extractions  # intentionally unused

        if not self._is_supported_media_type(item.media_type):
            return None

        # Accept either an already-parsed config or anything pydantic can validate.
        if isinstance(config, DoclingGraniteExtractorConfig):
            settings = config
        else:
            settings = DoclingGraniteExtractorConfig.model_validate(config)

        document_path = corpus.root / item.relpath
        converted = self._convert_document(document_path, settings)
        return ExtractedText(text=converted.strip(), producer_extractor_id=self.extractor_id)

    def _is_supported_media_type(self, media_type: str) -> bool:
        """
        Report whether this extractor should process the given media type.

        A media type is accepted when it is listed in ``DOCLING_SUPPORTED_MEDIA_TYPES``
        or when it starts with ``image/``.

        :param media_type: Media type string.
        :type media_type: str
        :return: True if supported, False otherwise.
        :rtype: bool
        """
        return media_type in DOCLING_SUPPORTED_MEDIA_TYPES or media_type.startswith("image/")

    def _convert_document(self, source_path, config: DoclingGraniteExtractorConfig) -> str:
        """
        Run Docling with the Granite Docling VLM over one file and export the result.

        :param source_path: Path to the source document.
        :type source_path: pathlib.Path
        :param config: Parsed configuration.
        :type config: DoclingGraniteExtractorConfig
        :return: Document content exported as HTML, plain text, or Markdown according
            to ``config.output_format``.
        :rtype: str
        """
        # Imported lazily so the module can be loaded without the optional dependency.
        # NOTE(review): upstream Docling documents these names under
        # ``docling.datamodel`` / ``docling.document_converter`` -- confirm these import
        # paths against the installed release.
        from docling.document_converter import DocumentConverter, DocumentConverterOptions
        from docling.format_options import InputFormat, PdfFormatOption
        from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs

        vlm_options = (
            vlm_model_specs.GRANITE_DOCLING_MLX
            if config.backend == "mlx"
            else vlm_model_specs.GRANITE_DOCLING_TRANSFORMERS
        )

        converter_options = DocumentConverterOptions(
            pipeline_options=VlmPipelineOptions(vlm_options=vlm_options)
        )

        # NOTE(review): the VLM options are registered for InputFormat.PDF only;
        # presumably other accepted media types use Docling's defaults -- confirm.
        converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=converter_options)}
        )
        document = converter.convert(str(source_path)).document

        if config.output_format == "html":
            return document.export_to_html()
        if config.output_format == "text":
            return document.export_to_text()
        return document.export_to_markdown()
@@ -0,0 +1,188 @@
1
+ """
2
+ SmolDocling VLM-backed document text extraction plugin.
3
+
4
+ This extractor uses the SmolDocling-256M vision-language model for document understanding.
5
+ It supports PDF, Office documents (DOCX, XLSX, PPTX), HTML, and image formats.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from ..corpus import Corpus
15
+ from ..errors import ExtractionRunFatalError
16
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
+ from .base import TextExtractor
18
+
19
# Media types this module explicitly routes to Docling: PDF, Office Open XML
# documents (DOCX, XLSX, PPTX), HTML/XHTML, and common raster image formats.
DOCLING_SUPPORTED_MEDIA_TYPES = frozenset(
    {
        # Paginated documents
        "application/pdf",
        # Office Open XML: Word, Excel, PowerPoint
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # Web markup
        "text/html",
        "application/xhtml+xml",
        # Raster images
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/tiff",
        "image/bmp",
    }
)
35
+
36
+
37
class DoclingSmolExtractorConfig(BaseModel):
    """
    Settings accepted by the SmolDocling VLM extractor.

    Unknown keys are rejected (``extra="forbid"``), and both fields are constrained
    by a regular expression to their allowed values.

    :ivar output_format: Export format for the converted document; one of
        ``markdown`` (default), ``text``, or ``html``.
    :vartype output_format: str
    :ivar backend: Inference backend; one of ``mlx`` (default) or ``transformers``.
    :vartype backend: str
    """

    model_config = ConfigDict(extra="forbid")

    output_format: str = Field("markdown", pattern="^(markdown|text|html)$")
    backend: str = Field("mlx", pattern="^(mlx|transformers)$")
51
+
52
+
53
class DoclingSmolExtractor(TextExtractor):
    """
    Text extractor plugin that runs documents through Docling with the SmolDocling-256M VLM.

    Items whose media type is neither listed in ``DOCLING_SUPPORTED_MEDIA_TYPES`` nor an
    ``image/*`` type (for example text/plain or text/markdown) are skipped so that the
    pass-through extractor can handle them.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "docling-smol"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse the configuration and verify that the Docling dependency can be imported.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: DoclingSmolExtractorConfig
        :raises ExtractionRunFatalError: If Docling cannot be imported, or if the MLX
            backend is selected and the MLX model spec is unavailable.
        """
        validated = DoclingSmolExtractorConfig.model_validate(config)

        # Probe the optional dependency up front so a missing install fails at
        # validation time rather than in the middle of an extraction run.
        # NOTE(review): upstream Docling documents these names under
        # ``docling.datamodel`` -- confirm these import paths against the installed release.
        try:
            from docling.document_converter import DocumentConverter  # noqa: F401
            from docling.pipeline_options import (  # noqa: F401
                VlmPipelineOptions,
                vlm_model_specs,
            )
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "DoclingSmol extractor requires an optional dependency. "
                'Install it with pip install "biblicus[docling]".'
            ) from import_error

        if validated.backend != "mlx":
            return validated

        # The MLX backend additionally needs the MLX model spec to be present.
        try:
            from docling.pipeline_options import vlm_model_specs

            _ = vlm_model_specs.SMOLDOCLING_MLX
        except (ImportError, AttributeError) as exc:
            raise ExtractionRunFatalError(
                "DoclingSmol extractor with MLX backend requires MLX support. "
                'Install it with pip install "biblicus[docling-mlx]".'
            ) from exc

        return validated

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Convert one catalog item to text with SmolDocling.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DoclingSmolExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline
            (not used by this extractor).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not supported.
        :rtype: ExtractedText or None
        """
        _ = previous_extractions  # intentionally unused

        if not self._is_supported_media_type(item.media_type):
            return None

        # Accept either an already-parsed config or anything pydantic can validate.
        if isinstance(config, DoclingSmolExtractorConfig):
            settings = config
        else:
            settings = DoclingSmolExtractorConfig.model_validate(config)

        document_path = corpus.root / item.relpath
        converted = self._convert_document(document_path, settings)
        return ExtractedText(text=converted.strip(), producer_extractor_id=self.extractor_id)

    def _is_supported_media_type(self, media_type: str) -> bool:
        """
        Report whether this extractor should process the given media type.

        A media type is accepted when it is listed in ``DOCLING_SUPPORTED_MEDIA_TYPES``
        or when it starts with ``image/``.

        :param media_type: Media type string.
        :type media_type: str
        :return: True if supported, False otherwise.
        :rtype: bool
        """
        return media_type in DOCLING_SUPPORTED_MEDIA_TYPES or media_type.startswith("image/")

    def _convert_document(self, source_path, config: DoclingSmolExtractorConfig) -> str:
        """
        Run Docling with the SmolDocling VLM over one file and export the result.

        :param source_path: Path to the source document.
        :type source_path: pathlib.Path
        :param config: Parsed configuration.
        :type config: DoclingSmolExtractorConfig
        :return: Document content exported as HTML, plain text, or Markdown according
            to ``config.output_format``.
        :rtype: str
        """
        # Imported lazily so the module can be loaded without the optional dependency.
        # NOTE(review): upstream Docling documents these names under
        # ``docling.datamodel`` / ``docling.document_converter`` -- confirm these import
        # paths against the installed release.
        from docling.document_converter import DocumentConverter, DocumentConverterOptions
        from docling.format_options import InputFormat, PdfFormatOption
        from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs

        vlm_options = (
            vlm_model_specs.SMOLDOCLING_MLX
            if config.backend == "mlx"
            else vlm_model_specs.SMOLDOCLING_TRANSFORMERS
        )

        converter_options = DocumentConverterOptions(
            pipeline_options=VlmPipelineOptions(vlm_options=vlm_options)
        )

        # NOTE(review): the VLM options are registered for InputFormat.PDF only;
        # presumably other accepted media types use Docling's defaults -- confirm.
        converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=converter_options)}
        )
        document = converter.convert(str(source_path)).document

        if config.output_format == "html":
            return document.export_to_html()
        if config.output_format == "text":
            return document.export_to_text()
        return document.export_to_markdown()
@@ -0,0 +1,128 @@
1
+ """
2
+ MarkItDown-based text extraction plugin.
3
+
4
+ This extractor depends on an optional library so the core installation stays small.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import sys
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from ..corpus import Corpus
15
+ from ..errors import ExtractionRunFatalError
16
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
+ from .base import TextExtractor
18
+
19
+
20
class MarkItDownExtractorConfig(BaseModel):
    """
    Settings accepted by the MarkItDown extractor.

    Unknown keys are rejected (``extra="forbid"``).

    :ivar enable_plugins: Whether to enable MarkItDown plugins; defaults to False.
    :vartype enable_plugins: bool
    """

    model_config = ConfigDict(extra="forbid")

    enable_plugins: bool = Field(False)
31
+
32
class MarkItDownExtractor(TextExtractor):
    """
    Text extractor plugin that delegates conversion to the `markitdown` library.

    Non-text items are converted into Markdown-like text. Items with a ``text/*`` media
    type are skipped so the pass-through extractor remains the canonical choice for text
    inputs and Markdown front matter handling.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "markitdown"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Check that MarkItDown is usable in this interpreter, then parse the configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MarkItDownExtractorConfig
        :raises ExtractionRunFatalError: If `markitdown` cannot be imported, or if the
            interpreter is older than Python 3.10 and the imported module is not marked
            with ``__biblicus_fake__``.
        """
        try:
            import markitdown
            from markitdown import MarkItDown  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "MarkItDown extractor requires an optional dependency. "
                'Install it with pip install "biblicus[markitdown]".'
            ) from import_error

        # A module carrying a truthy ``__biblicus_fake__`` attribute bypasses the
        # interpreter version gate.
        interpreter_accepted = sys.version_info >= (3, 10) or getattr(
            markitdown, "__biblicus_fake__", False
        )
        if not interpreter_accepted:
            raise ExtractionRunFatalError(
                "MarkItDown requires Python 3.10 or higher. "
                "Upgrade your interpreter or use a compatible extractor."
            )

        return MarkItDownExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Convert one non-text catalog item to text with MarkItDown.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MarkItDownExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline
            (not used by this extractor).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is already text.
        :rtype: ExtractedText or None
        """
        # Accept either an already-parsed config or anything pydantic can validate.
        if isinstance(config, MarkItDownExtractorConfig):
            settings = config
        else:
            settings = MarkItDownExtractorConfig.model_validate(config)
        _ = previous_extractions  # intentionally unused

        # Every text/* item, text/markdown included, is left to other extractors.
        if item.media_type.startswith("text/"):
            return None

        # Imported lazily so the module can be loaded without the optional dependency.
        from markitdown import MarkItDown

        document_path = corpus.root / item.relpath
        markitdown_converter = MarkItDown(enable_plugins=settings.enable_plugins)
        outcome = markitdown_converter.convert(str(document_path))
        return ExtractedText(
            text=_resolve_markitdown_text(outcome).strip(),
            producer_extractor_id=self.extractor_id,
        )
110
+
111
+
112
+ def _resolve_markitdown_text(conversion_result: object) -> str:
113
+ """
114
+ Resolve a text payload from a MarkItDown conversion result.
115
+
116
+ :param conversion_result: Result returned by the MarkItDown converter.
117
+ :type conversion_result: object
118
+ :return: Extracted text payload or an empty string.
119
+ :rtype: str
120
+ """
121
+ if isinstance(conversion_result, str):
122
+ return conversion_result
123
+ if conversion_result is None:
124
+ return ""
125
+ text_content = getattr(conversion_result, "text_content", None)
126
+ if isinstance(text_content, str):
127
+ return text_content
128
+ return ""