biblicus 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/__init__.py +40 -0
- biblicus/analysis/base.py +49 -0
- biblicus/analysis/llm.py +106 -0
- biblicus/analysis/models.py +512 -0
- biblicus/analysis/schema.py +18 -0
- biblicus/analysis/topic_modeling.py +561 -0
- biblicus/cli.py +160 -11
- biblicus/constants.py +2 -0
- biblicus/corpus.py +42 -0
- biblicus/extraction.py +5 -0
- biblicus/extractors/__init__.py +14 -0
- biblicus/extractors/deepgram_stt.py +166 -0
- biblicus/extractors/docling_granite_text.py +188 -0
- biblicus/extractors/docling_smol_text.py +188 -0
- biblicus/extractors/markitdown_text.py +128 -0
- biblicus/extractors/paddleocr_vl_text.py +305 -0
- biblicus/extractors/rapidocr_text.py +8 -1
- biblicus/extractors/select_override.py +121 -0
- biblicus/extractors/select_smart_override.py +187 -0
- biblicus/inference.py +104 -0
- biblicus/models.py +6 -0
- biblicus/user_config.py +76 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/METADATA +120 -5
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/RECORD +29 -15
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/WHEEL +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Granite Docling VLM-backed document text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor uses the Granite Docling-258M vision-language model for document understanding.
|
|
5
|
+
It supports PDF, Office documents (DOCX, XLSX, PPTX), HTML, and image formats.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
# Media types this extractor accepts by exact match. A frozenset gives O(1)
# membership tests and prevents accidental mutation at runtime. Other
# ``image/*`` types are additionally accepted by ``_is_supported_media_type``.
DOCLING_SUPPORTED_MEDIA_TYPES = frozenset(
    [
        # Paginated and Office documents.
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # Web documents.
        "text/html",
        "application/xhtml+xml",
        # Raster images.
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/tiff",
        "image/bmp",
    ]
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DoclingGraniteExtractorConfig(BaseModel):
    """
    Configuration for the Granite Docling VLM extractor.

    Unknown keys are rejected because the model is declared with ``extra="forbid"``.
    Both fields are plain strings constrained by an anchored regular expression.

    :ivar output_format: Output format for extracted content (markdown, text, or html).
        Defaults to ``markdown``.
    :vartype output_format: str
    :ivar backend: Inference backend (mlx or transformers). Defaults to ``mlx``.
    :vartype backend: str
    """

    model_config = ConfigDict(extra="forbid")

    output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
    backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DoclingGraniteExtractor(TextExtractor):
    """
    Extractor plugin backed by the Granite Docling-258M vision-language model.

    This extractor converts documents into text using Docling with the Granite VLM.
    It skips text items (text/plain, text/markdown) to let pass-through handle those.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "docling-granite"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is installed.

        The configuration is parsed first, so an invalid mapping fails before any
        optional import is attempted.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: DoclingGraniteExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed,
            or if the MLX backend is selected and its model spec is unavailable.
        """
        parsed = DoclingGraniteExtractorConfig.model_validate(config)

        # Import probe only: the names are unused here, hence the noqa markers.
        try:
            from docling.document_converter import DocumentConverter  # noqa: F401
            from docling.pipeline_options import (  # noqa: F401
                VlmPipelineOptions,
                vlm_model_specs,
            )
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "DoclingGranite extractor requires an optional dependency. "
                'Install it with pip install "biblicus[docling]".'
            ) from import_error

        if parsed.backend == "mlx":
            # Touch the MLX spec so a missing attribute surfaces now rather than
            # in the middle of an extraction run.
            try:
                from docling.pipeline_options import vlm_model_specs

                _ = vlm_model_specs.GRANITE_DOCLING_MLX
            except (ImportError, AttributeError) as exc:
                raise ExtractionRunFatalError(
                    "DoclingGranite extractor with MLX backend requires MLX support. "
                    'Install it with pip install "biblicus[docling-mlx]".'
                ) from exc

        return parsed

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a document item using Granite Docling.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DoclingGraniteExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not supported.
        :rtype: ExtractedText or None
        """
        # Accepted for interface compatibility; this extractor does not use it.
        _ = previous_extractions

        if not self._is_supported_media_type(item.media_type):
            return None

        # Re-validate only when the caller passed something other than our own model.
        parsed_config = (
            config
            if isinstance(config, DoclingGraniteExtractorConfig)
            else DoclingGraniteExtractorConfig.model_validate(config)
        )

        source_path = corpus.root / item.relpath
        text = self._convert_document(source_path, parsed_config)
        return ExtractedText(text=text.strip(), producer_extractor_id=self.extractor_id)

    def _is_supported_media_type(self, media_type: str) -> bool:
        """
        Check if a media type is supported by this extractor.

        A type is supported when it is listed in ``DOCLING_SUPPORTED_MEDIA_TYPES``
        or when it is any ``image/*`` type.

        :param media_type: Media type string.
        :type media_type: str
        :return: True if supported, False otherwise.
        :rtype: bool
        """
        return media_type in DOCLING_SUPPORTED_MEDIA_TYPES or media_type.startswith("image/")

    def _convert_document(self, source_path, config: DoclingGraniteExtractorConfig) -> str:
        """
        Convert a document using Docling with the Granite Docling VLM.

        :param source_path: Path to the source document.
        :type source_path: pathlib.Path
        :param config: Parsed configuration.
        :type config: DoclingGraniteExtractorConfig
        :return: Extracted text content.
        :rtype: str
        """
        # Imported lazily so the module loads without the optional dependency.
        # NOTE(review): confirm these module paths and the GRANITE_DOCLING_* spec
        # names against the installed docling release; upstream examples appear to
        # use docling.datamodel.* instead -- TODO verify.
        from docling.document_converter import DocumentConverter, DocumentConverterOptions
        from docling.format_options import InputFormat, PdfFormatOption
        from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs

        if config.backend == "mlx":
            vlm_options = vlm_model_specs.GRANITE_DOCLING_MLX
        else:
            vlm_options = vlm_model_specs.GRANITE_DOCLING_TRANSFORMERS

        converter_options = DocumentConverterOptions(
            pipeline_options=VlmPipelineOptions(vlm_options=vlm_options)
        )

        # Only the PDF input format is given explicit format options here.
        pdf_format_option = PdfFormatOption(pipeline_options=converter_options)
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
        result = converter.convert(str(source_path))

        document = result.document
        if config.output_format == "html":
            return document.export_to_html()
        if config.output_format == "text":
            return document.export_to_text()
        # "markdown" is the configured default and the fallback.
        return document.export_to_markdown()
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SmolDocling VLM-backed document text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor uses the SmolDocling-256M vision-language model for document understanding.
|
|
5
|
+
It supports PDF, Office documents (DOCX, XLSX, PPTX), HTML, and image formats.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
# Media types this extractor accepts by exact match. A frozenset gives O(1)
# membership tests and prevents accidental mutation at runtime. Other
# ``image/*`` types are additionally accepted by ``_is_supported_media_type``.
DOCLING_SUPPORTED_MEDIA_TYPES = frozenset(
    [
        # Paginated and Office documents.
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # Web documents.
        "text/html",
        "application/xhtml+xml",
        # Raster images.
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/tiff",
        "image/bmp",
    ]
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DoclingSmolExtractorConfig(BaseModel):
    """
    Configuration for the SmolDocling VLM extractor.

    Unknown keys are rejected because the model is declared with ``extra="forbid"``.
    Both fields are plain strings constrained by an anchored regular expression.

    :ivar output_format: Output format for extracted content (markdown, text, or html).
        Defaults to ``markdown``.
    :vartype output_format: str
    :ivar backend: Inference backend (mlx or transformers). Defaults to ``mlx``.
    :vartype backend: str
    """

    model_config = ConfigDict(extra="forbid")

    output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
    backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DoclingSmolExtractor(TextExtractor):
    """
    Extractor plugin backed by the SmolDocling-256M vision-language model.

    This extractor converts documents into text using Docling with the SmolDocling VLM.
    It skips text items (text/plain, text/markdown) to let pass-through handle those.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "docling-smol"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is installed.

        The configuration is parsed first, so an invalid mapping fails before any
        optional import is attempted.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: DoclingSmolExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed,
            or if the MLX backend is selected and its model spec is unavailable.
        """
        parsed = DoclingSmolExtractorConfig.model_validate(config)

        # Import probe only: the names are unused here, hence the noqa markers.
        try:
            from docling.document_converter import DocumentConverter  # noqa: F401
            from docling.pipeline_options import (  # noqa: F401
                VlmPipelineOptions,
                vlm_model_specs,
            )
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "DoclingSmol extractor requires an optional dependency. "
                'Install it with pip install "biblicus[docling]".'
            ) from import_error

        if parsed.backend == "mlx":
            # Touch the MLX spec so a missing attribute surfaces now rather than
            # in the middle of an extraction run.
            try:
                from docling.pipeline_options import vlm_model_specs

                _ = vlm_model_specs.SMOLDOCLING_MLX
            except (ImportError, AttributeError) as exc:
                raise ExtractionRunFatalError(
                    "DoclingSmol extractor with MLX backend requires MLX support. "
                    'Install it with pip install "biblicus[docling-mlx]".'
                ) from exc

        return parsed

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a document item using SmolDocling.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DoclingSmolExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not supported.
        :rtype: ExtractedText or None
        """
        # Accepted for interface compatibility; this extractor does not use it.
        _ = previous_extractions

        if not self._is_supported_media_type(item.media_type):
            return None

        # Re-validate only when the caller passed something other than our own model.
        parsed_config = (
            config
            if isinstance(config, DoclingSmolExtractorConfig)
            else DoclingSmolExtractorConfig.model_validate(config)
        )

        source_path = corpus.root / item.relpath
        text = self._convert_document(source_path, parsed_config)
        return ExtractedText(text=text.strip(), producer_extractor_id=self.extractor_id)

    def _is_supported_media_type(self, media_type: str) -> bool:
        """
        Check if a media type is supported by this extractor.

        A type is supported when it is listed in ``DOCLING_SUPPORTED_MEDIA_TYPES``
        or when it is any ``image/*`` type.

        :param media_type: Media type string.
        :type media_type: str
        :return: True if supported, False otherwise.
        :rtype: bool
        """
        return media_type in DOCLING_SUPPORTED_MEDIA_TYPES or media_type.startswith("image/")

    def _convert_document(self, source_path, config: DoclingSmolExtractorConfig) -> str:
        """
        Convert a document using Docling with the SmolDocling VLM.

        :param source_path: Path to the source document.
        :type source_path: pathlib.Path
        :param config: Parsed configuration.
        :type config: DoclingSmolExtractorConfig
        :return: Extracted text content.
        :rtype: str
        """
        # Imported lazily so the module loads without the optional dependency.
        # NOTE(review): confirm these module paths against the installed docling
        # release; upstream examples appear to use docling.datamodel.* instead
        # -- TODO verify.
        from docling.document_converter import DocumentConverter, DocumentConverterOptions
        from docling.format_options import InputFormat, PdfFormatOption
        from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs

        if config.backend == "mlx":
            vlm_options = vlm_model_specs.SMOLDOCLING_MLX
        else:
            vlm_options = vlm_model_specs.SMOLDOCLING_TRANSFORMERS

        converter_options = DocumentConverterOptions(
            pipeline_options=VlmPipelineOptions(vlm_options=vlm_options)
        )

        # Only the PDF input format is given explicit format options here.
        pdf_format_option = PdfFormatOption(pipeline_options=converter_options)
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
        result = converter.convert(str(source_path))

        document = result.document
        if config.output_format == "html":
            return document.export_to_html()
        if config.output_format == "text":
            return document.export_to_text()
        # "markdown" is the configured default and the fallback.
        return document.export_to_markdown()
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MarkItDown-based text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor depends on an optional library so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MarkItDownExtractorConfig(BaseModel):
    """
    Configuration for the MarkItDown extractor.

    Unknown keys are rejected because the model is declared with ``extra="forbid"``.

    :ivar enable_plugins: Whether to enable MarkItDown plugins. Defaults to False.
    :vartype enable_plugins: bool
    """

    model_config = ConfigDict(extra="forbid")

    enable_plugins: bool = Field(default=False)
|
|
31
|
+
|
|
32
|
+
class MarkItDownExtractor(TextExtractor):
    """
    Extractor plugin backed by the `markitdown` library.

    This extractor converts non-text items into Markdown-like text. It skips text items so
    the pass-through extractor remains the canonical choice for text inputs and Markdown
    front matter handling.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "markitdown"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is installed.

        The dependency and interpreter checks run before the configuration mapping
        is parsed.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MarkItDownExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed,
            or if the interpreter is older than Python 3.10.
        """
        try:
            import markitdown
            from markitdown import MarkItDown  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "MarkItDown extractor requires an optional dependency. "
                'Install it with pip install "biblicus[markitdown]".'
            ) from import_error

        # The version gate is bypassed when the imported module carries a truthy
        # ``__biblicus_fake__`` attribute (presumably a test double -- confirm).
        is_fake_module = getattr(markitdown, "__biblicus_fake__", False)
        if sys.version_info < (3, 10) and not is_fake_module:
            raise ExtractionRunFatalError(
                "MarkItDown requires Python 3.10 or higher. "
                "Upgrade your interpreter or use a compatible extractor."
            )

        return MarkItDownExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a non-text item using MarkItDown.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MarkItDownExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is already text.
        :rtype: ExtractedText or None
        """
        # Re-validate only when the caller passed something other than our own model.
        parsed_config = (
            config
            if isinstance(config, MarkItDownExtractorConfig)
            else MarkItDownExtractorConfig.model_validate(config)
        )
        # Accepted for interface compatibility; this extractor does not use it.
        _ = previous_extractions

        # Any ``text/*`` item (which includes text/markdown) is left to other extractors.
        if item.media_type.startswith("text/"):
            return None

        # Imported lazily so the module loads without the optional dependency.
        from markitdown import MarkItDown

        source_path = corpus.root / item.relpath
        converter = MarkItDown(enable_plugins=parsed_config.enable_plugins)
        conversion_result = converter.convert(str(source_path))
        extracted_text = _resolve_markitdown_text(conversion_result).strip()
        return ExtractedText(text=extracted_text, producer_extractor_id=self.extractor_id)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _resolve_markitdown_text(conversion_result: object) -> str:
|
|
113
|
+
"""
|
|
114
|
+
Resolve a text payload from a MarkItDown conversion result.
|
|
115
|
+
|
|
116
|
+
:param conversion_result: Result returned by the MarkItDown converter.
|
|
117
|
+
:type conversion_result: object
|
|
118
|
+
:return: Extracted text payload or an empty string.
|
|
119
|
+
:rtype: str
|
|
120
|
+
"""
|
|
121
|
+
if isinstance(conversion_result, str):
|
|
122
|
+
return conversion_result
|
|
123
|
+
if conversion_result is None:
|
|
124
|
+
return ""
|
|
125
|
+
text_content = getattr(conversion_result, "text_content", None)
|
|
126
|
+
if isinstance(text_content, str):
|
|
127
|
+
return text_content
|
|
128
|
+
return ""
|