biblicus 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/__init__.py +40 -0
- biblicus/analysis/base.py +49 -0
- biblicus/analysis/llm.py +106 -0
- biblicus/analysis/models.py +554 -0
- biblicus/analysis/schema.py +18 -0
- biblicus/analysis/topic_modeling.py +585 -0
- biblicus/cli.py +160 -11
- biblicus/constants.py +2 -0
- biblicus/corpus.py +42 -0
- biblicus/extraction.py +5 -0
- biblicus/extractors/__init__.py +12 -0
- biblicus/extractors/deepgram_stt.py +166 -0
- biblicus/extractors/docling_granite_text.py +188 -0
- biblicus/extractors/docling_smol_text.py +188 -0
- biblicus/extractors/paddleocr_vl_text.py +305 -0
- biblicus/extractors/rapidocr_text.py +8 -1
- biblicus/extractors/select_override.py +121 -0
- biblicus/extractors/select_smart_override.py +187 -0
- biblicus/inference.py +104 -0
- biblicus/models.py +6 -0
- biblicus/user_config.py +76 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/METADATA +120 -16
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/RECORD +28 -15
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/WHEEL +0 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.7.0.dist-info → biblicus-0.9.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Granite Docling VLM-backed document text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor uses the Granite Docling-258M vision-language model for document understanding.
|
|
5
|
+
It supports PDF, Office documents (DOCX, XLSX, PPTX), HTML, and image formats.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
# Media types Docling can convert directly: PDF, Office OpenXML, HTML, and
# common raster image formats. Other image/* subtypes are accepted separately
# via a startswith("image/") check in the extractor.
DOCLING_SUPPORTED_MEDIA_TYPES = frozenset(
    {
        # Documents
        "application/pdf",
        # Office OpenXML
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # Markup
        "text/html",
        "application/xhtml+xml",
        # Raster images
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/tiff",
        "image/bmp",
    }
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DoclingGraniteExtractorConfig(BaseModel):
    """
    Settings for the Granite Docling VLM extractor.

    Unknown keys are rejected; both fields are validated against a closed set
    of choices via regular-expression patterns.

    :ivar output_format: Format of the converted output; one of markdown, text, or html.
    :vartype output_format: str
    :ivar backend: Inference backend used to run the model; one of mlx or transformers.
    :vartype backend: str
    """

    model_config = ConfigDict(extra="forbid")

    output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
    backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DoclingGraniteExtractor(TextExtractor):
    """
    Text extractor driven by the Granite Docling-258M vision-language model.

    Documents are converted to text through Docling configured with the Granite
    VLM. Plain-text items (text/plain, text/markdown) are never claimed so the
    pass-through extractor can handle them.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "docling-granite"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse the configuration and verify the optional dependency is present.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: DoclingGraniteExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        settings = DoclingGraniteExtractorConfig.model_validate(config)

        # Probe the optional docling dependency up front so a missing install
        # fails the run immediately instead of mid-extraction.
        try:
            from docling.document_converter import DocumentConverter  # noqa: F401
            from docling.pipeline_options import (  # noqa: F401
                VlmPipelineOptions,
                vlm_model_specs,
            )
        except ImportError as missing_dependency:
            raise ExtractionRunFatalError(
                "DoclingGranite extractor requires an optional dependency. "
                'Install it with pip install "biblicus[docling]".'
            ) from missing_dependency

        if settings.backend == "mlx":
            # The MLX model spec only exists when the MLX extra is installed;
            # AttributeError covers a docling build without that spec.
            try:
                from docling.pipeline_options import vlm_model_specs

                _ = vlm_model_specs.GRANITE_DOCLING_MLX
            except (ImportError, AttributeError) as missing_mlx:
                raise ExtractionRunFatalError(
                    "DoclingGranite extractor with MLX backend requires MLX support. "
                    'Install it with pip install "biblicus[docling-mlx]".'
                ) from missing_mlx

        return settings

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a document item using Granite Docling.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DoclingGraniteExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not supported.
        :rtype: ExtractedText or None
        """
        _ = previous_extractions

        if not self._is_supported_media_type(item.media_type):
            return None

        # Re-validate defensively when a raw mapping reaches this method.
        if isinstance(config, DoclingGraniteExtractorConfig):
            settings = config
        else:
            settings = DoclingGraniteExtractorConfig.model_validate(config)

        document_path = corpus.root / item.relpath
        content = self._convert_document(document_path, settings)
        return ExtractedText(text=content.strip(), producer_extractor_id=self.extractor_id)

    def _is_supported_media_type(self, media_type: str) -> bool:
        """
        Check if a media type is supported by this extractor.

        :param media_type: Media type string.
        :type media_type: str
        :return: True if supported, False otherwise.
        :rtype: bool
        """
        # Any image/* subtype is accepted, even beyond the explicit set.
        return media_type in DOCLING_SUPPORTED_MEDIA_TYPES or media_type.startswith("image/")

    def _convert_document(self, source_path, config: DoclingGraniteExtractorConfig) -> str:
        """
        Convert a document using Docling with the Granite Docling VLM.

        :param source_path: Path to the source document.
        :type source_path: pathlib.Path
        :param config: Parsed configuration.
        :type config: DoclingGraniteExtractorConfig
        :return: Extracted text content.
        :rtype: str
        """
        # NOTE(review): these import paths mirror validate_config; confirm they
        # match the installed docling release's module layout.
        from docling.document_converter import DocumentConverter, DocumentConverterOptions
        from docling.format_options import InputFormat, PdfFormatOption
        from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs

        if config.backend == "mlx":
            model_spec = vlm_model_specs.GRANITE_DOCLING_MLX
        else:
            model_spec = vlm_model_specs.GRANITE_DOCLING_TRANSFORMERS

        converter_options = DocumentConverterOptions(
            pipeline_options=VlmPipelineOptions(vlm_options=model_spec)
        )
        converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=converter_options)}
        )
        conversion = converter.convert(str(source_path))
        document = conversion.document

        if config.output_format == "html":
            return document.export_to_html()
        if config.output_format == "text":
            return document.export_to_text()
        return document.export_to_markdown()
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SmolDocling VLM-backed document text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor uses the SmolDocling-256M vision-language model for document understanding.
|
|
5
|
+
It supports PDF, Office documents (DOCX, XLSX, PPTX), HTML, and image formats.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
# Media types Docling can convert directly: PDF, Office OpenXML, HTML, and
# common raster image formats. Other image/* subtypes are accepted separately
# via a startswith("image/") check in the extractor.
DOCLING_SUPPORTED_MEDIA_TYPES = frozenset(
    {
        # Documents
        "application/pdf",
        # Office OpenXML
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # Markup
        "text/html",
        "application/xhtml+xml",
        # Raster images
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/tiff",
        "image/bmp",
    }
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DoclingSmolExtractorConfig(BaseModel):
    """
    Settings for the SmolDocling VLM extractor.

    Unknown keys are rejected; both fields are validated against a closed set
    of choices via regular-expression patterns.

    :ivar output_format: Format of the converted output; one of markdown, text, or html.
    :vartype output_format: str
    :ivar backend: Inference backend used to run the model; one of mlx or transformers.
    :vartype backend: str
    """

    model_config = ConfigDict(extra="forbid")

    output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
    backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DoclingSmolExtractor(TextExtractor):
    """
    Text extractor driven by the SmolDocling-256M vision-language model.

    Documents are converted to text through Docling configured with the
    SmolDocling VLM. Plain-text items (text/plain, text/markdown) are never
    claimed so the pass-through extractor can handle them.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "docling-smol"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse the configuration and verify the optional dependency is present.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: DoclingSmolExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        settings = DoclingSmolExtractorConfig.model_validate(config)

        # Probe the optional docling dependency up front so a missing install
        # fails the run immediately instead of mid-extraction.
        try:
            from docling.document_converter import DocumentConverter  # noqa: F401
            from docling.pipeline_options import (  # noqa: F401
                VlmPipelineOptions,
                vlm_model_specs,
            )
        except ImportError as missing_dependency:
            raise ExtractionRunFatalError(
                "DoclingSmol extractor requires an optional dependency. "
                'Install it with pip install "biblicus[docling]".'
            ) from missing_dependency

        if settings.backend == "mlx":
            # The MLX model spec only exists when the MLX extra is installed;
            # AttributeError covers a docling build without that spec.
            try:
                from docling.pipeline_options import vlm_model_specs

                _ = vlm_model_specs.SMOLDOCLING_MLX
            except (ImportError, AttributeError) as missing_mlx:
                raise ExtractionRunFatalError(
                    "DoclingSmol extractor with MLX backend requires MLX support. "
                    'Install it with pip install "biblicus[docling-mlx]".'
                ) from missing_mlx

        return settings

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a document item using SmolDocling.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: DoclingSmolExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not supported.
        :rtype: ExtractedText or None
        """
        _ = previous_extractions

        if not self._is_supported_media_type(item.media_type):
            return None

        # Re-validate defensively when a raw mapping reaches this method.
        if isinstance(config, DoclingSmolExtractorConfig):
            settings = config
        else:
            settings = DoclingSmolExtractorConfig.model_validate(config)

        document_path = corpus.root / item.relpath
        content = self._convert_document(document_path, settings)
        return ExtractedText(text=content.strip(), producer_extractor_id=self.extractor_id)

    def _is_supported_media_type(self, media_type: str) -> bool:
        """
        Check if a media type is supported by this extractor.

        :param media_type: Media type string.
        :type media_type: str
        :return: True if supported, False otherwise.
        :rtype: bool
        """
        # Any image/* subtype is accepted, even beyond the explicit set.
        return media_type in DOCLING_SUPPORTED_MEDIA_TYPES or media_type.startswith("image/")

    def _convert_document(self, source_path, config: DoclingSmolExtractorConfig) -> str:
        """
        Convert a document using Docling with the SmolDocling VLM.

        :param source_path: Path to the source document.
        :type source_path: pathlib.Path
        :param config: Parsed configuration.
        :type config: DoclingSmolExtractorConfig
        :return: Extracted text content.
        :rtype: str
        """
        # NOTE(review): these import paths mirror validate_config; confirm they
        # match the installed docling release's module layout.
        from docling.document_converter import DocumentConverter, DocumentConverterOptions
        from docling.format_options import InputFormat, PdfFormatOption
        from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs

        if config.backend == "mlx":
            model_spec = vlm_model_specs.SMOLDOCLING_MLX
        else:
            model_spec = vlm_model_specs.SMOLDOCLING_TRANSFORMERS

        converter_options = DocumentConverterOptions(
            pipeline_options=VlmPipelineOptions(vlm_options=model_spec)
        )
        converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=converter_options)}
        )
        conversion = converter.convert(str(source_path))
        document = conversion.document

        if config.output_format == "html":
            return document.export_to_html()
        if config.output_format == "text":
            return document.export_to_text()
        return document.export_to_markdown()
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PaddleOCR-VL backed optical character recognition extractor plugin.
|
|
3
|
+
|
|
4
|
+
This extractor uses PaddleOCR-VL, a vision-language model that provides
|
|
5
|
+
improved optical character recognition accuracy especially for complex layouts and multilingual text.
|
|
6
|
+
|
|
7
|
+
The extractor supports both local inference and application programming interface based inference via
|
|
8
|
+
the inference backend abstraction.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, ClassVar, Dict, List, Optional, Tuple
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
17
|
+
|
|
18
|
+
from ..corpus import Corpus
|
|
19
|
+
from ..errors import ExtractionRunFatalError
|
|
20
|
+
from ..inference import ApiProvider, InferenceBackendConfig, InferenceBackendMode, resolve_api_key
|
|
21
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
22
|
+
from .base import TextExtractor
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PaddleOcrVlExtractorConfig(BaseModel):
    """
    Settings for the PaddleOCR-VL extractor.

    Unknown keys are rejected.

    :ivar backend: Backend configuration selecting local or application programming interface inference.
    :vartype backend: InferenceBackendConfig
    :ivar min_confidence: Lines recognized below this confidence are discarded.
    :vartype min_confidence: float
    :ivar joiner: Separator placed between recognized text lines.
    :vartype joiner: str
    :ivar use_angle_cls: Enables angle classification so rotated text is handled.
    :vartype use_angle_cls: bool
    :ivar lang: Language code selecting the optical character recognition model.
    :vartype lang: str
    """

    model_config = ConfigDict(extra="forbid")

    backend: InferenceBackendConfig = Field(default_factory=InferenceBackendConfig)
    min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    joiner: str = Field(default="\n")
    use_angle_cls: bool = Field(default=True)
    lang: str = Field(default="en")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class PaddleOcrVlExtractor(TextExtractor):
    """
    Extractor plugin using PaddleOCR-VL for optical character recognition.

    Only image media types are claimed; recognized lines below the configured
    confidence threshold are dropped. Inference runs either locally through
    paddleocr or remotely through an application programming interface.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "ocr-paddleocr-vl"

    # Shared across instances so a PaddleOCR engine is built once per
    # (lang, use_angle_cls) pair for the lifetime of the process.
    _model_cache: ClassVar[Dict[Tuple[str, bool], Any]] = {}

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: PaddleOcrVlExtractorConfig
        :raises ExtractionRunFatalError: If required dependencies are missing.
        """
        import json

        # Values may arrive as JSON-encoded strings (e.g. from a command
        # line); decode anything that looks like an object or array, keeping
        # the raw string when decoding fails.
        normalized: Dict[str, Any] = {}
        for key, value in config.items():
            candidate = value
            if isinstance(value, str) and value[:1] in ("{", "["):
                try:
                    candidate = json.loads(value)
                except json.JSONDecodeError:
                    candidate = value
            normalized[key] = candidate

        parsed = PaddleOcrVlExtractorConfig.model_validate(normalized)

        if parsed.backend.mode == InferenceBackendMode.LOCAL:
            # Probe the optional dependency so a missing install fails fast.
            try:
                from paddleocr import PaddleOCR  # noqa: F401
            except ImportError as missing_dependency:
                raise ExtractionRunFatalError(
                    "PaddleOCR-VL extractor (local mode) requires paddleocr. "
                    'Install it with pip install "biblicus[paddleocr]".'
                ) from missing_dependency
        else:
            # api_provider is guaranteed to be set by InferenceBackendConfig validator
            api_key = resolve_api_key(
                parsed.backend.api_provider,
                config_override=parsed.backend.api_key,
            )
            if api_key is None:
                provider_name = parsed.backend.api_provider.value.upper()
                raise ExtractionRunFatalError(
                    f"PaddleOCR-VL extractor (API mode) requires an API key for {provider_name}. "
                    f"Set {provider_name}_API_KEY environment variable or configure "
                    f"{parsed.backend.api_provider.value} in user config."
                )

        return parsed

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text from an image using PaddleOCR-VL.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: PaddleOcrVlExtractorConfig
        :param previous_extractions: Prior step outputs for this item.
        :type previous_extractions: list[ExtractionStepOutput]
        :return: Extracted text with confidence, or None for non-image items.
        :rtype: ExtractedText or None
        """
        _ = previous_extractions

        if not item.media_type.startswith("image/"):
            return None

        # Re-validate defensively when a raw mapping reaches this method.
        if isinstance(config, PaddleOcrVlExtractorConfig):
            settings = config
        else:
            settings = PaddleOcrVlExtractorConfig.model_validate(config)

        image_path = corpus.root / item.relpath

        if settings.backend.mode == InferenceBackendMode.LOCAL:
            text, confidence = self._extract_local(image_path, settings)
        else:
            api_key = resolve_api_key(
                settings.backend.api_provider,
                config_override=settings.backend.api_key,
            )
            text, confidence = self._extract_via_api(image_path, settings, api_key)

        return ExtractedText(
            text=text,
            producer_extractor_id=self.extractor_id,
            confidence=confidence,
        )

    def _extract_local(
        self, source_path: Path, config: PaddleOcrVlExtractorConfig
    ) -> Tuple[str, Optional[float]]:
        """
        Perform local inference using PaddleOCR.

        :param source_path: Path to the image file.
        :type source_path: Path
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :return: Tuple of extracted text and average confidence score.
        :rtype: tuple[str, float or None]
        """
        from paddleocr import PaddleOCR

        cache_key = (config.lang, config.use_angle_cls)
        engine = PaddleOcrVlExtractor._model_cache.get(cache_key)
        if engine is None:
            engine = PaddleOCR(
                use_angle_cls=config.use_angle_cls,
                lang=config.lang,
            )
            PaddleOcrVlExtractor._model_cache[cache_key] = engine

        result = engine.ocr(str(source_path), cls=config.use_angle_cls)
        if not result:
            return "", None

        # Each page yields (box, (text, confidence)) entries; guard clauses
        # skip anything that does not match that shape.
        lines: list[str] = []
        confidences: list[float] = []
        for page in result:
            if page is None:
                continue
            for entry in page:
                if not isinstance(entry, (list, tuple)) or len(entry) < 2:
                    continue
                recognition = entry[1]
                if not isinstance(recognition, (list, tuple)) or len(recognition) < 2:
                    continue
                text_value = recognition[0]
                raw_confidence = recognition[1]
                if not isinstance(raw_confidence, (int, float)):
                    continue
                score = float(raw_confidence)
                if score < config.min_confidence:
                    continue
                if isinstance(text_value, str) and text_value.strip():
                    lines.append(text_value.strip())
                    confidences.append(score)

        joined = config.joiner.join(lines).strip()
        mean_confidence = sum(confidences) / len(confidences) if confidences else None
        return joined, mean_confidence

    def _extract_via_api(
        self, source_path: Path, config: PaddleOcrVlExtractorConfig, api_key: Optional[str]
    ) -> Tuple[str, Optional[float]]:
        """
        Perform inference via application programming interface.

        :param source_path: Path to the image file.
        :type source_path: Path
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :param api_key: Application programming interface key for the provider.
        :type api_key: str or None
        :return: Tuple of extracted text and confidence score.
        :rtype: tuple[str, float or None]
        """
        # Only the HuggingFace provider is implemented; others yield nothing.
        if config.backend.api_provider == ApiProvider.HUGGINGFACE:
            return self._extract_via_huggingface_api(source_path, config, api_key)
        return "", None

    def _extract_via_huggingface_api(
        self, source_path: Path, config: PaddleOcrVlExtractorConfig, api_key: Optional[str]
    ) -> Tuple[str, Optional[float]]:
        """
        Perform inference via HuggingFace Inference API.

        :param source_path: Path to the image file.
        :type source_path: Path
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :param api_key: HuggingFace application programming interface key.
        :type api_key: str or None
        :return: Tuple of extracted text and confidence score.
        :rtype: tuple[str, float or None]
        """
        import base64

        import requests

        payload = base64.b64encode(source_path.read_bytes()).decode("utf-8")
        model_id = config.backend.model_id or "PaddlePaddle/PaddleOCR-VL"
        endpoint = f"https://api-inference.huggingface.co/models/{model_id}"

        response = requests.post(
            endpoint,
            headers={"Authorization": f"Bearer {api_key}"},
            json={"inputs": payload},
            timeout=60,
        )
        response.raise_for_status()

        return self._parse_api_response(response.json(), config)

    def _parse_api_response(
        self, result: Any, config: PaddleOcrVlExtractorConfig
    ) -> Tuple[str, Optional[float]]:
        """
        Parse application programming interface response.

        :param result: Application programming interface response data.
        :type result: Any
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :return: Tuple of extracted text and confidence score.
        :rtype: tuple[str, float or None]
        """
        _ = config
        if isinstance(result, str):
            return result.strip(), None

        # Both a bare dict and a non-empty list whose first element is a dict
        # carry the same payload shape; anything else is treated as empty.
        if isinstance(result, dict):
            payload = result
        elif isinstance(result, list) and result and isinstance(result[0], dict):
            payload = result[0]
        else:
            return "", None

        text = payload.get("generated_text", "")
        confidence = payload.get("confidence")
        if isinstance(confidence, (int, float)):
            return text.strip(), float(confidence)
        return text.strip(), None
|