biblicus 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/__init__.py +40 -0
- biblicus/analysis/base.py +49 -0
- biblicus/analysis/llm.py +106 -0
- biblicus/analysis/models.py +512 -0
- biblicus/analysis/schema.py +18 -0
- biblicus/analysis/topic_modeling.py +561 -0
- biblicus/cli.py +160 -11
- biblicus/constants.py +2 -0
- biblicus/corpus.py +42 -0
- biblicus/extraction.py +5 -0
- biblicus/extractors/__init__.py +14 -0
- biblicus/extractors/deepgram_stt.py +166 -0
- biblicus/extractors/docling_granite_text.py +188 -0
- biblicus/extractors/docling_smol_text.py +188 -0
- biblicus/extractors/markitdown_text.py +128 -0
- biblicus/extractors/paddleocr_vl_text.py +305 -0
- biblicus/extractors/rapidocr_text.py +8 -1
- biblicus/extractors/select_override.py +121 -0
- biblicus/extractors/select_smart_override.py +187 -0
- biblicus/inference.py +104 -0
- biblicus/models.py +6 -0
- biblicus/user_config.py +76 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/METADATA +120 -5
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/RECORD +29 -15
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/WHEEL +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.6.0.dist-info → biblicus-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PaddleOCR-VL backed optical character recognition extractor plugin.
|
|
3
|
+
|
|
4
|
+
This extractor uses PaddleOCR-VL, a vision-language model that provides
|
|
5
|
+
improved optical character recognition accuracy especially for complex layouts and multilingual text.
|
|
6
|
+
|
|
7
|
+
The extractor supports both local inference and application programming interface based inference via
|
|
8
|
+
the inference backend abstraction.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, ClassVar, Dict, List, Optional, Tuple
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
17
|
+
|
|
18
|
+
from ..corpus import Corpus
|
|
19
|
+
from ..errors import ExtractionRunFatalError
|
|
20
|
+
from ..inference import ApiProvider, InferenceBackendConfig, InferenceBackendMode, resolve_api_key
|
|
21
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
22
|
+
from .base import TextExtractor
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PaddleOcrVlExtractorConfig(BaseModel):
    """
    Configuration for the PaddleOCR-VL extractor.

    :ivar backend: Inference backend configuration for local or application programming interface execution.
    :vartype backend: InferenceBackendConfig
    :ivar min_confidence: Minimum confidence threshold for including text.
    :vartype min_confidence: float
    :ivar joiner: String used to join recognized text lines.
    :vartype joiner: str
    :ivar use_angle_cls: Whether to use angle classification for rotated text.
    :vartype use_angle_cls: bool
    :ivar lang: Language code for optical character recognition model.
    :vartype lang: str
    """

    # Reject unknown keys so configuration typos fail at validation time
    # instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Backend selection (local vs API); default_factory gives each model its own instance.
    backend: InferenceBackendConfig = Field(default_factory=InferenceBackendConfig)
    # Recognized lines below this confidence are dropped (bounds enforced: 0.0–1.0).
    min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    # Separator placed between kept lines when assembling the final text.
    joiner: str = Field(default="\n")
    # Enables PaddleOCR's angle classifier so rotated text can be recognized.
    use_angle_cls: bool = Field(default=True)
    # PaddleOCR language code; also part of the local model cache key.
    lang: str = Field(default="en")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class PaddleOcrVlExtractor(TextExtractor):
    """
    Extractor plugin using PaddleOCR-VL for optical character recognition.

    This extractor handles image media types and returns text with confidence scores.
    It supports both local inference and application programming interface based inference.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "ocr-paddleocr-vl"

    # Process-wide cache of loaded PaddleOCR models, keyed by
    # (lang, use_angle_cls), so repeated extractions with the same settings
    # reuse one model instance instead of reloading weights per item.
    _model_cache: ClassVar[Dict[Tuple[str, bool], Any]] = {}

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: PaddleOcrVlExtractorConfig
        :raises ExtractionRunFatalError: If required dependencies are missing.
        """
        import json

        # Command-line values arrive as strings; decode JSON object/array
        # literals (leading "{" or "[") so nested config such as `backend`
        # validates as structured data. Strings that fail to parse are kept
        # verbatim and left to pydantic validation.
        parsed_config = {}
        for key, value in config.items():
            if isinstance(value, str) and (value.startswith("{") or value.startswith("[")):
                try:
                    parsed_config[key] = json.loads(value)
                except json.JSONDecodeError:
                    parsed_config[key] = value
            else:
                parsed_config[key] = value

        parsed = PaddleOcrVlExtractorConfig.model_validate(parsed_config)

        if parsed.backend.mode == InferenceBackendMode.LOCAL:
            # Local mode needs the optional paddleocr dependency; fail fast
            # with an actionable install hint rather than erroring mid-run.
            try:
                from paddleocr import PaddleOCR  # noqa: F401
            except ImportError as import_error:
                raise ExtractionRunFatalError(
                    "PaddleOCR-VL extractor (local mode) requires paddleocr. "
                    'Install it with pip install "biblicus[paddleocr]".'
                ) from import_error
        else:
            # api_provider is guaranteed to be set by InferenceBackendConfig validator
            api_key = resolve_api_key(
                parsed.backend.api_provider,
                config_override=parsed.backend.api_key,
            )
            if api_key is None:
                provider_name = parsed.backend.api_provider.value.upper()
                raise ExtractionRunFatalError(
                    f"PaddleOCR-VL extractor (API mode) requires an API key for {provider_name}. "
                    f"Set {provider_name}_API_KEY environment variable or configure "
                    f"{parsed.backend.api_provider.value} in user config."
                )

        return parsed

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text from an image using PaddleOCR-VL.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: PaddleOcrVlExtractorConfig
        :param previous_extractions: Prior step outputs for this item.
        :type previous_extractions: list[ExtractionStepOutput]
        :return: Extracted text with confidence, or None for non-image items.
        :rtype: ExtractedText or None
        """
        # This extractor does not consume earlier pipeline outputs.
        _ = previous_extractions

        # Only image items are handled; None signals "not applicable".
        if not item.media_type.startswith("image/"):
            return None

        # Accept either an already-parsed config model or a raw mapping.
        parsed_config = (
            config
            if isinstance(config, PaddleOcrVlExtractorConfig)
            else PaddleOcrVlExtractorConfig.model_validate(config)
        )

        source_path = corpus.root / item.relpath

        if parsed_config.backend.mode == InferenceBackendMode.LOCAL:
            text, confidence = self._extract_local(source_path, parsed_config)
        else:
            # NOTE(review): a None api_key is not rejected here, unlike in
            # validate_config — presumably validation always runs first; confirm.
            api_key = resolve_api_key(
                parsed_config.backend.api_provider,
                config_override=parsed_config.backend.api_key,
            )
            text, confidence = self._extract_via_api(
                source_path, parsed_config, api_key
            )

        return ExtractedText(
            text=text,
            producer_extractor_id=self.extractor_id,
            confidence=confidence,
        )

    def _extract_local(
        self, source_path: Path, config: PaddleOcrVlExtractorConfig
    ) -> Tuple[str, Optional[float]]:
        """
        Perform local inference using PaddleOCR.

        :param source_path: Path to the image file.
        :type source_path: Path
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :return: Tuple of extracted text and average confidence score.
        :rtype: tuple[str, float or None]
        """
        from paddleocr import PaddleOCR

        # Reuse a cached model for this (language, angle-classifier) pair;
        # constructing PaddleOCR loads model weights and is expensive.
        cache_key = (config.lang, config.use_angle_cls)
        ocr = PaddleOcrVlExtractor._model_cache.get(cache_key)
        if ocr is None:
            ocr = PaddleOCR(
                use_angle_cls=config.use_angle_cls,
                lang=config.lang,
            )
            PaddleOcrVlExtractor._model_cache[cache_key] = ocr
        result = ocr.ocr(str(source_path), cls=config.use_angle_cls)

        # No result at all: report empty text with unknown confidence.
        if result is None or not result:
            return "", None

        lines: list[str] = []
        confidences: list[float] = []

        # Expected result shape: one entry per page, each line entry being
        # [box, (text, confidence)]. The isinstance/length checks defensively
        # skip malformed entries instead of raising.
        for page_result in result:
            if page_result is None:
                continue
            for line_result in page_result:
                if not isinstance(line_result, (list, tuple)) or len(line_result) < 2:
                    continue
                text_info = line_result[1]
                if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                    text_value = text_info[0]
                    conf_value = text_info[1]
                    if isinstance(conf_value, (int, float)):
                        confidence = float(conf_value)
                        # Keep only lines that clear the confidence threshold
                        # and contain non-whitespace text.
                        if confidence >= config.min_confidence:
                            if isinstance(text_value, str) and text_value.strip():
                                lines.append(text_value.strip())
                                confidences.append(confidence)

        text = config.joiner.join(lines).strip()
        # Average confidence over the kept lines; None when nothing survived.
        avg_confidence = sum(confidences) / len(confidences) if confidences else None

        return text, avg_confidence

    def _extract_via_api(
        self, source_path: Path, config: PaddleOcrVlExtractorConfig, api_key: Optional[str]
    ) -> Tuple[str, Optional[float]]:
        """
        Perform inference via application programming interface.

        :param source_path: Path to the image file.
        :type source_path: Path
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :param api_key: Application programming interface key for the provider.
        :type api_key: str or None
        :return: Tuple of extracted text and confidence score.
        :rtype: tuple[str, float or None]
        """
        # Dispatch by provider; only HuggingFace is implemented here.
        if config.backend.api_provider == ApiProvider.HUGGINGFACE:
            return self._extract_via_huggingface_api(source_path, config, api_key)
        else:
            # NOTE(review): unknown providers yield empty text silently rather
            # than raising — confirm this best-effort behavior is intended.
            return "", None

    def _extract_via_huggingface_api(
        self, source_path: Path, config: PaddleOcrVlExtractorConfig, api_key: Optional[str]
    ) -> Tuple[str, Optional[float]]:
        """
        Perform inference via HuggingFace Inference API.

        :param source_path: Path to the image file.
        :type source_path: Path
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :param api_key: HuggingFace application programming interface key.
        :type api_key: str or None
        :return: Tuple of extracted text and confidence score.
        :rtype: tuple[str, float or None]
        :raises requests.HTTPError: If the API responds with an error status.
        """
        import base64

        import requests

        # The image is sent base64-encoded in the JSON request body.
        with open(source_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")

        headers = {"Authorization": f"Bearer {api_key}"}

        # Allow the backend config to pin a specific model; fall back to the
        # public PaddleOCR-VL checkpoint.
        model_id = config.backend.model_id or "PaddlePaddle/PaddleOCR-VL"
        api_url = f"https://api-inference.huggingface.co/models/{model_id}"
        response = requests.post(
            api_url,
            headers=headers,
            json={"inputs": image_data},
            timeout=60,
        )
        # Surface HTTP-level failures (raises requests.HTTPError).
        response.raise_for_status()

        result = response.json()
        return self._parse_api_response(result, config)

    def _parse_api_response(
        self, result: Any, config: PaddleOcrVlExtractorConfig
    ) -> Tuple[str, Optional[float]]:
        """
        Parse application programming interface response.

        Accepts the three response shapes observed from the inference API:
        a bare string, a dict with ``generated_text`` (and optionally
        ``confidence``), or a list whose first element is such a dict.
        Anything else yields empty text.

        :param result: Application programming interface response data.
        :type result: Any
        :param config: Parsed extractor configuration.
        :type config: PaddleOcrVlExtractorConfig
        :return: Tuple of extracted text and confidence score.
        :rtype: tuple[str, float or None]
        """
        # Config is currently unused; kept for signature symmetry with the
        # other _extract_* helpers.
        _ = config
        if isinstance(result, str):
            return result.strip(), None
        if isinstance(result, dict):
            text = result.get("generated_text", "")
            confidence = result.get("confidence")
            if isinstance(confidence, (int, float)):
                return text.strip(), float(confidence)
            return text.strip(), None
        if isinstance(result, list) and result:
            first = result[0]
            if isinstance(first, dict):
                text = first.get("generated_text", "")
                confidence = first.get("confidence")
                if isinstance(confidence, (int, float)):
                    return text.strip(), float(confidence)
                return text.strip(), None
        # Unrecognized payload shape: report empty text, unknown confidence.
        return "", None
|
|
@@ -109,6 +109,7 @@ class RapidOcrExtractor(TextExtractor):
|
|
|
109
109
|
return ExtractedText(text="", producer_extractor_id=self.extractor_id)
|
|
110
110
|
|
|
111
111
|
lines: list[str] = []
|
|
112
|
+
confidences: list[float] = []
|
|
112
113
|
for entry in result:
|
|
113
114
|
if not isinstance(entry, list) or len(entry) < 3:
|
|
114
115
|
continue
|
|
@@ -124,6 +125,12 @@ class RapidOcrExtractor(TextExtractor):
|
|
|
124
125
|
cleaned = text_value.strip()
|
|
125
126
|
if cleaned:
|
|
126
127
|
lines.append(cleaned)
|
|
128
|
+
confidences.append(confidence)
|
|
127
129
|
|
|
128
130
|
text = parsed_config.joiner.join(lines).strip()
|
|
129
|
-
|
|
131
|
+
avg_confidence = sum(confidences) / len(confidences) if confidences else None
|
|
132
|
+
return ExtractedText(
|
|
133
|
+
text=text,
|
|
134
|
+
producer_extractor_id=self.extractor_id,
|
|
135
|
+
confidence=avg_confidence,
|
|
136
|
+
)
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Simple override selection extractor that always uses the last extraction for matching types.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import fnmatch
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
|
|
12
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
13
|
+
from .base import TextExtractor
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SelectOverrideConfig(BaseModel):
    """
    Configuration for simple override selection.

    :ivar media_type_patterns: List of media type patterns where override applies.
    :vartype media_type_patterns: list[str]
    :ivar fallback_to_first: If True, fall back to first extraction when no override match.
    :vartype fallback_to_first: bool
    """

    # Reject unknown keys so configuration typos fail at validation time.
    model_config = ConfigDict(extra="forbid")

    # fnmatch-style patterns matched against the item media type; the default
    # "*/*" matches every media type, so the override applies everywhere.
    media_type_patterns: List[str] = Field(default_factory=lambda: ["*/*"])
    # When True, non-matching items use the FIRST prior extraction instead of the last.
    fallback_to_first: bool = Field(default=False)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SelectOverrideExtractor(TextExtractor):
    """
    Simple override selector that uses the last extraction for matching media types.

    Items whose media type matches one of the configured patterns always take
    the most recent prior extraction. Non-matching items take the earliest
    prior extraction when ``fallback_to_first`` is enabled, and otherwise also
    the most recent one.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-override"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate selection extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectOverrideConfig
        """
        import json

        # Command-line values arrive as strings; decode JSON-array literals
        # (leading "[") so list-valued options validate as structured data.
        decoded: Dict[str, Any] = {}
        for name, raw in config.items():
            decoded[name] = raw
            if isinstance(raw, str) and raw.startswith("["):
                try:
                    decoded[name] = json.loads(raw)
                except json.JSONDecodeError:
                    pass

        return SelectOverrideConfig.model_validate(decoded)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Select extracted text using simple override logic.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectOverrideConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload or None when no prior outputs exist.
        :rtype: ExtractedText or None
        """
        # The corpus is unused; selection works purely on prior step outputs.
        _ = corpus

        # Accept either an already-parsed config model or a raw mapping.
        if isinstance(config, SelectOverrideConfig):
            cfg = config
        else:
            cfg = SelectOverrideConfig.model_validate(config)

        # Only steps that actually produced text are eligible.
        candidates = [step for step in previous_extractions if step.text is not None]
        if not candidates:
            return None

        is_override = any(
            fnmatch.fnmatch(item.media_type, pattern)
            for pattern in cfg.media_type_patterns
        )

        # Matching items (and non-matching items without fallback) use the
        # most recent extraction; fallback picks the earliest one.
        if not is_override and cfg.fallback_to_first:
            chosen = candidates[0]
        else:
            chosen = candidates[-1]

        return ExtractedText(
            text=chosen.text or "",
            producer_extractor_id=chosen.producer_extractor_id or chosen.extractor_id,
            source_step_index=chosen.step_index,
            confidence=chosen.confidence,
        )
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Smart override selection extractor that intelligently chooses between extraction results.
|
|
3
|
+
|
|
4
|
+
This extractor implements the smart override behavior where it compares the most recent
|
|
5
|
+
extraction against previous ones and makes an intelligent choice based on content quality
|
|
6
|
+
and confidence scores.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import fnmatch
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
15
|
+
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SelectSmartOverrideConfig(BaseModel):
    """
    Configuration for smart override selection.

    :ivar media_type_patterns: List of media type patterns to consider (e.g., image/*).
    :vartype media_type_patterns: list[str]
    :ivar min_confidence_threshold: Minimum confidence to consider an extraction good.
    :vartype min_confidence_threshold: float
    :ivar min_text_length: Minimum text length to consider an extraction meaningful.
    :vartype min_text_length: int
    """

    # Reject unknown keys so configuration typos fail at validation time.
    model_config = ConfigDict(extra="forbid")

    # fnmatch-style patterns matched against the item media type; the default
    # "*/*" applies smart-override logic to every media type.
    media_type_patterns: List[str] = Field(default_factory=lambda: ["*/*"])
    # Extractions with a known confidence below this value are not "meaningful"
    # (bounds enforced: 0.0–1.0). Extractions without a confidence are exempt.
    min_confidence_threshold: float = Field(default=0.7, ge=0.0, le=1.0)
    # Stripped text shorter than this is not "meaningful".
    min_text_length: int = Field(default=10, ge=0)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class SelectSmartOverrideExtractor(TextExtractor):
    """
    Smart override selector that intelligently chooses between extraction results.

    For items whose media type matches the configured patterns, selection
    proceeds as follows:

    1. Non-matching items simply take the last prior extraction.
    2. A last extraction with meaningful content wins outright.
    3. Otherwise the highest-confidence meaningful earlier extraction wins.
    4. With no meaningful earlier extraction, the last one is used anyway.

    Meaningful content means stripped text of at least ``min_text_length``
    characters whose confidence is either unknown or at least
    ``min_confidence_threshold``.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-smart-override"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate selection extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectSmartOverrideConfig
        """
        import json

        # Command-line values arrive as strings; decode JSON-array literals
        # (leading "[") so list-valued options validate as structured data.
        decoded: Dict[str, Any] = {}
        for name, raw in config.items():
            decoded[name] = raw
            if isinstance(raw, str) and raw.startswith("["):
                try:
                    decoded[name] = json.loads(raw)
                except json.JSONDecodeError:
                    pass

        return SelectSmartOverrideConfig.model_validate(decoded)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Select extracted text using smart override logic.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectSmartOverrideConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload or None when no prior outputs exist.
        :rtype: ExtractedText or None
        """
        # The corpus is unused; selection works purely on prior step outputs.
        _ = corpus

        # Accept either an already-parsed config model or a raw mapping.
        if isinstance(config, SelectSmartOverrideConfig):
            cfg = config
        else:
            cfg = SelectSmartOverrideConfig.model_validate(config)

        is_candidate_type = any(
            fnmatch.fnmatch(item.media_type, pattern)
            for pattern in cfg.media_type_patterns
        )

        # Only steps that actually produced text are eligible.
        usable = [step for step in previous_extractions if step.text is not None]
        if not usable:
            return None

        newest = usable[-1]

        # Non-matching media types bypass smart logic entirely.
        if not is_candidate_type:
            return self._extraction_to_result(newest)

        # A meaningful latest extraction always wins.
        if self._is_meaningful(newest, cfg):
            return self._extraction_to_result(newest)

        # Otherwise look for the best meaningful earlier extraction; unknown
        # confidence ranks as 0.0, and max() keeps the earliest on ties.
        meaningful_earlier = [
            step for step in usable[:-1] if self._is_meaningful(step, cfg)
        ]
        if meaningful_earlier:
            best = max(
                meaningful_earlier,
                key=lambda step: 0.0 if step.confidence is None else step.confidence,
            )
            return self._extraction_to_result(best)

        # Nothing better found: fall back to the latest extraction.
        return self._extraction_to_result(newest)

    def _is_meaningful(
        self, extraction: ExtractionStepOutput, config: SelectSmartOverrideConfig
    ) -> bool:
        """
        Check if an extraction has meaningful content.

        :param extraction: Extraction step output to check.
        :type extraction: ExtractionStepOutput
        :param config: Parsed configuration.
        :type config: SelectSmartOverrideConfig
        :return: True if the extraction has meaningful content.
        :rtype: bool
        """
        stripped = (extraction.text or "").strip()
        long_enough = len(stripped) >= config.min_text_length
        # Unknown confidence counts as acceptable; only a known-low score disqualifies.
        confident = (
            extraction.confidence is None
            or extraction.confidence >= config.min_confidence_threshold
        )
        return long_enough and confident

    def _extraction_to_result(self, extraction: ExtractionStepOutput) -> ExtractedText:
        """
        Convert an ExtractionStepOutput to ExtractedText.

        :param extraction: Extraction step output to convert.
        :type extraction: ExtractionStepOutput
        :return: Extracted text result.
        :rtype: ExtractedText
        """
        return ExtractedText(
            text=extraction.text or "",
            producer_extractor_id=extraction.producer_extractor_id or extraction.extractor_id,
            source_step_index=extraction.step_index,
            confidence=extraction.confidence,
        )
|