biblicus 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +177 -53
- biblicus/corpus.py +209 -59
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +280 -79
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +118 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +1 -2
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
- biblicus-0.4.0.dist-info/RECORD +45 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OpenAI-backed speech to text extractor plugin.
|
|
3
|
+
|
|
4
|
+
This extractor is implemented as an optional dependency so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
12
|
+
|
|
13
|
+
from ..corpus import Corpus
|
|
14
|
+
from ..errors import ExtractionRunFatalError
|
|
15
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
|
+
from ..user_config import resolve_openai_api_key
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class OpenAiSpeechToTextExtractorConfig(BaseModel):
    """
    Settings accepted by the OpenAI speech to text extractor.

    Unknown keys are rejected. A no-speech threshold may only be combined with the
    ``verbose_json`` response format.

    :ivar model: OpenAI transcription model identifier.
    :vartype model: str
    :ivar response_format: OpenAI transcription response format.
    :vartype response_format: str
    :ivar language: Optional language code hint for transcription.
    :vartype language: str or None
    :ivar prompt: Optional prompt text to guide transcription.
    :vartype prompt: str or None
    :ivar no_speech_probability_threshold: Optional threshold, between 0.0 and 1.0, for
        suppressing hallucinated transcripts.
    :vartype no_speech_probability_threshold: float or None
    """

    model_config = ConfigDict(extra="forbid")

    model: str = Field(default="whisper-1", min_length=1)
    response_format: str = Field(default="json", min_length=1)
    language: Optional[str] = Field(default=None, min_length=1)
    prompt: Optional[str] = Field(default=None, min_length=1)
    no_speech_probability_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_no_speech_threshold(self) -> "OpenAiSpeechToTextExtractorConfig":
        """
        Reject a no-speech threshold unless the response format is ``verbose_json``.

        :return: The validated configuration.
        :rtype: OpenAiSpeechToTextExtractorConfig
        :raises ValueError: If a threshold is set with any other response format.
        """
        threshold_requested = self.no_speech_probability_threshold is not None
        if threshold_requested and self.response_format != "verbose_json":
            raise ValueError(
                "no_speech_probability_threshold requires response_format='verbose_json' "
                "so the transcription API returns per-segment no-speech probabilities"
            )
        return self
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class OpenAiSpeechToTextExtractor(TextExtractor):
    """
    Extractor plugin that transcribes audio items using the OpenAI API.

    This extractor is intended as a practical, hosted speech to text implementation.
    It skips non-audio items.

    The transcription result is read defensively: it may be a mapping, a plain string,
    or an object exposing ``text`` and ``segments`` attributes.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "stt-openai"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: OpenAiSpeechToTextExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
        """
        # Dependency first, then credentials: same order as before the helpers existed.
        self._load_openai_client_class()
        self._require_api_key()
        return OpenAiSpeechToTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Transcribe an audio item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: OpenAiSpeechToTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not audio.
        :rtype: ExtractedText or None
        :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
        """
        _ = previous_extractions
        if not item.media_type.startswith("audio/"):
            return None

        parsed_config = (
            config
            if isinstance(config, OpenAiSpeechToTextExtractorConfig)
            else OpenAiSpeechToTextExtractorConfig.model_validate(config)
        )

        # Credentials first, then dependency: same order as before the helpers existed.
        api_key = self._require_api_key()
        client_class = self._load_openai_client_class()

        client = client_class(api_key=api_key)
        source_path = corpus.root / item.relpath
        with source_path.open("rb") as audio_handle:
            result = client.audio.transcriptions.create(
                file=audio_handle,
                model=parsed_config.model,
                response_format=parsed_config.response_format,
                language=parsed_config.language,
                prompt=parsed_config.prompt,
            )

        transcript_text = self._transcript_text(result)

        no_speech_probability_threshold = parsed_config.no_speech_probability_threshold
        if no_speech_probability_threshold is not None:
            probabilities = self._no_speech_probabilities(result)
            # A single segment at or above the threshold suppresses the whole transcript.
            if probabilities and max(probabilities) >= no_speech_probability_threshold:
                transcript_text = ""

        return ExtractedText(text=transcript_text.strip(), producer_extractor_id=self.extractor_id)

    @staticmethod
    def _require_api_key() -> str:
        """
        Resolve the OpenAI API key or fail the run.

        :return: The resolved API key.
        :rtype: str
        :raises ExtractionRunFatalError: If no API key can be resolved.
        """
        api_key = resolve_openai_api_key()
        if api_key is None:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an OpenAI API key. "
                "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
                "openai.api_key."
            )
        return api_key

    @staticmethod
    def _load_openai_client_class() -> Any:
        """
        Import the optional OpenAI client class lazily so the core installation stays small.

        :return: The ``openai.OpenAI`` client class.
        :rtype: Any
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        try:
            from openai import OpenAI
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an optional dependency. "
                'Install it with pip install "biblicus[openai]".'
            ) from import_error
        return OpenAI

    @staticmethod
    def _read_field(container: Any, name: str) -> Any:
        """
        Read ``name`` from a mapping key or an object attribute.

        :param container: Mapping or object to read from.
        :type container: Any
        :param name: Key or attribute name.
        :type name: str
        :return: The value, or None when it is absent.
        :rtype: Any
        """
        if isinstance(container, dict):
            return container.get(name)
        return getattr(container, name, None)

    @classmethod
    def _transcript_text(cls, result: Any) -> str:
        """
        Pull the transcript text out of a transcription result.

        :param result: Value returned by the transcription call.
        :type result: Any
        :return: Transcript text, or an empty string when none is present.
        :rtype: str
        """
        # NOTE(review): the SDK is expected to return a bare string for text-like
        # response formats; previously such a result was discarded as empty.
        if isinstance(result, str):
            return result
        return str(cls._read_field(result, "text") or "")

    @classmethod
    def _no_speech_probabilities(cls, result: Any) -> List[float]:
        """
        Collect per-segment no-speech probabilities from a transcription result.

        Segments may be mappings or objects. Segments without a numeric probability
        are ignored.

        :param result: Value returned by the transcription call.
        :type result: Any
        :return: Probabilities found, possibly empty.
        :rtype: list[float]
        """
        segments = cls._read_field(result, "segments")
        if not isinstance(segments, (list, tuple)):
            return []
        probabilities: List[float] = []
        for entry in segments:
            value = cls._read_field(entry, "no_speech_prob")
            if value is None:
                value = cls._read_field(entry, "no_speech_probability")
            if isinstance(value, (int, float)):
                probabilities.append(float(value))
        return probabilities
|
|
@@ -4,13 +4,13 @@ Pass-through extractor for text items.
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
-
from typing import Any, Dict, Optional
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
8
|
|
|
9
9
|
from pydantic import BaseModel, ConfigDict
|
|
10
10
|
|
|
11
11
|
from ..corpus import Corpus
|
|
12
12
|
from ..frontmatter import parse_front_matter
|
|
13
|
-
from ..models import CatalogItem, ExtractedText
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
14
|
from .base import TextExtractor
|
|
15
15
|
|
|
16
16
|
|
|
@@ -45,10 +45,16 @@ class PassThroughTextExtractor(TextExtractor):
|
|
|
45
45
|
:return: Parsed config.
|
|
46
46
|
:rtype: PassThroughTextExtractorConfig
|
|
47
47
|
"""
|
|
48
|
-
|
|
49
48
|
return PassThroughTextExtractorConfig.model_validate(config)
|
|
50
49
|
|
|
51
|
-
def extract_text(
|
|
50
|
+
def extract_text(
|
|
51
|
+
self,
|
|
52
|
+
*,
|
|
53
|
+
corpus: Corpus,
|
|
54
|
+
item: CatalogItem,
|
|
55
|
+
config: BaseModel,
|
|
56
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
57
|
+
) -> Optional[ExtractedText]:
|
|
52
58
|
"""
|
|
53
59
|
Extract text by reading the raw item content from the corpus.
|
|
54
60
|
|
|
@@ -58,11 +64,13 @@ class PassThroughTextExtractor(TextExtractor):
|
|
|
58
64
|
:type item: CatalogItem
|
|
59
65
|
:param config: Parsed configuration model.
|
|
60
66
|
:type config: PassThroughTextExtractorConfig
|
|
67
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
68
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
61
69
|
:return: Extracted text payload, or None if the item is not text.
|
|
62
70
|
:rtype: ExtractedText or None
|
|
63
71
|
"""
|
|
64
|
-
|
|
65
72
|
_ = config
|
|
73
|
+
_ = previous_extractions
|
|
66
74
|
media_type = item.media_type
|
|
67
75
|
if media_type != "text/markdown" and not media_type.startswith("text/"):
|
|
68
76
|
return None
|
|
@@ -71,4 +79,6 @@ class PassThroughTextExtractor(TextExtractor):
|
|
|
71
79
|
markdown_text = raw_bytes.decode("utf-8")
|
|
72
80
|
parsed_document = parse_front_matter(markdown_text)
|
|
73
81
|
return ExtractedText(text=parsed_document.body, producer_extractor_id=self.extractor_id)
|
|
74
|
-
return ExtractedText(
|
|
82
|
+
return ExtractedText(
|
|
83
|
+
text=raw_bytes.decode("utf-8"), producer_extractor_id=self.extractor_id
|
|
84
|
+
)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Portable Document Format text extractor plugin.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
from pypdf import PdfReader
|
|
12
|
+
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
|
+
from .base import TextExtractor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PortableDocumentFormatTextExtractorConfig(BaseModel):
    """
    Settings accepted by the Portable Document Format text extractor.

    Unknown keys are rejected.

    :ivar max_pages: Optional cap on the number of pages to process; at least 1 when set.
    :vartype max_pages: int or None
    """

    model_config = ConfigDict(extra="forbid")

    max_pages: Optional[int] = Field(default=None, ge=1)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PortableDocumentFormatTextExtractor(TextExtractor):
    """
    Extractor plugin that pulls text out of Portable Document Format items.

    Only items whose media type is `application/pdf` are handled; every other
    media type is skipped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pdf-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate the extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: PortableDocumentFormatTextExtractorConfig
        """
        parsed = PortableDocumentFormatTextExtractorConfig.model_validate(config)
        return parsed

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Read a Portable Document Format item and return the text of its pages.

        Page texts are joined with newlines and the combined text is stripped.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: PortableDocumentFormatTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not a Portable Document Format item.
        :rtype: ExtractedText or None
        """
        if item.media_type != "application/pdf":
            return None

        _ = previous_extractions
        if isinstance(config, PortableDocumentFormatTextExtractorConfig):
            settings = config
        else:
            settings = PortableDocumentFormatTextExtractorConfig.model_validate(config)

        document_bytes = (corpus.root / item.relpath).read_bytes()
        document = PdfReader(BytesIO(document_bytes))

        selected_pages = list(document.pages)
        page_limit = settings.max_pages
        if page_limit is not None:
            selected_pages = selected_pages[: int(page_limit)]

        # A page with no extractable text contributes an empty string.
        page_texts = [page.extract_text() or "" for page in selected_pages]

        return ExtractedText(
            text="\n".join(page_texts).strip(),
            producer_extractor_id=self.extractor_id,
        )
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline extractor configuration and validation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..errors import ExtractionRunFatalError
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
|
+
from .base import TextExtractor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PipelineStepSpec(BaseModel):
    """
    One extractor step inside a pipeline definition.

    Unknown keys are rejected.

    :ivar extractor_id: Non-empty extractor plugin identifier.
    :vartype extractor_id: str
    :ivar config: Extractor configuration mapping; defaults to an empty mapping.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    config: Dict[str, Any] = Field(default_factory=dict)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class PipelineExtractorConfig(BaseModel):
    """
    Settings accepted by the pipeline extractor.

    Unknown keys are rejected and at least one step is required.

    :ivar steps: Ordered list of extractor steps to run.
    :vartype steps: list[PipelineStepSpec]
    """

    model_config = ConfigDict(extra="forbid")

    steps: List[PipelineStepSpec] = Field(min_length=1)

    @model_validator(mode="after")
    def _forbid_pipeline_step(self) -> "PipelineExtractorConfig":
        """
        Reject pipelines that list the pipeline extractor as one of their own steps.

        :return: The validated configuration.
        :rtype: PipelineExtractorConfig
        :raises ValueError: If any step uses the ``pipeline`` extractor identifier.
        """
        for step in self.steps:
            if step.extractor_id == "pipeline":
                raise ValueError("Pipeline steps cannot include the pipeline extractor itself")
        return self
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class PipelineExtractor(TextExtractor):
    """
    Configuration-only stand-in for the pipeline extractor.

    The extraction engine runs pipelines itself so it can persist per-step
    artifacts. This class exists to validate configuration and refuses to
    extract text directly.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pipeline"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate a pipeline configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: PipelineExtractorConfig
        """
        parsed = PipelineExtractorConfig.model_validate(config)
        return parsed

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Refuse direct execution; pipelines are run by the extraction runner.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: PipelineExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :raises ExtractionRunFatalError: Always, because the pipeline is executed by the runner.
        :return: None.
        :rtype: None
        """
        # All arguments are intentionally unused.
        _ = (corpus, item, config, previous_extractions)
        message = "Pipeline extractor must be executed by the extraction runner."
        raise ExtractionRunFatalError(message)
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RapidOCR-backed optical character recognition extractor plugin.
|
|
3
|
+
|
|
4
|
+
This extractor is an optional dependency. It exists as a practical default for extracting text
|
|
5
|
+
from image items without requiring a separate daemon.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from ..corpus import Corpus
|
|
15
|
+
from ..errors import ExtractionRunFatalError
|
|
16
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RapidOcrExtractorConfig(BaseModel):
    """
    Settings accepted by the RapidOCR extractor.

    Unknown keys are rejected.

    :ivar min_confidence: Lowest per-line confidence, between 0.0 and 1.0, kept in the output.
    :vartype min_confidence: float
    :ivar joiner: String placed between recognized lines.
    :vartype joiner: str
    """

    model_config = ConfigDict(extra="forbid")

    min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    joiner: str = Field(default="\n")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class RapidOcrExtractor(TextExtractor):
    """
    Extractor plugin that performs optical character recognition on image items using RapidOCR.

    This extractor handles items whose media type starts with ``image/``.
    It returns an empty extracted text artifact when the image is handled but no text is recognized.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "ocr-rapidocr"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: RapidOcrExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is missing.
        """
        self._load_rapidocr_class()
        return RapidOcrExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text from an image item using optical character recognition.

        Recognized lines below ``min_confidence`` are dropped; the rest are stripped,
        joined with ``joiner``, and the joined text is stripped again.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: RapidOcrExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not an image.
        :rtype: ExtractedText or None
        :raises ExtractionRunFatalError: If the optional dependency is missing.
        """
        _ = previous_extractions
        media_type = item.media_type
        if not media_type.startswith("image/"):
            return None

        parsed_config = (
            config
            if isinstance(config, RapidOcrExtractorConfig)
            else RapidOcrExtractorConfig.model_validate(config)
        )

        # Guarded import: a missing dependency surfaces as the same fatal error
        # that validate_config raises, instead of a bare ImportError.
        rapidocr_class = self._load_rapidocr_class()

        source_path = corpus.root / item.relpath
        ocr = rapidocr_class()
        result, _elapsed = ocr(str(source_path))

        if result is None:
            return ExtractedText(text="", producer_extractor_id=self.extractor_id)

        lines: List[str] = []
        for entry in result:
            # Each entry is read as a sequence with text at index 1 and confidence at index 2.
            if not isinstance(entry, (list, tuple)) or len(entry) < 3:
                continue
            text_value = entry[1]
            if not isinstance(text_value, str):
                continue
            confidence = self._coerce_confidence(entry[2])
            if confidence is None or confidence < parsed_config.min_confidence:
                continue
            cleaned = text_value.strip()
            if cleaned:
                lines.append(cleaned)

        text = parsed_config.joiner.join(lines).strip()
        return ExtractedText(text=text, producer_extractor_id=self.extractor_id)

    @staticmethod
    def _load_rapidocr_class() -> Any:
        """
        Import the optional RapidOCR class lazily.

        :return: The ``rapidocr_onnxruntime.RapidOCR`` class.
        :rtype: Any
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        try:
            from rapidocr_onnxruntime import RapidOCR
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "RapidOCR extractor requires an optional dependency. "
                'Install it with pip install "biblicus[ocr]".'
            ) from import_error
        return RapidOCR

    @staticmethod
    def _coerce_confidence(value: Any) -> Optional[float]:
        """
        Convert a confidence score to ``float`` when it is a real number.

        :param value: Raw confidence value from the recognition result.
        :type value: Any
        :return: The confidence as a float, or None when the value is not a real number.
        :rtype: float or None
        """
        import numbers

        # numbers.Real covers int and float and also real-number types that do not
        # subclass float. NOTE(review): presumably relevant when the backend reports
        # NumPy float32 scores - confirm against the installed RapidOCR version.
        if isinstance(value, numbers.Real):
            return float(value)
        return None
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Selection extractor that chooses the longest available text from previous pipeline outputs.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
|
+
|
|
11
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
12
|
+
from .base import TextExtractor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SelectLongestTextExtractorConfig(BaseModel):
    """
    Settings accepted by the longest text selection extractor.

    No options are exposed in version zero; any supplied key is rejected.
    """

    model_config = ConfigDict(extra="forbid")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SelectLongestTextExtractor(TextExtractor):
    """
    Extractor plugin that picks the longest text among previous pipeline outputs.

    No attempt is made to score semantic quality. This is a deterministic
    selection policy for items where several steps can each produce usable text.

    Selection rules:

    - Among prior texts that are non-empty after stripping whitespace, the one with
      the greatest stripped character count wins.
    - Ties go to the earliest pipeline step index.
    - When prior texts exist but none is usable, the earliest one is selected even
      if it is empty.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-longest-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate the selection extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectLongestTextExtractorConfig
        """
        parsed = SelectLongestTextExtractorConfig.model_validate(config)
        return parsed

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Choose the longest extracted text from previous pipeline outputs.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectLongestTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload or None when no prior outputs exist.
        :rtype: ExtractedText or None
        """
        # Only the prior outputs matter here; the other arguments are unused.
        _ = (corpus, item, config)

        def stripped_length(entry: ExtractionStepOutput) -> int:
            return len(entry.text.strip())

        def step_order(entry: ExtractionStepOutput) -> int:
            return int(entry.step_index)

        with_text = [entry for entry in previous_extractions if entry.text is not None]
        if not with_text:
            return None

        non_blank = [entry for entry in with_text if stripped_length(entry) > 0]
        if non_blank:
            longest = max(stripped_length(entry) for entry in non_blank)
            contenders = [entry for entry in non_blank if stripped_length(entry) == longest]
            chosen = min(contenders, key=step_order)
        else:
            # Nothing usable: fall back to the earliest output, even if empty.
            chosen = min(with_text, key=step_order)

        return ExtractedText(
            text=chosen.text or "",
            producer_extractor_id=chosen.producer_extractor_id or chosen.extractor_id,
            source_step_index=chosen.step_index,
        )
|