biblicus 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +30 -0
- biblicus/__main__.py +8 -0
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +42 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +375 -0
- biblicus/backends/sqlite_full_text_search.py +487 -0
- biblicus/cli.py +804 -0
- biblicus/constants.py +12 -0
- biblicus/context.py +183 -0
- biblicus/corpus.py +1531 -0
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +257 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +531 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +89 -0
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/knowledge_base.py +191 -0
- biblicus/models.py +445 -0
- biblicus/retrieval.py +133 -0
- biblicus/sources.py +212 -0
- biblicus/time.py +17 -0
- biblicus/uris.py +63 -0
- biblicus/user_config.py +138 -0
- biblicus-0.6.0.dist-info/METADATA +533 -0
- biblicus-0.6.0.dist-info/RECORD +48 -0
- biblicus-0.6.0.dist-info/WHEEL +5 -0
- biblicus-0.6.0.dist-info/entry_points.txt +2 -0
- biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
- biblicus-0.6.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Metadata-based text extractor plugin.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
|
|
11
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
12
|
+
from .base import TextExtractor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MetadataTextExtractorConfig(BaseModel):
    """
    Options selecting which catalog metadata fields the metadata text extractor emits.

    Both switches default to enabled. Unknown configuration keys are rejected
    because the model is declared with ``extra="forbid"``.

    :ivar include_title: Emit the item title as the first output line when a title is present.
    :vartype include_title: bool
    :ivar include_tags: Emit a ``tags: ...`` line when the item carries tags.
    :vartype include_tags: bool
    """

    model_config = ConfigDict(extra="forbid")

    include_title: bool = True
    include_tags: bool = True
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MetadataTextExtractor(TextExtractor):
    """
    Extractor plugin that renders an item's catalog metadata as a short text payload.

    Output layout (each part is optional and controlled by the configuration):

    - the stripped title on the first line, when the item has a non-blank title;
    - a ``tags: <comma separated tags>`` line, when the item has non-blank tags.

    The item bytes are never read, so the output depends only on catalog metadata.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "metadata-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse a raw configuration mapping into the extractor's config model.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MetadataTextExtractorConfig
        """
        parsed = MetadataTextExtractorConfig.model_validate(config)
        return parsed

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Build the metadata text payload for a single catalog item.

        :param corpus: Corpus containing the item (unused by this extractor).
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MetadataTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item (unused by this extractor).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or ``None`` when no line could be produced.
        :rtype: ExtractedText or None
        """
        # Accept either an already-parsed model or anything the model can validate.
        if isinstance(config, MetadataTextExtractorConfig):
            options = config
        else:
            options = MetadataTextExtractorConfig.model_validate(config)

        # Neither the corpus nor earlier pipeline outputs influence the result.
        del corpus, previous_extractions

        output_lines: list[str] = []

        if options.include_title and isinstance(item.title, str):
            stripped_title = item.title.strip()
            if stripped_title:
                output_lines.append(stripped_title)

        # Tags are normalised unconditionally; non-string and blank entries are dropped.
        cleaned_tags: list[str] = []
        for raw_tag in item.tags:
            if not isinstance(raw_tag, str):
                continue
            cleaned_tag = raw_tag.strip()
            if cleaned_tag:
                cleaned_tags.append(cleaned_tag)

        if options.include_tags and cleaned_tags:
            output_lines.append("tags: " + ", ".join(cleaned_tags))

        if output_lines:
            return ExtractedText(
                text="\n".join(output_lines),
                producer_extractor_id=self.extractor_id,
            )
        return None
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OpenAI-backed speech to text extractor plugin.
|
|
3
|
+
|
|
4
|
+
This extractor is implemented as an optional dependency so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
12
|
+
|
|
13
|
+
from ..corpus import Corpus
|
|
14
|
+
from ..errors import ExtractionRunFatalError
|
|
15
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
|
+
from ..user_config import resolve_openai_api_key
|
|
17
|
+
from .base import TextExtractor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class OpenAiSpeechToTextExtractorConfig(BaseModel):
    """
    Settings for transcribing audio through the OpenAI transcription endpoint.

    Unknown configuration keys are rejected because the model is declared with
    ``extra="forbid"``.

    :ivar model: OpenAI transcription model identifier.
    :vartype model: str
    :ivar response_format: OpenAI transcription response format.
    :vartype response_format: str
    :ivar language: Optional language code hint for transcription.
    :vartype language: str or None
    :ivar prompt: Optional prompt text to guide transcription.
    :vartype prompt: str or None
    :ivar no_speech_probability_threshold: Optional threshold in ``[0.0, 1.0]`` used to
        suppress transcripts; only valid together with ``response_format="verbose_json"``.
    :vartype no_speech_probability_threshold: float or None
    """

    model_config = ConfigDict(extra="forbid")

    model: str = Field(default="whisper-1", min_length=1)
    response_format: str = Field(default="json", min_length=1)
    language: Optional[str] = Field(default=None, min_length=1)
    prompt: Optional[str] = Field(default=None, min_length=1)
    no_speech_probability_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_no_speech_threshold(self) -> "OpenAiSpeechToTextExtractorConfig":
        # A threshold is only meaningful when the response carries per-segment data.
        threshold_requested = self.no_speech_probability_threshold is not None
        if threshold_requested and self.response_format != "verbose_json":
            raise ValueError(
                "no_speech_probability_threshold requires response_format='verbose_json' "
                "so the transcription API returns per-segment no-speech probabilities"
            )
        return self
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class OpenAiSpeechToTextExtractor(TextExtractor):
    """
    Extractor plugin that transcribes audio items using the OpenAI API.

    This extractor is intended as a practical, hosted speech to text implementation.
    It skips items whose media type does not start with ``audio/``.

    The transcription result is accepted in three shapes: a plain string, a mapping,
    or an object exposing ``text`` / ``segments`` attributes.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "stt-openai"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: OpenAiSpeechToTextExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
        """
        try:
            from openai import OpenAI  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an optional dependency. "
                'Install it with pip install "biblicus[openai]".'
            ) from import_error

        api_key = resolve_openai_api_key()
        if api_key is None:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an OpenAI API key. "
                "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
                "openai.api_key."
            )

        return OpenAiSpeechToTextExtractorConfig.model_validate(config)

    @staticmethod
    def _read_field(container: Any, name: str, default: Any = None) -> Any:
        """
        Read ``name`` from a mapping key or an object attribute.

        :param container: Mapping or object to read from.
        :type container: Any
        :param name: Key or attribute name.
        :type name: str
        :param default: Value returned when the key or attribute is absent.
        :type default: Any
        :return: The stored value, or ``default``.
        :rtype: Any
        """
        if isinstance(container, dict):
            return container.get(name, default)
        return getattr(container, name, default)

    @classmethod
    def _exceeds_no_speech_threshold(cls, result: Any, threshold: float) -> bool:
        """
        Report whether any segment's no-speech probability reaches ``threshold``.

        Segments without a numeric probability are ignored; a result without usable
        segments never exceeds the threshold.

        :param result: Transcription result (mapping or object).
        :type result: Any
        :param threshold: Inclusive probability threshold.
        :type threshold: float
        :return: ``True`` when the highest reported probability is at or above the threshold.
        :rtype: bool
        """
        segments = cls._read_field(result, "segments")
        if not isinstance(segments, (list, tuple)) or not segments:
            return False
        probabilities: list[float] = []
        for entry in segments:
            # Prefer ``no_speech_prob``; fall back to the longer spelling when it is absent.
            value = cls._read_field(
                entry,
                "no_speech_prob",
                cls._read_field(entry, "no_speech_probability"),
            )
            if isinstance(value, (int, float)):
                probabilities.append(float(value))
        return bool(probabilities) and max(probabilities) >= threshold

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Transcribe an audio item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: OpenAiSpeechToTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not audio.
        :rtype: ExtractedText or None
        :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
        """
        _ = previous_extractions
        if not item.media_type.startswith("audio/"):
            return None

        parsed_config = (
            config
            if isinstance(config, OpenAiSpeechToTextExtractorConfig)
            else OpenAiSpeechToTextExtractorConfig.model_validate(config)
        )

        api_key = resolve_openai_api_key()
        if api_key is None:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an OpenAI API key. "
                "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
                "openai.api_key."
            )

        try:
            from openai import OpenAI
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an optional dependency. "
                'Install it with pip install "biblicus[openai]".'
            ) from import_error

        client = OpenAI(api_key=api_key)
        source_path = corpus.root / item.relpath
        with source_path.open("rb") as audio_handle:
            result = client.audio.transcriptions.create(
                file=audio_handle,
                model=parsed_config.model,
                response_format=parsed_config.response_format,
                language=parsed_config.language,
                prompt=parsed_config.prompt,
            )

        if isinstance(result, str):
            # Plain-text response formats yield the transcript itself, with no
            # ``text`` field to read and no segment data to inspect.
            transcript_text = result
        else:
            transcript_text = str(self._read_field(result, "text") or "")
            threshold = parsed_config.no_speech_probability_threshold
            # Apply the threshold to mapping and object results alike, so it is
            # not silently skipped when the client returns a response object.
            if threshold is not None and self._exceeds_no_speech_threshold(result, threshold):
                transcript_text = ""

        return ExtractedText(text=transcript_text.strip(), producer_extractor_id=self.extractor_id)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pass-through extractor for text items.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..frontmatter import parse_front_matter
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
|
+
from .base import TextExtractor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PassThroughTextExtractorConfig(BaseModel):
    """
    Empty configuration model for the pass-through text extractor.

    The extractor has no options. The model exists so that any supplied key is
    rejected (``extra="forbid"``) instead of being silently ignored.
    """

    model_config = ConfigDict(extra="forbid")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class PassThroughTextExtractor(TextExtractor):
    """
    Extractor plugin that returns the stored text of ``text/*`` items.

    Item bytes are decoded as UTF-8. For ``text/markdown`` items the front matter
    is parsed off and only the body is returned. Items of any other media type
    are skipped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pass-through-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse a raw configuration mapping into the extractor's config model.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: PassThroughTextExtractorConfig
        """
        parsed = PassThroughTextExtractorConfig.model_validate(config)
        return parsed

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Read a text item from the corpus and return its decoded content.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model (unused by this extractor).
        :type config: PassThroughTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item (unused by this extractor).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None if the item is not text.
        :rtype: ExtractedText or None
        """
        del config, previous_extractions

        media_type = item.media_type
        # ``text/markdown`` is itself a ``text/`` type, so one prefix test covers both.
        if not media_type.startswith("text/"):
            return None

        item_path = corpus.root / item.relpath
        decoded_text = item_path.read_bytes().decode("utf-8")

        if media_type == "text/markdown":
            # Markdown items may carry front matter; only the body is extracted text.
            decoded_text = parse_front_matter(decoded_text).body

        return ExtractedText(text=decoded_text, producer_extractor_id=self.extractor_id)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Portable Document Format text extractor plugin.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
from pypdf import PdfReader
|
|
12
|
+
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
|
+
from .base import TextExtractor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PortableDocumentFormatTextExtractorConfig(BaseModel):
    """
    Options for extracting text from Portable Document Format items.

    Unknown configuration keys are rejected because the model is declared with
    ``extra="forbid"``.

    :ivar max_pages: Optional cap (at least 1) on the number of leading pages to process;
        ``None`` processes every page.
    :vartype max_pages: int or None
    """

    model_config = ConfigDict(extra="forbid")

    max_pages: Optional[int] = Field(default=None, ge=1)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PortableDocumentFormatTextExtractor(TextExtractor):
    """
    Extractor plugin that pulls text out of Portable Document Format items.

    Only items whose media type is exactly `application/pdf` are handled; every
    other item is skipped. Page texts are joined with newlines and the combined
    text is stripped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pdf-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse a raw configuration mapping into the extractor's config model.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: PortableDocumentFormatTextExtractorConfig
        """
        parsed = PortableDocumentFormatTextExtractorConfig.model_validate(config)
        return parsed

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract the text of a Portable Document Format item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: PortableDocumentFormatTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item (unused by this extractor).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not a Portable Document Format item.
        :rtype: ExtractedText or None
        """
        if item.media_type != "application/pdf":
            return None

        del previous_extractions

        # Accept either an already-parsed model or anything the model can validate.
        if isinstance(config, PortableDocumentFormatTextExtractorConfig):
            options = config
        else:
            options = PortableDocumentFormatTextExtractorConfig.model_validate(config)

        document_bytes = (corpus.root / item.relpath).read_bytes()
        reader = PdfReader(BytesIO(document_bytes))

        selected_pages = list(reader.pages)
        page_limit = options.max_pages
        if page_limit is not None:
            selected_pages = selected_pages[: int(page_limit)]

        # A page with no extractable text contributes an empty string.
        page_texts = [page.extract_text() or "" for page in selected_pages]

        return ExtractedText(
            text="\n".join(page_texts).strip(),
            producer_extractor_id=self.extractor_id,
        )
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline extractor configuration and validation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..errors import ExtractionRunFatalError
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
|
+
from .base import TextExtractor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PipelineStepSpec(BaseModel):
    """
    Description of one extractor step inside a pipeline.

    Unknown keys are rejected because the model is declared with ``extra="forbid"``.

    :ivar extractor_id: Non-empty extractor plugin identifier.
    :vartype extractor_id: str
    :ivar config: Extractor configuration mapping; defaults to an empty mapping.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    config: Dict[str, Any] = Field(default_factory=dict)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class PipelineExtractorConfig(BaseModel):
    """
    Configuration model for the pipeline extractor.

    At least one step is required, and no step may name the ``pipeline`` extractor.
    Unknown keys are rejected because the model is declared with ``extra="forbid"``.

    :ivar steps: Ordered list of extractor steps to run.
    :vartype steps: list[PipelineStepSpec]
    """

    model_config = ConfigDict(extra="forbid")

    steps: List[PipelineStepSpec] = Field(min_length=1)

    @model_validator(mode="after")
    def _forbid_pipeline_step(self) -> "PipelineExtractorConfig":
        # Reject a pipeline that lists the pipeline extractor as one of its own steps.
        for step in self.steps:
            if step.extractor_id == "pipeline":
                raise ValueError("Pipeline steps cannot include the pipeline extractor itself")
        return self
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class PipelineExtractor(TextExtractor):
    """
    Configuration-only stand-in for the pipeline extractor.

    The pipeline is run by the extraction engine so that per-step artifacts can be
    persisted. This class validates pipeline configuration and refuses to be
    executed directly.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pipeline"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse a raw configuration mapping into the pipeline config model.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: PipelineExtractorConfig
        """
        parsed = PipelineExtractorConfig.model_validate(config)
        return parsed

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Refuse direct execution; this method never returns a value.

        :param corpus: Corpus containing the item bytes (unused).
        :type corpus: Corpus
        :param item: Catalog item being processed (unused).
        :type item: CatalogItem
        :param config: Parsed configuration model (unused).
        :type config: PipelineExtractorConfig
        :param previous_extractions: Prior step outputs for this item (unused).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :raises ExtractionRunFatalError: Always, because the pipeline is executed by the runner.
        """
        # The arguments exist only to satisfy the extractor interface.
        del corpus, item, config, previous_extractions
        message = "Pipeline extractor must be executed by the extraction runner."
        raise ExtractionRunFatalError(message)
|