biblicus 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +30 -0
- biblicus/__main__.py +8 -0
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +42 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +375 -0
- biblicus/backends/sqlite_full_text_search.py +487 -0
- biblicus/cli.py +804 -0
- biblicus/constants.py +12 -0
- biblicus/context.py +183 -0
- biblicus/corpus.py +1531 -0
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +257 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +531 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +89 -0
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/knowledge_base.py +191 -0
- biblicus/models.py +445 -0
- biblicus/retrieval.py +133 -0
- biblicus/sources.py +212 -0
- biblicus/time.py +17 -0
- biblicus/uris.py +63 -0
- biblicus/user_config.py +138 -0
- biblicus-0.6.0.dist-info/METADATA +533 -0
- biblicus-0.6.0.dist-info/RECORD +48 -0
- biblicus-0.6.0.dist-info/WHEEL +5 -0
- biblicus-0.6.0.dist-info/entry_points.txt +2 -0
- biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
- biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/extraction.py
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text extraction runs for Biblicus.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
12
|
+
|
|
13
|
+
from .corpus import Corpus
|
|
14
|
+
from .errors import ExtractionRunFatalError
|
|
15
|
+
from .extractors import get_extractor
|
|
16
|
+
from .extractors.base import TextExtractor
|
|
17
|
+
from .extractors.pipeline import PipelineExtractorConfig, PipelineStepSpec
|
|
18
|
+
from .models import CatalogItem, ExtractionStepOutput
|
|
19
|
+
from .retrieval import hash_text
|
|
20
|
+
from .time import utc_now_iso
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ExtractionRecipeManifest(BaseModel):
    """
    Reproducible configuration for an extraction plugin run.

    A recipe captures which extractor is used, under which name, and with which
    configuration values. Unknown fields are rejected during validation.

    :ivar recipe_id: Deterministic recipe identifier.
    :vartype recipe_id: str
    :ivar extractor_id: Extractor plugin identifier.
    :vartype extractor_id: str
    :ivar name: Human-readable recipe name.
    :vartype name: str
    :ivar created_at: International Organization for Standardization 8601 timestamp.
    :vartype created_at: str
    :ivar config: Extractor-specific configuration values.
    :vartype config: dict[str, Any]
    """

    # extra="forbid": a payload with unexpected keys fails validation instead of
    # silently dropping them.
    model_config = ConfigDict(extra="forbid")

    recipe_id: str
    extractor_id: str
    name: str
    created_at: str
    # default_factory gives every instance its own dict rather than a shared one.
    config: Dict[str, Any] = Field(default_factory=dict)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ExtractionStepResult(BaseModel):
    """
    Per-item result record for a single pipeline step.

    One record is stored for every step that ran against an item, whether the
    step produced text, skipped the item, or raised an error.

    :ivar step_index: One-based pipeline step index.
    :vartype step_index: int
    :ivar extractor_id: Extractor identifier for the step.
    :vartype extractor_id: str
    :ivar status: Step status, extracted, skipped, or errored.
    :vartype status: str
    :ivar text_relpath: Relative path to the step text artifact, when extracted.
    :vartype text_relpath: str or None
    :ivar text_characters: Character count of the extracted text.
    :vartype text_characters: int
    :ivar producer_extractor_id: Extractor identifier that produced the text content.
    :vartype producer_extractor_id: str or None
    :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
    :vartype source_step_index: int or None
    :ivar error_type: Optional error type name for errored steps.
    :vartype error_type: str or None
    :ivar error_message: Optional error message for errored steps.
    :vartype error_message: str or None
    """

    # extra="forbid": unexpected keys fail validation instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    # ge=1 enforces the one-based convention; zero or negative values are rejected.
    step_index: int = Field(ge=1)
    extractor_id: str
    # Plain string: the three documented values are a convention, not enforced by the type.
    status: str
    text_relpath: Optional[str] = None
    text_characters: int = Field(default=0, ge=0)
    producer_extractor_id: Optional[str] = None
    source_step_index: Optional[int] = Field(default=None, ge=1)
    error_type: Optional[str] = None
    error_message: Optional[str] = None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class ExtractionItemResult(BaseModel):
    """
    Per-item result record for an extraction run.

    The ``final_*`` fields describe the step output chosen as the item's final
    text; they stay ``None`` when no step produced text for the item.

    :ivar item_id: Item identifier.
    :vartype item_id: str
    :ivar status: Final result status, extracted, skipped, or errored.
    :vartype status: str
    :ivar final_text_relpath: Relative path to the final extracted text artifact, when extracted.
    :vartype final_text_relpath: str or None
    :ivar final_step_index: Pipeline step index that produced the final text.
    :vartype final_step_index: int or None
    :ivar final_step_extractor_id: Extractor identifier of the step that produced the final text.
    :vartype final_step_extractor_id: str or None
    :ivar final_producer_extractor_id: Extractor identifier that produced the final text content.
    :vartype final_producer_extractor_id: str or None
    :ivar final_source_step_index: Optional step index that supplied the final text for selection-style extractors.
    :vartype final_source_step_index: int or None
    :ivar error_type: Optional error type name when no extracted text was produced.
    :vartype error_type: str or None
    :ivar error_message: Optional error message when no extracted text was produced.
    :vartype error_message: str or None
    :ivar step_results: Per-step results recorded for this item.
    :vartype step_results: list[ExtractionStepResult]
    """

    # extra="forbid": unexpected keys fail validation instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    item_id: str
    # Plain string: the three documented values are a convention, not enforced by the type.
    status: str
    final_text_relpath: Optional[str] = None
    # Step indices are one-based, hence ge=1 on both index fields.
    final_step_index: Optional[int] = Field(default=None, ge=1)
    final_step_extractor_id: Optional[str] = None
    final_producer_extractor_id: Optional[str] = None
    final_source_step_index: Optional[int] = Field(default=None, ge=1)
    error_type: Optional[str] = None
    error_message: Optional[str] = None
    # default_factory gives every instance its own list rather than a shared one.
    step_results: List[ExtractionStepResult] = Field(default_factory=list)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class ExtractionRunManifest(BaseModel):
    """
    Immutable record describing an extraction run.

    Immutability is a convention rather than something this model enforces: the
    model configuration only forbids extra fields and does not freeze instances.

    :ivar run_id: Unique run identifier.
    :vartype run_id: str
    :ivar recipe: Recipe manifest for this run.
    :vartype recipe: ExtractionRecipeManifest
    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
    :vartype corpus_uri: str
    :ivar catalog_generated_at: Catalog timestamp used for the run.
    :vartype catalog_generated_at: str
    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
    :vartype created_at: str
    :ivar items: Per-item results.
    :vartype items: list[ExtractionItemResult]
    :ivar stats: Run statistics.
    :vartype stats: dict[str, Any]
    """

    # extra="forbid": unexpected keys fail validation instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    run_id: str
    recipe: ExtractionRecipeManifest
    corpus_uri: str
    catalog_generated_at: str
    created_at: str
    # default_factory gives every instance its own containers rather than shared ones.
    items: List[ExtractionItemResult] = Field(default_factory=list)
    stats: Dict[str, Any] = Field(default_factory=dict)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def create_extraction_recipe_manifest(
    *, extractor_id: str, name: str, config: Dict[str, Any]
) -> ExtractionRecipeManifest:
    """
    Build a recipe manifest whose identifier depends only on its inputs.

    The identifier is the hash of a key-sorted JSON document holding the
    extractor identifier, the recipe name, and the configuration. The creation
    timestamp is not part of that document, so equal inputs give equal
    identifiers.

    :param extractor_id: Extractor plugin identifier.
    :type extractor_id: str
    :param name: Human recipe name.
    :type name: str
    :param config: Extractor configuration.
    :type config: dict[str, Any]
    :return: Recipe manifest.
    :rtype: ExtractionRecipeManifest
    """
    identity_fields = {"extractor_id": extractor_id, "name": name, "config": config}
    # sort_keys makes the serialized form independent of dict insertion order.
    canonical_json = json.dumps(identity_fields, sort_keys=True)
    return ExtractionRecipeManifest(
        recipe_id=hash_text(canonical_json),
        extractor_id=extractor_id,
        name=name,
        created_at=utc_now_iso(),
        config=config,
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def create_extraction_run_manifest(
    corpus: Corpus, *, recipe: ExtractionRecipeManifest
) -> ExtractionRunManifest:
    """
    Start an empty run manifest for a corpus and recipe.

    The run identifier is the hash of the recipe identifier joined with the
    catalog's generation timestamp. The returned manifest has no items and no
    statistics yet.

    :param corpus: Corpus associated with the run.
    :type corpus: Corpus
    :param recipe: Recipe manifest.
    :type recipe: ExtractionRecipeManifest
    :return: Run manifest.
    :rtype: ExtractionRunManifest
    """
    catalog_timestamp = corpus.load_catalog().generated_at
    run_key = f"{recipe.recipe_id}:{catalog_timestamp}"
    return ExtractionRunManifest(
        run_id=hash_text(run_key),
        recipe=recipe,
        corpus_uri=corpus.uri,
        catalog_generated_at=catalog_timestamp,
        created_at=utc_now_iso(),
        items=[],
        stats={},
    )
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def write_extraction_run_manifest(*, run_dir: Path, manifest: ExtractionRunManifest) -> None:
    """
    Save a run manifest as ``manifest.json`` inside the run directory.

    The manifest is serialized as two-space-indented JSON followed by a single
    trailing newline and written as UTF-8.

    :param run_dir: Extraction run directory.
    :type run_dir: Path
    :param manifest: Run manifest to write.
    :type manifest: ExtractionRunManifest
    :return: None.
    :rtype: None
    """
    serialized = manifest.model_dump_json(indent=2) + "\n"
    (run_dir / "manifest.json").write_text(serialized, encoding="utf-8")
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str) -> str:
    """
    Write an extracted text artifact for an item into the run directory.

    The text is stored as UTF-8 at ``text/<item id>.txt`` below ``run_dir``; the
    ``text`` directory is created when missing.

    The returned relative path always uses forward slashes. Callers persist it
    in the run manifest, so it must not depend on the platform that produced
    the run.

    :param run_dir: Extraction run directory.
    :type run_dir: Path
    :param item: Catalog item being extracted.
    :type item: CatalogItem
    :param text: Extracted text.
    :type text: str
    :return: Relative path to the stored text artifact.
    :rtype: str
    """
    text_dir = run_dir / "text"
    text_dir.mkdir(parents=True, exist_ok=True)
    relative_path = Path("text") / f"{item.id}.txt"
    (run_dir / relative_path).write_text(text, encoding="utf-8")
    # as_posix() rather than str(): str() yields backslashes on Windows.
    return relative_path.as_posix()
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _pipeline_step_dir_name(*, step_index: int, extractor_id: str) -> str:
|
|
247
|
+
"""
|
|
248
|
+
Build a stable directory name for a pipeline step.
|
|
249
|
+
|
|
250
|
+
:param step_index: One-based pipeline step index.
|
|
251
|
+
:type step_index: int
|
|
252
|
+
:param extractor_id: Extractor identifier for the step.
|
|
253
|
+
:type extractor_id: str
|
|
254
|
+
:return: Directory name for the step.
|
|
255
|
+
:rtype: str
|
|
256
|
+
"""
|
|
257
|
+
return f"{step_index:02d}-{extractor_id}"
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def write_pipeline_step_text_artifact(
    *,
    run_dir: Path,
    step_index: int,
    extractor_id: str,
    item: CatalogItem,
    text: str,
) -> str:
    """
    Write a pipeline step text artifact for an item.

    The text is stored as UTF-8 at ``steps/<step dir>/text/<item id>.txt`` below
    ``run_dir``, where the step directory name comes from
    :func:`_pipeline_step_dir_name`. Missing directories are created.

    The returned relative path always uses forward slashes. Callers persist it
    in the run manifest, so it must not depend on the platform that produced
    the run.

    :param run_dir: Extraction run directory.
    :type run_dir: Path
    :param step_index: One-based pipeline step index.
    :type step_index: int
    :param extractor_id: Extractor identifier for the step.
    :type extractor_id: str
    :param item: Catalog item being extracted.
    :type item: CatalogItem
    :param text: Extracted text content.
    :type text: str
    :return: Relative path to the stored step text artifact.
    :rtype: str
    """
    step_dir_name = _pipeline_step_dir_name(step_index=step_index, extractor_id=extractor_id)
    relative_dir = Path("steps") / step_dir_name / "text"
    (run_dir / relative_dir).mkdir(parents=True, exist_ok=True)
    relative_path = relative_dir / f"{item.id}.txt"
    (run_dir / relative_path).write_text(text, encoding="utf-8")
    # as_posix() rather than str(): str() yields backslashes on Windows.
    return relative_path.as_posix()
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _final_output_from_steps(
|
|
293
|
+
step_outputs: List[ExtractionStepOutput],
|
|
294
|
+
) -> Optional[ExtractionStepOutput]:
|
|
295
|
+
"""
|
|
296
|
+
Select the final pipeline output for an item.
|
|
297
|
+
|
|
298
|
+
The final output is the last extracted step output in pipeline order.
|
|
299
|
+
|
|
300
|
+
:param step_outputs: Extracted outputs produced by pipeline steps.
|
|
301
|
+
:type step_outputs: list[biblicus.models.ExtractionStepOutput]
|
|
302
|
+
:return: Final step output or None when no steps produced extracted text.
|
|
303
|
+
:rtype: biblicus.models.ExtractionStepOutput or None
|
|
304
|
+
"""
|
|
305
|
+
if not step_outputs:
|
|
306
|
+
return None
|
|
307
|
+
return step_outputs[-1]
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _run_pipeline_steps_for_item(
    *,
    corpus: Corpus,
    run_dir: Path,
    item: CatalogItem,
    validated_steps: List[Tuple[PipelineStepSpec, TextExtractor, BaseModel]],
) -> Tuple[
    List[ExtractionStepResult], List[ExtractionStepOutput], Optional[str], Optional[str]
]:
    """
    Run every validated pipeline step against a single catalog item.

    Each step receives the outputs of the earlier steps that produced text for
    this item. A step that raises is recorded as errored and the pipeline moves
    on to the next step; a step that returns None is recorded as skipped.

    :param corpus: Corpus containing the item.
    :type corpus: Corpus
    :param run_dir: Extraction run directory that receives step artifacts.
    :type run_dir: Path
    :param item: Catalog item to process.
    :type item: CatalogItem
    :param validated_steps: Step specification, extractor, and parsed configuration per step.
    :type validated_steps: list[tuple[PipelineStepSpec, TextExtractor, pydantic.BaseModel]]
    :return: Step results, extracted step outputs, and the type name and message
        of the last step error (both None when no step raised).
    :rtype: tuple[list[ExtractionStepResult], list[ExtractionStepOutput], str or None, str or None]
    :raises ExtractionRunFatalError: If a step raises a fatal extraction error.
    :raises OSError: If a step artifact cannot be written.
    """
    step_results: List[ExtractionStepResult] = []
    step_outputs: List[ExtractionStepOutput] = []
    last_error_type: Optional[str] = None
    last_error_message: Optional[str] = None

    for step_index, (step, step_extractor, parsed_step_config) in enumerate(
        validated_steps, start=1
    ):
        try:
            extracted_text = step_extractor.extract_text(
                corpus=corpus,
                item=item,
                config=parsed_step_config,
                previous_extractions=step_outputs,
            )
        except ExtractionRunFatalError:
            # Fatal errors abort the whole run instead of being recorded per step.
            raise
        except Exception as extraction_error:
            # Any other failure is recorded on the step so that the remaining
            # steps and items still get processed.
            last_error_type = extraction_error.__class__.__name__
            last_error_message = str(extraction_error)
            step_results.append(
                ExtractionStepResult(
                    step_index=step_index,
                    extractor_id=step.extractor_id,
                    status="errored",
                    text_relpath=None,
                    text_characters=0,
                    producer_extractor_id=None,
                    source_step_index=None,
                    error_type=last_error_type,
                    error_message=last_error_message,
                )
            )
            continue

        if extracted_text is None:
            # None means the extractor deliberately skipped this item.
            step_results.append(
                ExtractionStepResult(
                    step_index=step_index,
                    extractor_id=step.extractor_id,
                    status="skipped",
                    text_relpath=None,
                    text_characters=0,
                    producer_extractor_id=None,
                    source_step_index=None,
                    error_type=None,
                    error_message=None,
                )
            )
            continue

        relpath = write_pipeline_step_text_artifact(
            run_dir=run_dir,
            step_index=step_index,
            extractor_id=step.extractor_id,
            item=item,
            text=extracted_text.text,
        )
        text_characters = len(extracted_text.text)
        step_results.append(
            ExtractionStepResult(
                step_index=step_index,
                extractor_id=step.extractor_id,
                status="extracted",
                text_relpath=relpath,
                text_characters=text_characters,
                producer_extractor_id=extracted_text.producer_extractor_id,
                source_step_index=extracted_text.source_step_index,
                error_type=None,
                error_message=None,
            )
        )
        step_outputs.append(
            ExtractionStepOutput(
                step_index=step_index,
                extractor_id=step.extractor_id,
                status="extracted",
                text=extracted_text.text,
                text_characters=text_characters,
                producer_extractor_id=extracted_text.producer_extractor_id,
                source_step_index=extracted_text.source_step_index,
                error_type=None,
                error_message=None,
            )
        )

    return step_results, step_outputs, last_error_type, last_error_message


def build_extraction_run(
    corpus: Corpus,
    *,
    extractor_id: str,
    recipe_name: str,
    config: Dict[str, Any],
) -> ExtractionRunManifest:
    """
    Build an extraction run for a corpus using the pipeline extractor.

    When the run directory for the computed run identifier already exists, the
    stored manifest is loaded and returned without extracting anything.
    Otherwise every catalog item is passed through the pipeline steps, the step
    and final text artifacts are written, and the manifest is saved.

    The extractor and all step configurations are validated before the run
    directory is created, so a rejected configuration does not leave an empty
    run directory that a later call would mistake for an existing run.

    :param corpus: Corpus to extract from.
    :type corpus: Corpus
    :param extractor_id: Extractor plugin identifier (must be ``pipeline``).
    :type extractor_id: str
    :param recipe_name: Human-readable recipe name.
    :type recipe_name: str
    :param config: Extractor configuration mapping.
    :type config: dict[str, Any]
    :return: Extraction run manifest describing the build.
    :rtype: ExtractionRunManifest
    :raises KeyError: If the extractor identifier is unknown.
    :raises ValueError: If the extractor configuration is invalid.
    :raises OSError: If the run directory or artifacts cannot be written.
    :raises ExtractionRunFatalError: If the extractor is not the pipeline.
    """
    extractor = get_extractor(extractor_id)
    parsed_config = extractor.validate_config(config)
    recipe = create_extraction_recipe_manifest(
        extractor_id=extractor_id,
        name=recipe_name,
        config=parsed_config.model_dump(),
    )
    manifest = create_extraction_run_manifest(corpus, recipe=recipe)
    run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
    if run_dir.exists():
        return corpus.load_extraction_run_manifest(extractor_id=extractor_id, run_id=manifest.run_id)

    # Validate everything that can be rejected up front before touching the
    # file system.
    if extractor_id != "pipeline":
        raise ExtractionRunFatalError("Extraction runs must use the pipeline extractor")

    pipeline_config = (
        parsed_config
        if isinstance(parsed_config, PipelineExtractorConfig)
        else PipelineExtractorConfig.model_validate(parsed_config)
    )

    validated_steps: List[Tuple[PipelineStepSpec, TextExtractor, BaseModel]] = []
    for step in pipeline_config.steps:
        step_extractor = get_extractor(step.extractor_id)
        parsed_step_config = step_extractor.validate_config(step.config)
        validated_steps.append((step, step_extractor, parsed_step_config))

    catalog = corpus.load_catalog()
    run_dir.mkdir(parents=True, exist_ok=False)

    extracted_items: List[ExtractionItemResult] = []
    extracted_count = 0
    skipped_count = 0
    errored_count = 0
    extracted_nonempty_count = 0
    extracted_empty_count = 0
    already_text_item_count = 0
    needs_extraction_item_count = 0
    converted_item_count = 0

    for item in catalog.items.values():
        # "text/markdown" is covered by the "text/" prefix test.
        item_is_text = item.media_type.startswith("text/")
        if item_is_text:
            already_text_item_count += 1
        else:
            needs_extraction_item_count += 1

        (
            step_results,
            step_outputs,
            last_error_type,
            last_error_message,
        ) = _run_pipeline_steps_for_item(
            corpus=corpus,
            run_dir=run_dir,
            item=item,
            validated_steps=validated_steps,
        )

        final_output = _final_output_from_steps(step_outputs)
        if final_output is None:
            # No step produced text: the item counts as errored when at least
            # one step raised, otherwise as skipped.
            status = "errored" if last_error_type else "skipped"
            if status == "errored":
                errored_count += 1
            else:
                skipped_count += 1
            extracted_items.append(
                ExtractionItemResult(
                    item_id=item.id,
                    status=status,
                    final_text_relpath=None,
                    final_step_index=None,
                    final_step_extractor_id=None,
                    final_producer_extractor_id=None,
                    final_source_step_index=None,
                    error_type=last_error_type if status == "errored" else None,
                    error_message=last_error_message if status == "errored" else None,
                    step_results=step_results,
                )
            )
            continue

        final_text = final_output.text or ""
        final_text_relpath = write_extracted_text_artifact(
            run_dir=run_dir, item=item, text=final_text
        )
        extracted_count += 1
        if final_text.strip():
            extracted_nonempty_count += 1
            # A non-text item that ended up with non-blank text counts as converted.
            if not item_is_text:
                converted_item_count += 1
        else:
            extracted_empty_count += 1

        extracted_items.append(
            ExtractionItemResult(
                item_id=item.id,
                status="extracted",
                final_text_relpath=final_text_relpath,
                final_step_index=final_output.step_index,
                final_step_extractor_id=final_output.extractor_id,
                final_producer_extractor_id=final_output.producer_extractor_id,
                final_source_step_index=final_output.source_step_index,
                error_type=None,
                error_message=None,
                step_results=step_results,
            )
        )

    stats = {
        "total_items": len(catalog.items),
        "already_text_items": already_text_item_count,
        "needs_extraction_items": needs_extraction_item_count,
        "extracted_items": extracted_count,
        "extracted_nonempty_items": extracted_nonempty_count,
        "extracted_empty_items": extracted_empty_count,
        "skipped_items": skipped_count,
        "errored_items": errored_count,
        "converted_items": converted_item_count,
    }
    manifest = manifest.model_copy(update={"items": extracted_items, "stats": stats})
    write_extraction_run_manifest(run_dir=run_dir, manifest=manifest)
    return manifest
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text extraction plugins for Biblicus.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict
|
|
8
|
+
|
|
9
|
+
from .base import TextExtractor
|
|
10
|
+
from .metadata_text import MetadataTextExtractor
|
|
11
|
+
from .openai_stt import OpenAiSpeechToTextExtractor
|
|
12
|
+
from .pass_through_text import PassThroughTextExtractor
|
|
13
|
+
from .pdf_text import PortableDocumentFormatTextExtractor
|
|
14
|
+
from .pipeline import PipelineExtractor
|
|
15
|
+
from .rapidocr_text import RapidOcrExtractor
|
|
16
|
+
from .select_longest_text import SelectLongestTextExtractor
|
|
17
|
+
from .select_text import SelectTextExtractor
|
|
18
|
+
from .unstructured_text import UnstructuredExtractor
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_extractor(extractor_id: str) -> TextExtractor:
    """
    Resolve a built-in text extractor by identifier.

    Only the requested extractor class is instantiated; a fresh instance is
    returned on every call.

    :param extractor_id: Extractor identifier.
    :type extractor_id: str
    :return: Extractor plugin instance.
    :rtype: TextExtractor
    :raises KeyError: If the extractor identifier is not known.
    """
    extractor_classes = (
        MetadataTextExtractor,
        PassThroughTextExtractor,
        PipelineExtractor,
        PortableDocumentFormatTextExtractor,
        OpenAiSpeechToTextExtractor,
        RapidOcrExtractor,
        SelectTextExtractor,
        SelectLongestTextExtractor,
        UnstructuredExtractor,
    )
    # Map identifiers to classes, not instances, so a lookup does not construct
    # the extractors that were not asked for.
    registry = {
        extractor_class.extractor_id: extractor_class for extractor_class in extractor_classes
    }
    if extractor_id not in registry:
        raise KeyError(f"Unknown extractor: {extractor_id!r}")
    return registry[extractor_id]()
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base interfaces for text extraction plugins.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from ..corpus import Corpus
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TextExtractor(ABC):
    """
    Contract implemented by every plugin that turns corpus items into text.

    Extractors are kept separate from retrieval backends on purpose: swapping
    the extractor while keeping corpus and backend fixed makes it possible to
    compare extraction approaches against each other.

    :ivar extractor_id: Identifier string for the extractor plugin.
    :vartype extractor_id: str
    """

    extractor_id: str

    @abstractmethod
    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Check a raw configuration mapping and turn it into a parsed model.

        :param config: Extractor configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: pydantic.BaseModel
        :raises ValueError: If the configuration is invalid.
        """
        raise NotImplementedError()

    @abstractmethod
    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Produce text for one catalog item.

        A return value of None means the extractor chose to skip the item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item to process.
        :type item: CatalogItem
        :param config: Parsed extractor configuration.
        :type config: pydantic.BaseModel
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload or None when skipped.
        :rtype: ExtractedText or None
        """
        raise NotImplementedError()
|