biblicus 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +177 -53
- biblicus/corpus.py +209 -59
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +280 -79
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +118 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +1 -2
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
- biblicus-0.4.0.dist-info/RECORD +45 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
biblicus/extraction.py
CHANGED
|
@@ -6,65 +6,20 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import json
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Any, Dict, List, Optional
|
|
10
|
-
from uuid import uuid4
|
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
11
10
|
|
|
12
11
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
12
|
|
|
14
13
|
from .corpus import Corpus
|
|
14
|
+
from .errors import ExtractionRunFatalError
|
|
15
15
|
from .extractors import get_extractor
|
|
16
|
-
from .
|
|
16
|
+
from .extractors.base import TextExtractor
|
|
17
|
+
from .extractors.pipeline import PipelineExtractorConfig, PipelineStepSpec
|
|
18
|
+
from .models import CatalogItem, ExtractionStepOutput
|
|
17
19
|
from .retrieval import hash_text
|
|
18
20
|
from .time import utc_now_iso
|
|
19
21
|
|
|
20
22
|
|
|
21
|
-
class ExtractionRunReference(BaseModel):
|
|
22
|
-
"""
|
|
23
|
-
Reference to an extraction run.
|
|
24
|
-
|
|
25
|
-
:ivar extractor_id: Extractor plugin identifier.
|
|
26
|
-
:vartype extractor_id: str
|
|
27
|
-
:ivar run_id: Extraction run identifier.
|
|
28
|
-
:vartype run_id: str
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
model_config = ConfigDict(extra="forbid")
|
|
32
|
-
|
|
33
|
-
extractor_id: str = Field(min_length=1)
|
|
34
|
-
run_id: str = Field(min_length=1)
|
|
35
|
-
|
|
36
|
-
def as_string(self) -> str:
|
|
37
|
-
"""
|
|
38
|
-
Serialize the reference as a single string.
|
|
39
|
-
|
|
40
|
-
:return: Reference in the form extractor_id:run_id.
|
|
41
|
-
:rtype: str
|
|
42
|
-
"""
|
|
43
|
-
|
|
44
|
-
return f"{self.extractor_id}:{self.run_id}"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
|
|
48
|
-
"""
|
|
49
|
-
Parse an extraction run reference in the form extractor_id:run_id.
|
|
50
|
-
|
|
51
|
-
:param value: Raw reference string.
|
|
52
|
-
:type value: str
|
|
53
|
-
:return: Parsed extraction run reference.
|
|
54
|
-
:rtype: ExtractionRunReference
|
|
55
|
-
:raises ValueError: If the reference is not well formed.
|
|
56
|
-
"""
|
|
57
|
-
|
|
58
|
-
if ":" not in value:
|
|
59
|
-
raise ValueError("Extraction run reference must be extractor_id:run_id")
|
|
60
|
-
extractor_id, run_id = value.split(":", 1)
|
|
61
|
-
extractor_id = extractor_id.strip()
|
|
62
|
-
run_id = run_id.strip()
|
|
63
|
-
if not extractor_id or not run_id:
|
|
64
|
-
raise ValueError("Extraction run reference must be extractor_id:run_id with non-empty parts")
|
|
65
|
-
return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
|
|
66
|
-
|
|
67
|
-
|
|
68
23
|
class ExtractionRecipeManifest(BaseModel):
|
|
69
24
|
"""
|
|
70
25
|
Reproducible configuration for an extraction plugin run.
|
|
@@ -90,26 +45,81 @@ class ExtractionRecipeManifest(BaseModel):
|
|
|
90
45
|
config: Dict[str, Any] = Field(default_factory=dict)
|
|
91
46
|
|
|
92
47
|
|
|
48
|
+
class ExtractionStepResult(BaseModel):
|
|
49
|
+
"""
|
|
50
|
+
Per-item result record for a single pipeline step.
|
|
51
|
+
|
|
52
|
+
:ivar step_index: One-based pipeline step index.
|
|
53
|
+
:vartype step_index: int
|
|
54
|
+
:ivar extractor_id: Extractor identifier for the step.
|
|
55
|
+
:vartype extractor_id: str
|
|
56
|
+
:ivar status: Step status, extracted, skipped, or errored.
|
|
57
|
+
:vartype status: str
|
|
58
|
+
:ivar text_relpath: Relative path to the step text artifact, when extracted.
|
|
59
|
+
:vartype text_relpath: str or None
|
|
60
|
+
:ivar text_characters: Character count of the extracted text.
|
|
61
|
+
:vartype text_characters: int
|
|
62
|
+
:ivar producer_extractor_id: Extractor identifier that produced the text content.
|
|
63
|
+
:vartype producer_extractor_id: str or None
|
|
64
|
+
:ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
|
|
65
|
+
:vartype source_step_index: int or None
|
|
66
|
+
:ivar error_type: Optional error type name for errored steps.
|
|
67
|
+
:vartype error_type: str or None
|
|
68
|
+
:ivar error_message: Optional error message for errored steps.
|
|
69
|
+
:vartype error_message: str or None
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
model_config = ConfigDict(extra="forbid")
|
|
73
|
+
|
|
74
|
+
step_index: int = Field(ge=1)
|
|
75
|
+
extractor_id: str
|
|
76
|
+
status: str
|
|
77
|
+
text_relpath: Optional[str] = None
|
|
78
|
+
text_characters: int = Field(default=0, ge=0)
|
|
79
|
+
producer_extractor_id: Optional[str] = None
|
|
80
|
+
source_step_index: Optional[int] = Field(default=None, ge=1)
|
|
81
|
+
error_type: Optional[str] = None
|
|
82
|
+
error_message: Optional[str] = None
|
|
83
|
+
|
|
84
|
+
|
|
93
85
|
class ExtractionItemResult(BaseModel):
|
|
94
86
|
"""
|
|
95
87
|
Per-item result record for an extraction run.
|
|
96
88
|
|
|
97
89
|
:ivar item_id: Item identifier.
|
|
98
90
|
:vartype item_id: str
|
|
99
|
-
:ivar status:
|
|
91
|
+
:ivar status: Final result status, extracted, skipped, or errored.
|
|
100
92
|
:vartype status: str
|
|
101
|
-
:ivar
|
|
102
|
-
:vartype
|
|
103
|
-
:ivar
|
|
104
|
-
:vartype
|
|
93
|
+
:ivar final_text_relpath: Relative path to the final extracted text artifact, when extracted.
|
|
94
|
+
:vartype final_text_relpath: str or None
|
|
95
|
+
:ivar final_step_index: Pipeline step index that produced the final text.
|
|
96
|
+
:vartype final_step_index: int or None
|
|
97
|
+
:ivar final_step_extractor_id: Extractor identifier of the step that produced the final text.
|
|
98
|
+
:vartype final_step_extractor_id: str or None
|
|
99
|
+
:ivar final_producer_extractor_id: Extractor identifier that produced the final text content.
|
|
100
|
+
:vartype final_producer_extractor_id: str or None
|
|
101
|
+
:ivar final_source_step_index: Optional step index that supplied the final text for selection-style extractors.
|
|
102
|
+
:vartype final_source_step_index: int or None
|
|
103
|
+
:ivar error_type: Optional error type name when no extracted text was produced.
|
|
104
|
+
:vartype error_type: str or None
|
|
105
|
+
:ivar error_message: Optional error message when no extracted text was produced.
|
|
106
|
+
:vartype error_message: str or None
|
|
107
|
+
:ivar step_results: Per-step results recorded for this item.
|
|
108
|
+
:vartype step_results: list[ExtractionStepResult]
|
|
105
109
|
"""
|
|
106
110
|
|
|
107
111
|
model_config = ConfigDict(extra="forbid")
|
|
108
112
|
|
|
109
113
|
item_id: str
|
|
110
114
|
status: str
|
|
111
|
-
|
|
112
|
-
|
|
115
|
+
final_text_relpath: Optional[str] = None
|
|
116
|
+
final_step_index: Optional[int] = Field(default=None, ge=1)
|
|
117
|
+
final_step_extractor_id: Optional[str] = None
|
|
118
|
+
final_producer_extractor_id: Optional[str] = None
|
|
119
|
+
final_source_step_index: Optional[int] = Field(default=None, ge=1)
|
|
120
|
+
error_type: Optional[str] = None
|
|
121
|
+
error_message: Optional[str] = None
|
|
122
|
+
step_results: List[ExtractionStepResult] = Field(default_factory=list)
|
|
113
123
|
|
|
114
124
|
|
|
115
125
|
class ExtractionRunManifest(BaseModel):
|
|
@@ -143,7 +153,9 @@ class ExtractionRunManifest(BaseModel):
|
|
|
143
153
|
stats: Dict[str, Any] = Field(default_factory=dict)
|
|
144
154
|
|
|
145
155
|
|
|
146
|
-
def create_extraction_recipe_manifest(
|
|
156
|
+
def create_extraction_recipe_manifest(
|
|
157
|
+
*, extractor_id: str, name: str, config: Dict[str, Any]
|
|
158
|
+
) -> ExtractionRecipeManifest:
|
|
147
159
|
"""
|
|
148
160
|
Create a deterministic extraction recipe manifest.
|
|
149
161
|
|
|
@@ -156,8 +168,9 @@ def create_extraction_recipe_manifest(*, extractor_id: str, name: str, config: D
|
|
|
156
168
|
:return: Recipe manifest.
|
|
157
169
|
:rtype: ExtractionRecipeManifest
|
|
158
170
|
"""
|
|
159
|
-
|
|
160
|
-
|
|
171
|
+
recipe_payload = json.dumps(
|
|
172
|
+
{"extractor_id": extractor_id, "name": name, "config": config}, sort_keys=True
|
|
173
|
+
)
|
|
161
174
|
recipe_id = hash_text(recipe_payload)
|
|
162
175
|
return ExtractionRecipeManifest(
|
|
163
176
|
recipe_id=recipe_id,
|
|
@@ -168,7 +181,9 @@ def create_extraction_recipe_manifest(*, extractor_id: str, name: str, config: D
|
|
|
168
181
|
)
|
|
169
182
|
|
|
170
183
|
|
|
171
|
-
def create_extraction_run_manifest(
|
|
184
|
+
def create_extraction_run_manifest(
|
|
185
|
+
corpus: Corpus, *, recipe: ExtractionRecipeManifest
|
|
186
|
+
) -> ExtractionRunManifest:
|
|
172
187
|
"""
|
|
173
188
|
Create a new extraction run manifest for a corpus.
|
|
174
189
|
|
|
@@ -179,10 +194,10 @@ def create_extraction_run_manifest(corpus: Corpus, *, recipe: ExtractionRecipeMa
|
|
|
179
194
|
:return: Run manifest.
|
|
180
195
|
:rtype: ExtractionRunManifest
|
|
181
196
|
"""
|
|
182
|
-
|
|
183
197
|
catalog = corpus.load_catalog()
|
|
198
|
+
run_id = hash_text(f"{recipe.recipe_id}:{catalog.generated_at}")
|
|
184
199
|
return ExtractionRunManifest(
|
|
185
|
-
run_id=
|
|
200
|
+
run_id=run_id,
|
|
186
201
|
recipe=recipe,
|
|
187
202
|
corpus_uri=corpus.uri,
|
|
188
203
|
catalog_generated_at=catalog.generated_at,
|
|
@@ -203,7 +218,6 @@ def write_extraction_run_manifest(*, run_dir: Path, manifest: ExtractionRunManif
|
|
|
203
218
|
:return: None.
|
|
204
219
|
:rtype: None
|
|
205
220
|
"""
|
|
206
|
-
|
|
207
221
|
manifest_path = run_dir / "manifest.json"
|
|
208
222
|
manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
209
223
|
|
|
@@ -221,7 +235,6 @@ def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str
|
|
|
221
235
|
:return: Relative path to the stored text artifact.
|
|
222
236
|
:rtype: str
|
|
223
237
|
"""
|
|
224
|
-
|
|
225
238
|
text_dir = run_dir / "text"
|
|
226
239
|
text_dir.mkdir(parents=True, exist_ok=True)
|
|
227
240
|
relpath = str(Path("text") / f"{item.id}.txt")
|
|
@@ -230,6 +243,70 @@ def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str
|
|
|
230
243
|
return relpath
|
|
231
244
|
|
|
232
245
|
|
|
246
|
+
def _pipeline_step_dir_name(*, step_index: int, extractor_id: str) -> str:
|
|
247
|
+
"""
|
|
248
|
+
Build a stable directory name for a pipeline step.
|
|
249
|
+
|
|
250
|
+
:param step_index: One-based pipeline step index.
|
|
251
|
+
:type step_index: int
|
|
252
|
+
:param extractor_id: Extractor identifier for the step.
|
|
253
|
+
:type extractor_id: str
|
|
254
|
+
:return: Directory name for the step.
|
|
255
|
+
:rtype: str
|
|
256
|
+
"""
|
|
257
|
+
return f"{step_index:02d}-{extractor_id}"
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def write_pipeline_step_text_artifact(
|
|
261
|
+
*,
|
|
262
|
+
run_dir: Path,
|
|
263
|
+
step_index: int,
|
|
264
|
+
extractor_id: str,
|
|
265
|
+
item: CatalogItem,
|
|
266
|
+
text: str,
|
|
267
|
+
) -> str:
|
|
268
|
+
"""
|
|
269
|
+
Write a pipeline step text artifact for an item.
|
|
270
|
+
|
|
271
|
+
:param run_dir: Extraction run directory.
|
|
272
|
+
:type run_dir: Path
|
|
273
|
+
:param step_index: One-based pipeline step index.
|
|
274
|
+
:type step_index: int
|
|
275
|
+
:param extractor_id: Extractor identifier for the step.
|
|
276
|
+
:type extractor_id: str
|
|
277
|
+
:param item: Catalog item being extracted.
|
|
278
|
+
:type item: CatalogItem
|
|
279
|
+
:param text: Extracted text content.
|
|
280
|
+
:type text: str
|
|
281
|
+
:return: Relative path to the stored step text artifact.
|
|
282
|
+
:rtype: str
|
|
283
|
+
"""
|
|
284
|
+
step_dir_name = _pipeline_step_dir_name(step_index=step_index, extractor_id=extractor_id)
|
|
285
|
+
text_dir = run_dir / "steps" / step_dir_name / "text"
|
|
286
|
+
text_dir.mkdir(parents=True, exist_ok=True)
|
|
287
|
+
relpath = str(Path("steps") / step_dir_name / "text" / f"{item.id}.txt")
|
|
288
|
+
(run_dir / relpath).write_text(text, encoding="utf-8")
|
|
289
|
+
return relpath
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _final_output_from_steps(
|
|
293
|
+
step_outputs: List[ExtractionStepOutput],
|
|
294
|
+
) -> Optional[ExtractionStepOutput]:
|
|
295
|
+
"""
|
|
296
|
+
Select the final pipeline output for an item.
|
|
297
|
+
|
|
298
|
+
The final output is the last extracted step output in pipeline order.
|
|
299
|
+
|
|
300
|
+
:param step_outputs: Extracted outputs produced by pipeline steps.
|
|
301
|
+
:type step_outputs: list[biblicus.models.ExtractionStepOutput]
|
|
302
|
+
:return: Final step output or None when no steps produced extracted text.
|
|
303
|
+
:rtype: biblicus.models.ExtractionStepOutput or None
|
|
304
|
+
"""
|
|
305
|
+
if not step_outputs:
|
|
306
|
+
return None
|
|
307
|
+
return step_outputs[-1]
|
|
308
|
+
|
|
309
|
+
|
|
233
310
|
def build_extraction_run(
|
|
234
311
|
corpus: Corpus,
|
|
235
312
|
*,
|
|
@@ -238,11 +315,11 @@ def build_extraction_run(
|
|
|
238
315
|
config: Dict[str, Any],
|
|
239
316
|
) -> ExtractionRunManifest:
|
|
240
317
|
"""
|
|
241
|
-
Build an extraction run for a corpus using
|
|
318
|
+
Build an extraction run for a corpus using the pipeline extractor.
|
|
242
319
|
|
|
243
320
|
:param corpus: Corpus to extract from.
|
|
244
321
|
:type corpus: Corpus
|
|
245
|
-
:param extractor_id: Extractor plugin identifier.
|
|
322
|
+
:param extractor_id: Extractor plugin identifier (must be ``pipeline``).
|
|
246
323
|
:type extractor_id: str
|
|
247
324
|
:param recipe_name: Human-readable recipe name.
|
|
248
325
|
:type recipe_name: str
|
|
@@ -253,8 +330,8 @@ def build_extraction_run(
|
|
|
253
330
|
:raises KeyError: If the extractor identifier is unknown.
|
|
254
331
|
:raises ValueError: If the extractor configuration is invalid.
|
|
255
332
|
:raises OSError: If the run directory or artifacts cannot be written.
|
|
333
|
+
:raises ExtractionRunFatalError: If the extractor is not the pipeline.
|
|
256
334
|
"""
|
|
257
|
-
|
|
258
335
|
extractor = get_extractor(extractor_id)
|
|
259
336
|
parsed_config = extractor.validate_config(config)
|
|
260
337
|
recipe = create_extraction_recipe_manifest(
|
|
@@ -264,17 +341,36 @@ def build_extraction_run(
|
|
|
264
341
|
)
|
|
265
342
|
manifest = create_extraction_run_manifest(corpus, recipe=recipe)
|
|
266
343
|
run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
|
|
344
|
+
if run_dir.exists():
|
|
345
|
+
return corpus.load_extraction_run_manifest(extractor_id=extractor_id, run_id=manifest.run_id)
|
|
267
346
|
run_dir.mkdir(parents=True, exist_ok=False)
|
|
268
347
|
|
|
269
348
|
catalog = corpus.load_catalog()
|
|
349
|
+
if extractor_id != "pipeline":
|
|
350
|
+
raise ExtractionRunFatalError("Extraction runs must use the pipeline extractor")
|
|
351
|
+
|
|
352
|
+
pipeline_config = (
|
|
353
|
+
parsed_config
|
|
354
|
+
if isinstance(parsed_config, PipelineExtractorConfig)
|
|
355
|
+
else PipelineExtractorConfig.model_validate(parsed_config)
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
validated_steps: List[Tuple[PipelineStepSpec, TextExtractor, BaseModel]] = []
|
|
359
|
+
for step in pipeline_config.steps:
|
|
360
|
+
step_extractor = get_extractor(step.extractor_id)
|
|
361
|
+
parsed_step_config = step_extractor.validate_config(step.config)
|
|
362
|
+
validated_steps.append((step, step_extractor, parsed_step_config))
|
|
363
|
+
|
|
270
364
|
extracted_items: List[ExtractionItemResult] = []
|
|
271
365
|
extracted_count = 0
|
|
272
366
|
skipped_count = 0
|
|
367
|
+
errored_count = 0
|
|
273
368
|
extracted_nonempty_count = 0
|
|
274
369
|
extracted_empty_count = 0
|
|
275
370
|
already_text_item_count = 0
|
|
276
371
|
needs_extraction_item_count = 0
|
|
277
372
|
converted_item_count = 0
|
|
373
|
+
|
|
278
374
|
for item in catalog.items.values():
|
|
279
375
|
media_type = item.media_type
|
|
280
376
|
item_is_text = media_type == "text/markdown" or media_type.startswith("text/")
|
|
@@ -283,35 +379,139 @@ def build_extraction_run(
|
|
|
283
379
|
else:
|
|
284
380
|
needs_extraction_item_count += 1
|
|
285
381
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
382
|
+
step_results: List[ExtractionStepResult] = []
|
|
383
|
+
step_outputs: List[ExtractionStepOutput] = []
|
|
384
|
+
last_error_type: Optional[str] = None
|
|
385
|
+
last_error_message: Optional[str] = None
|
|
386
|
+
|
|
387
|
+
for step_index, (step, step_extractor, parsed_step_config) in enumerate(
|
|
388
|
+
validated_steps, start=1
|
|
389
|
+
):
|
|
390
|
+
try:
|
|
391
|
+
extracted_text = step_extractor.extract_text(
|
|
392
|
+
corpus=corpus,
|
|
393
|
+
item=item,
|
|
394
|
+
config=parsed_step_config,
|
|
395
|
+
previous_extractions=step_outputs,
|
|
396
|
+
)
|
|
397
|
+
except Exception as extraction_error:
|
|
398
|
+
if isinstance(extraction_error, ExtractionRunFatalError):
|
|
399
|
+
raise
|
|
400
|
+
last_error_type = extraction_error.__class__.__name__
|
|
401
|
+
last_error_message = str(extraction_error)
|
|
402
|
+
step_results.append(
|
|
403
|
+
ExtractionStepResult(
|
|
404
|
+
step_index=step_index,
|
|
405
|
+
extractor_id=step.extractor_id,
|
|
406
|
+
status="errored",
|
|
407
|
+
text_relpath=None,
|
|
408
|
+
text_characters=0,
|
|
409
|
+
producer_extractor_id=None,
|
|
410
|
+
source_step_index=None,
|
|
411
|
+
error_type=last_error_type,
|
|
412
|
+
error_message=last_error_message,
|
|
413
|
+
)
|
|
414
|
+
)
|
|
415
|
+
continue
|
|
416
|
+
|
|
417
|
+
if extracted_text is None:
|
|
418
|
+
step_results.append(
|
|
419
|
+
ExtractionStepResult(
|
|
420
|
+
step_index=step_index,
|
|
421
|
+
extractor_id=step.extractor_id,
|
|
422
|
+
status="skipped",
|
|
423
|
+
text_relpath=None,
|
|
424
|
+
text_characters=0,
|
|
425
|
+
producer_extractor_id=None,
|
|
426
|
+
source_step_index=None,
|
|
427
|
+
error_type=None,
|
|
428
|
+
error_message=None,
|
|
429
|
+
)
|
|
430
|
+
)
|
|
431
|
+
continue
|
|
432
|
+
|
|
433
|
+
relpath = write_pipeline_step_text_artifact(
|
|
434
|
+
run_dir=run_dir,
|
|
435
|
+
step_index=step_index,
|
|
436
|
+
extractor_id=step.extractor_id,
|
|
437
|
+
item=item,
|
|
438
|
+
text=extracted_text.text,
|
|
439
|
+
)
|
|
440
|
+
text_characters = len(extracted_text.text)
|
|
441
|
+
step_results.append(
|
|
442
|
+
ExtractionStepResult(
|
|
443
|
+
step_index=step_index,
|
|
444
|
+
extractor_id=step.extractor_id,
|
|
445
|
+
status="extracted",
|
|
446
|
+
text_relpath=relpath,
|
|
447
|
+
text_characters=text_characters,
|
|
448
|
+
producer_extractor_id=extracted_text.producer_extractor_id,
|
|
449
|
+
source_step_index=extracted_text.source_step_index,
|
|
450
|
+
error_type=None,
|
|
451
|
+
error_message=None,
|
|
452
|
+
)
|
|
453
|
+
)
|
|
454
|
+
step_outputs.append(
|
|
455
|
+
ExtractionStepOutput(
|
|
456
|
+
step_index=step_index,
|
|
457
|
+
extractor_id=step.extractor_id,
|
|
458
|
+
status="extracted",
|
|
459
|
+
text=extracted_text.text,
|
|
460
|
+
text_characters=text_characters,
|
|
461
|
+
producer_extractor_id=extracted_text.producer_extractor_id,
|
|
462
|
+
source_step_index=extracted_text.source_step_index,
|
|
463
|
+
error_type=None,
|
|
464
|
+
error_message=None,
|
|
465
|
+
)
|
|
466
|
+
)
|
|
467
|
+
|
|
468
|
+
final_output = _final_output_from_steps(step_outputs)
|
|
469
|
+
if final_output is None:
|
|
470
|
+
status = "errored" if last_error_type else "skipped"
|
|
471
|
+
if status == "errored":
|
|
472
|
+
errored_count += 1
|
|
473
|
+
else:
|
|
474
|
+
skipped_count += 1
|
|
289
475
|
extracted_items.append(
|
|
290
476
|
ExtractionItemResult(
|
|
291
477
|
item_id=item.id,
|
|
292
|
-
status=
|
|
293
|
-
|
|
294
|
-
|
|
478
|
+
status=status,
|
|
479
|
+
final_text_relpath=None,
|
|
480
|
+
final_step_index=None,
|
|
481
|
+
final_step_extractor_id=None,
|
|
482
|
+
final_producer_extractor_id=None,
|
|
483
|
+
final_source_step_index=None,
|
|
484
|
+
error_type=last_error_type if status == "errored" else None,
|
|
485
|
+
error_message=last_error_message if status == "errored" else None,
|
|
486
|
+
step_results=step_results,
|
|
295
487
|
)
|
|
296
488
|
)
|
|
297
489
|
continue
|
|
298
490
|
|
|
491
|
+
final_text = final_output.text or ""
|
|
492
|
+
final_text_relpath = write_extracted_text_artifact(
|
|
493
|
+
run_dir=run_dir, item=item, text=final_text
|
|
494
|
+
)
|
|
299
495
|
extracted_count += 1
|
|
300
|
-
|
|
301
|
-
if stripped_text:
|
|
496
|
+
if final_text.strip():
|
|
302
497
|
extracted_nonempty_count += 1
|
|
303
498
|
if not item_is_text:
|
|
304
499
|
converted_item_count += 1
|
|
305
500
|
else:
|
|
306
501
|
extracted_empty_count += 1
|
|
307
502
|
|
|
308
|
-
relpath = write_extracted_text_artifact(run_dir=run_dir, item=item, text=extracted_text.text)
|
|
309
503
|
extracted_items.append(
|
|
310
504
|
ExtractionItemResult(
|
|
311
505
|
item_id=item.id,
|
|
312
506
|
status="extracted",
|
|
313
|
-
|
|
314
|
-
|
|
507
|
+
final_text_relpath=final_text_relpath,
|
|
508
|
+
final_step_index=final_output.step_index,
|
|
509
|
+
final_step_extractor_id=final_output.extractor_id,
|
|
510
|
+
final_producer_extractor_id=final_output.producer_extractor_id,
|
|
511
|
+
final_source_step_index=final_output.source_step_index,
|
|
512
|
+
error_type=None,
|
|
513
|
+
error_message=None,
|
|
514
|
+
step_results=step_results,
|
|
315
515
|
)
|
|
316
516
|
)
|
|
317
517
|
|
|
@@ -323,6 +523,7 @@ def build_extraction_run(
|
|
|
323
523
|
"extracted_nonempty_items": extracted_nonempty_count,
|
|
324
524
|
"extracted_empty_items": extracted_empty_count,
|
|
325
525
|
"skipped_items": skipped_count,
|
|
526
|
+
"errored_items": errored_count,
|
|
326
527
|
"converted_items": converted_item_count,
|
|
327
528
|
}
|
|
328
529
|
manifest = manifest.model_copy(update={"items": extracted_items, "stats": stats})
|
biblicus/extractors/__init__.py
CHANGED
|
@@ -7,9 +7,15 @@ from __future__ import annotations
|
|
|
7
7
|
from typing import Dict
|
|
8
8
|
|
|
9
9
|
from .base import TextExtractor
|
|
10
|
-
from .cascade import CascadeExtractor
|
|
11
10
|
from .metadata_text import MetadataTextExtractor
|
|
11
|
+
from .openai_stt import OpenAiSpeechToTextExtractor
|
|
12
12
|
from .pass_through_text import PassThroughTextExtractor
|
|
13
|
+
from .pdf_text import PortableDocumentFormatTextExtractor
|
|
14
|
+
from .pipeline import PipelineExtractor
|
|
15
|
+
from .rapidocr_text import RapidOcrExtractor
|
|
16
|
+
from .select_longest_text import SelectLongestTextExtractor
|
|
17
|
+
from .select_text import SelectTextExtractor
|
|
18
|
+
from .unstructured_text import UnstructuredExtractor
|
|
13
19
|
|
|
14
20
|
|
|
15
21
|
def get_extractor(extractor_id: str) -> TextExtractor:
|
|
@@ -22,11 +28,16 @@ def get_extractor(extractor_id: str) -> TextExtractor:
|
|
|
22
28
|
:rtype: TextExtractor
|
|
23
29
|
:raises KeyError: If the extractor identifier is not known.
|
|
24
30
|
"""
|
|
25
|
-
|
|
26
31
|
extractors: Dict[str, TextExtractor] = {
|
|
27
|
-
CascadeExtractor.extractor_id: CascadeExtractor(),
|
|
28
32
|
MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
|
|
29
33
|
PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
|
|
34
|
+
PipelineExtractor.extractor_id: PipelineExtractor(),
|
|
35
|
+
PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
|
|
36
|
+
OpenAiSpeechToTextExtractor.extractor_id: OpenAiSpeechToTextExtractor(),
|
|
37
|
+
RapidOcrExtractor.extractor_id: RapidOcrExtractor(),
|
|
38
|
+
SelectTextExtractor.extractor_id: SelectTextExtractor(),
|
|
39
|
+
SelectLongestTextExtractor.extractor_id: SelectLongestTextExtractor(),
|
|
40
|
+
UnstructuredExtractor.extractor_id: UnstructuredExtractor(),
|
|
30
41
|
}
|
|
31
42
|
if extractor_id not in extractors:
|
|
32
43
|
raise KeyError(f"Unknown extractor: {extractor_id!r}")
|
biblicus/extractors/base.py
CHANGED
|
@@ -5,12 +5,12 @@ Base interfaces for text extraction plugins.
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
|
-
from typing import Any, Dict, Optional
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
9
|
|
|
10
10
|
from pydantic import BaseModel
|
|
11
11
|
|
|
12
12
|
from ..corpus import Corpus
|
|
13
|
-
from ..models import CatalogItem, ExtractedText
|
|
13
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class TextExtractor(ABC):
|
|
@@ -38,11 +38,17 @@ class TextExtractor(ABC):
|
|
|
38
38
|
:rtype: pydantic.BaseModel
|
|
39
39
|
:raises ValueError: If the configuration is invalid.
|
|
40
40
|
"""
|
|
41
|
-
|
|
42
41
|
raise NotImplementedError
|
|
43
42
|
|
|
44
43
|
@abstractmethod
|
|
45
|
-
def extract_text(
|
|
44
|
+
def extract_text(
|
|
45
|
+
self,
|
|
46
|
+
*,
|
|
47
|
+
corpus: Corpus,
|
|
48
|
+
item: CatalogItem,
|
|
49
|
+
config: BaseModel,
|
|
50
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
51
|
+
) -> Optional[ExtractedText]:
|
|
46
52
|
"""
|
|
47
53
|
Derive text for a catalog item.
|
|
48
54
|
|
|
@@ -54,8 +60,9 @@ class TextExtractor(ABC):
|
|
|
54
60
|
:type item: CatalogItem
|
|
55
61
|
:param config: Parsed extractor configuration.
|
|
56
62
|
:type config: pydantic.BaseModel
|
|
63
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
64
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
57
65
|
:return: Extracted text payload or None when skipped.
|
|
58
66
|
:rtype: ExtractedText or None
|
|
59
67
|
"""
|
|
60
|
-
|
|
61
68
|
raise NotImplementedError
|
|
@@ -4,11 +4,11 @@ Metadata-based text extractor plugin.
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
-
from typing import Any, Dict, Optional
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
8
|
|
|
9
9
|
from pydantic import BaseModel, ConfigDict, Field
|
|
10
10
|
|
|
11
|
-
from ..models import CatalogItem, ExtractedText
|
|
11
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
12
12
|
from .base import TextExtractor
|
|
13
13
|
|
|
14
14
|
|
|
@@ -60,10 +60,16 @@ class MetadataTextExtractor(TextExtractor):
|
|
|
60
60
|
:return: Parsed config.
|
|
61
61
|
:rtype: MetadataTextExtractorConfig
|
|
62
62
|
"""
|
|
63
|
-
|
|
64
63
|
return MetadataTextExtractorConfig.model_validate(config)
|
|
65
64
|
|
|
66
|
-
def extract_text(
|
|
65
|
+
def extract_text(
|
|
66
|
+
self,
|
|
67
|
+
*,
|
|
68
|
+
corpus,
|
|
69
|
+
item: CatalogItem,
|
|
70
|
+
config: BaseModel,
|
|
71
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
72
|
+
) -> Optional[ExtractedText]:
|
|
67
73
|
"""
|
|
68
74
|
Extract a metadata-based text payload for the item.
|
|
69
75
|
|
|
@@ -73,16 +79,18 @@ class MetadataTextExtractor(TextExtractor):
|
|
|
73
79
|
:type item: CatalogItem
|
|
74
80
|
:param config: Parsed configuration model.
|
|
75
81
|
:type config: MetadataTextExtractorConfig
|
|
82
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
83
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
76
84
|
:return: Extracted text payload, or ``None`` if no metadata is available.
|
|
77
85
|
:rtype: ExtractedText or None
|
|
78
86
|
"""
|
|
79
|
-
|
|
80
87
|
parsed_config = (
|
|
81
88
|
config
|
|
82
89
|
if isinstance(config, MetadataTextExtractorConfig)
|
|
83
90
|
else MetadataTextExtractorConfig.model_validate(config)
|
|
84
91
|
)
|
|
85
92
|
_ = corpus
|
|
93
|
+
_ = previous_extractions
|
|
86
94
|
lines: list[str] = []
|
|
87
95
|
|
|
88
96
|
if parsed_config.include_title and isinstance(item.title, str) and item.title.strip():
|