biblicus 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +177 -53
  11. biblicus/corpus.py +209 -59
  12. biblicus/crawl.py +186 -0
  13. biblicus/errors.py +15 -0
  14. biblicus/evaluation.py +4 -8
  15. biblicus/extraction.py +280 -79
  16. biblicus/extractors/__init__.py +14 -3
  17. biblicus/extractors/base.py +12 -5
  18. biblicus/extractors/metadata_text.py +13 -5
  19. biblicus/extractors/openai_stt.py +180 -0
  20. biblicus/extractors/pass_through_text.py +16 -6
  21. biblicus/extractors/pdf_text.py +100 -0
  22. biblicus/extractors/pipeline.py +105 -0
  23. biblicus/extractors/rapidocr_text.py +129 -0
  24. biblicus/extractors/select_longest_text.py +105 -0
  25. biblicus/extractors/select_text.py +100 -0
  26. biblicus/extractors/unstructured_text.py +100 -0
  27. biblicus/frontmatter.py +0 -3
  28. biblicus/hook_logging.py +0 -5
  29. biblicus/hook_manager.py +3 -5
  30. biblicus/hooks.py +3 -7
  31. biblicus/ignore.py +0 -3
  32. biblicus/models.py +118 -0
  33. biblicus/retrieval.py +0 -4
  34. biblicus/sources.py +44 -9
  35. biblicus/time.py +1 -2
  36. biblicus/uris.py +3 -4
  37. biblicus/user_config.py +138 -0
  38. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/METADATA +96 -18
  39. biblicus-0.4.0.dist-info/RECORD +45 -0
  40. biblicus/extractors/cascade.py +0 -101
  41. biblicus-0.2.0.dist-info/RECORD +0 -32
  42. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/WHEEL +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/entry_points.txt +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/licenses/LICENSE +0 -0
  45. {biblicus-0.2.0.dist-info → biblicus-0.4.0.dist-info}/top_level.txt +0 -0
biblicus/extraction.py CHANGED
@@ -6,65 +6,20 @@ from __future__ import annotations
6
6
 
7
7
  import json
8
8
  from pathlib import Path
9
- from typing import Any, Dict, List, Optional
10
- from uuid import uuid4
9
+ from typing import Any, Dict, List, Optional, Tuple
11
10
 
12
11
  from pydantic import BaseModel, ConfigDict, Field
13
12
 
14
13
  from .corpus import Corpus
14
+ from .errors import ExtractionRunFatalError
15
15
  from .extractors import get_extractor
16
- from .models import CatalogItem
16
+ from .extractors.base import TextExtractor
17
+ from .extractors.pipeline import PipelineExtractorConfig, PipelineStepSpec
18
+ from .models import CatalogItem, ExtractionStepOutput
17
19
  from .retrieval import hash_text
18
20
  from .time import utc_now_iso
19
21
 
20
22
 
21
- class ExtractionRunReference(BaseModel):
22
- """
23
- Reference to an extraction run.
24
-
25
- :ivar extractor_id: Extractor plugin identifier.
26
- :vartype extractor_id: str
27
- :ivar run_id: Extraction run identifier.
28
- :vartype run_id: str
29
- """
30
-
31
- model_config = ConfigDict(extra="forbid")
32
-
33
- extractor_id: str = Field(min_length=1)
34
- run_id: str = Field(min_length=1)
35
-
36
- def as_string(self) -> str:
37
- """
38
- Serialize the reference as a single string.
39
-
40
- :return: Reference in the form extractor_id:run_id.
41
- :rtype: str
42
- """
43
-
44
- return f"{self.extractor_id}:{self.run_id}"
45
-
46
-
47
- def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
48
- """
49
- Parse an extraction run reference in the form extractor_id:run_id.
50
-
51
- :param value: Raw reference string.
52
- :type value: str
53
- :return: Parsed extraction run reference.
54
- :rtype: ExtractionRunReference
55
- :raises ValueError: If the reference is not well formed.
56
- """
57
-
58
- if ":" not in value:
59
- raise ValueError("Extraction run reference must be extractor_id:run_id")
60
- extractor_id, run_id = value.split(":", 1)
61
- extractor_id = extractor_id.strip()
62
- run_id = run_id.strip()
63
- if not extractor_id or not run_id:
64
- raise ValueError("Extraction run reference must be extractor_id:run_id with non-empty parts")
65
- return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
66
-
67
-
68
23
  class ExtractionRecipeManifest(BaseModel):
69
24
  """
70
25
  Reproducible configuration for an extraction plugin run.
@@ -90,26 +45,81 @@ class ExtractionRecipeManifest(BaseModel):
90
45
  config: Dict[str, Any] = Field(default_factory=dict)
91
46
 
92
47
 
48
+ class ExtractionStepResult(BaseModel):
49
+ """
50
+ Per-item result record for a single pipeline step.
51
+
52
+ :ivar step_index: One-based pipeline step index.
53
+ :vartype step_index: int
54
+ :ivar extractor_id: Extractor identifier for the step.
55
+ :vartype extractor_id: str
56
+ :ivar status: Step status, extracted, skipped, or errored.
57
+ :vartype status: str
58
+ :ivar text_relpath: Relative path to the step text artifact, when extracted.
59
+ :vartype text_relpath: str or None
60
+ :ivar text_characters: Character count of the extracted text.
61
+ :vartype text_characters: int
62
+ :ivar producer_extractor_id: Extractor identifier that produced the text content.
63
+ :vartype producer_extractor_id: str or None
64
+ :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
65
+ :vartype source_step_index: int or None
66
+ :ivar error_type: Optional error type name for errored steps.
67
+ :vartype error_type: str or None
68
+ :ivar error_message: Optional error message for errored steps.
69
+ :vartype error_message: str or None
70
+ """
71
+
72
+ model_config = ConfigDict(extra="forbid")
73
+
74
+ step_index: int = Field(ge=1)
75
+ extractor_id: str
76
+ status: str
77
+ text_relpath: Optional[str] = None
78
+ text_characters: int = Field(default=0, ge=0)
79
+ producer_extractor_id: Optional[str] = None
80
+ source_step_index: Optional[int] = Field(default=None, ge=1)
81
+ error_type: Optional[str] = None
82
+ error_message: Optional[str] = None
83
+
84
+
93
85
  class ExtractionItemResult(BaseModel):
94
86
  """
95
87
  Per-item result record for an extraction run.
96
88
 
97
89
  :ivar item_id: Item identifier.
98
90
  :vartype item_id: str
99
- :ivar status: Result status, extracted or skipped.
91
+ :ivar status: Final result status, extracted, skipped, or errored.
100
92
  :vartype status: str
101
- :ivar text_relpath: Relative path to the extracted text artifact, when extracted.
102
- :vartype text_relpath: str or None
103
- :ivar producer_extractor_id: Extractor identifier that produced the extracted text.
104
- :vartype producer_extractor_id: str or None
93
+ :ivar final_text_relpath: Relative path to the final extracted text artifact, when extracted.
94
+ :vartype final_text_relpath: str or None
95
+ :ivar final_step_index: Pipeline step index that produced the final text.
96
+ :vartype final_step_index: int or None
97
+ :ivar final_step_extractor_id: Extractor identifier of the step that produced the final text.
98
+ :vartype final_step_extractor_id: str or None
99
+ :ivar final_producer_extractor_id: Extractor identifier that produced the final text content.
100
+ :vartype final_producer_extractor_id: str or None
101
+ :ivar final_source_step_index: Optional step index that supplied the final text for selection-style extractors.
102
+ :vartype final_source_step_index: int or None
103
+ :ivar error_type: Optional error type name when no extracted text was produced.
104
+ :vartype error_type: str or None
105
+ :ivar error_message: Optional error message when no extracted text was produced.
106
+ :vartype error_message: str or None
107
+ :ivar step_results: Per-step results recorded for this item.
108
+ :vartype step_results: list[ExtractionStepResult]
105
109
  """
106
110
 
107
111
  model_config = ConfigDict(extra="forbid")
108
112
 
109
113
  item_id: str
110
114
  status: str
111
- text_relpath: Optional[str] = None
112
- producer_extractor_id: Optional[str] = None
115
+ final_text_relpath: Optional[str] = None
116
+ final_step_index: Optional[int] = Field(default=None, ge=1)
117
+ final_step_extractor_id: Optional[str] = None
118
+ final_producer_extractor_id: Optional[str] = None
119
+ final_source_step_index: Optional[int] = Field(default=None, ge=1)
120
+ error_type: Optional[str] = None
121
+ error_message: Optional[str] = None
122
+ step_results: List[ExtractionStepResult] = Field(default_factory=list)
113
123
 
114
124
 
115
125
  class ExtractionRunManifest(BaseModel):
@@ -143,7 +153,9 @@ class ExtractionRunManifest(BaseModel):
143
153
  stats: Dict[str, Any] = Field(default_factory=dict)
144
154
 
145
155
 
146
- def create_extraction_recipe_manifest(*, extractor_id: str, name: str, config: Dict[str, Any]) -> ExtractionRecipeManifest:
156
+ def create_extraction_recipe_manifest(
157
+ *, extractor_id: str, name: str, config: Dict[str, Any]
158
+ ) -> ExtractionRecipeManifest:
147
159
  """
148
160
  Create a deterministic extraction recipe manifest.
149
161
 
@@ -156,8 +168,9 @@ def create_extraction_recipe_manifest(*, extractor_id: str, name: str, config: D
156
168
  :return: Recipe manifest.
157
169
  :rtype: ExtractionRecipeManifest
158
170
  """
159
-
160
- recipe_payload = json.dumps({"extractor_id": extractor_id, "name": name, "config": config}, sort_keys=True)
171
+ recipe_payload = json.dumps(
172
+ {"extractor_id": extractor_id, "name": name, "config": config}, sort_keys=True
173
+ )
161
174
  recipe_id = hash_text(recipe_payload)
162
175
  return ExtractionRecipeManifest(
163
176
  recipe_id=recipe_id,
@@ -168,7 +181,9 @@ def create_extraction_recipe_manifest(*, extractor_id: str, name: str, config: D
168
181
  )
169
182
 
170
183
 
171
- def create_extraction_run_manifest(corpus: Corpus, *, recipe: ExtractionRecipeManifest) -> ExtractionRunManifest:
184
+ def create_extraction_run_manifest(
185
+ corpus: Corpus, *, recipe: ExtractionRecipeManifest
186
+ ) -> ExtractionRunManifest:
172
187
  """
173
188
  Create a new extraction run manifest for a corpus.
174
189
 
@@ -179,10 +194,10 @@ def create_extraction_run_manifest(corpus: Corpus, *, recipe: ExtractionRecipeMa
179
194
  :return: Run manifest.
180
195
  :rtype: ExtractionRunManifest
181
196
  """
182
-
183
197
  catalog = corpus.load_catalog()
198
+ run_id = hash_text(f"{recipe.recipe_id}:{catalog.generated_at}")
184
199
  return ExtractionRunManifest(
185
- run_id=str(uuid4()),
200
+ run_id=run_id,
186
201
  recipe=recipe,
187
202
  corpus_uri=corpus.uri,
188
203
  catalog_generated_at=catalog.generated_at,
@@ -203,7 +218,6 @@ def write_extraction_run_manifest(*, run_dir: Path, manifest: ExtractionRunManif
203
218
  :return: None.
204
219
  :rtype: None
205
220
  """
206
-
207
221
  manifest_path = run_dir / "manifest.json"
208
222
  manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
209
223
 
@@ -221,7 +235,6 @@ def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str
221
235
  :return: Relative path to the stored text artifact.
222
236
  :rtype: str
223
237
  """
224
-
225
238
  text_dir = run_dir / "text"
226
239
  text_dir.mkdir(parents=True, exist_ok=True)
227
240
  relpath = str(Path("text") / f"{item.id}.txt")
@@ -230,6 +243,70 @@ def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str
230
243
  return relpath
231
244
 
232
245
 
246
+ def _pipeline_step_dir_name(*, step_index: int, extractor_id: str) -> str:
247
+ """
248
+ Build a stable directory name for a pipeline step.
249
+
250
+ :param step_index: One-based pipeline step index.
251
+ :type step_index: int
252
+ :param extractor_id: Extractor identifier for the step.
253
+ :type extractor_id: str
254
+ :return: Directory name for the step.
255
+ :rtype: str
256
+ """
257
+ return f"{step_index:02d}-{extractor_id}"
258
+
259
+
260
+ def write_pipeline_step_text_artifact(
261
+ *,
262
+ run_dir: Path,
263
+ step_index: int,
264
+ extractor_id: str,
265
+ item: CatalogItem,
266
+ text: str,
267
+ ) -> str:
268
+ """
269
+ Write a pipeline step text artifact for an item.
270
+
271
+ :param run_dir: Extraction run directory.
272
+ :type run_dir: Path
273
+ :param step_index: One-based pipeline step index.
274
+ :type step_index: int
275
+ :param extractor_id: Extractor identifier for the step.
276
+ :type extractor_id: str
277
+ :param item: Catalog item being extracted.
278
+ :type item: CatalogItem
279
+ :param text: Extracted text content.
280
+ :type text: str
281
+ :return: Relative path to the stored step text artifact.
282
+ :rtype: str
283
+ """
284
+ step_dir_name = _pipeline_step_dir_name(step_index=step_index, extractor_id=extractor_id)
285
+ text_dir = run_dir / "steps" / step_dir_name / "text"
286
+ text_dir.mkdir(parents=True, exist_ok=True)
287
+ relpath = str(Path("steps") / step_dir_name / "text" / f"{item.id}.txt")
288
+ (run_dir / relpath).write_text(text, encoding="utf-8")
289
+ return relpath
290
+
291
+
292
+ def _final_output_from_steps(
293
+ step_outputs: List[ExtractionStepOutput],
294
+ ) -> Optional[ExtractionStepOutput]:
295
+ """
296
+ Select the final pipeline output for an item.
297
+
298
+ The final output is the last extracted step output in pipeline order.
299
+
300
+ :param step_outputs: Extracted outputs produced by pipeline steps.
301
+ :type step_outputs: list[biblicus.models.ExtractionStepOutput]
302
+ :return: Final step output or None when no steps produced extracted text.
303
+ :rtype: biblicus.models.ExtractionStepOutput or None
304
+ """
305
+ if not step_outputs:
306
+ return None
307
+ return step_outputs[-1]
308
+
309
+
233
310
  def build_extraction_run(
234
311
  corpus: Corpus,
235
312
  *,
@@ -238,11 +315,11 @@ def build_extraction_run(
238
315
  config: Dict[str, Any],
239
316
  ) -> ExtractionRunManifest:
240
317
  """
241
- Build an extraction run for a corpus using a named extractor plugin.
318
+ Build an extraction run for a corpus using the pipeline extractor.
242
319
 
243
320
  :param corpus: Corpus to extract from.
244
321
  :type corpus: Corpus
245
- :param extractor_id: Extractor plugin identifier.
322
+ :param extractor_id: Extractor plugin identifier (must be ``pipeline``).
246
323
  :type extractor_id: str
247
324
  :param recipe_name: Human-readable recipe name.
248
325
  :type recipe_name: str
@@ -253,8 +330,8 @@ def build_extraction_run(
253
330
  :raises KeyError: If the extractor identifier is unknown.
254
331
  :raises ValueError: If the extractor configuration is invalid.
255
332
  :raises OSError: If the run directory or artifacts cannot be written.
333
+ :raises ExtractionRunFatalError: If the extractor is not the pipeline.
256
334
  """
257
-
258
335
  extractor = get_extractor(extractor_id)
259
336
  parsed_config = extractor.validate_config(config)
260
337
  recipe = create_extraction_recipe_manifest(
@@ -264,17 +341,36 @@ def build_extraction_run(
264
341
  )
265
342
  manifest = create_extraction_run_manifest(corpus, recipe=recipe)
266
343
  run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
344
+ if run_dir.exists():
345
+ return corpus.load_extraction_run_manifest(extractor_id=extractor_id, run_id=manifest.run_id)
267
346
  run_dir.mkdir(parents=True, exist_ok=False)
268
347
 
269
348
  catalog = corpus.load_catalog()
349
+ if extractor_id != "pipeline":
350
+ raise ExtractionRunFatalError("Extraction runs must use the pipeline extractor")
351
+
352
+ pipeline_config = (
353
+ parsed_config
354
+ if isinstance(parsed_config, PipelineExtractorConfig)
355
+ else PipelineExtractorConfig.model_validate(parsed_config)
356
+ )
357
+
358
+ validated_steps: List[Tuple[PipelineStepSpec, TextExtractor, BaseModel]] = []
359
+ for step in pipeline_config.steps:
360
+ step_extractor = get_extractor(step.extractor_id)
361
+ parsed_step_config = step_extractor.validate_config(step.config)
362
+ validated_steps.append((step, step_extractor, parsed_step_config))
363
+
270
364
  extracted_items: List[ExtractionItemResult] = []
271
365
  extracted_count = 0
272
366
  skipped_count = 0
367
+ errored_count = 0
273
368
  extracted_nonempty_count = 0
274
369
  extracted_empty_count = 0
275
370
  already_text_item_count = 0
276
371
  needs_extraction_item_count = 0
277
372
  converted_item_count = 0
373
+
278
374
  for item in catalog.items.values():
279
375
  media_type = item.media_type
280
376
  item_is_text = media_type == "text/markdown" or media_type.startswith("text/")
@@ -283,35 +379,139 @@ def build_extraction_run(
283
379
  else:
284
380
  needs_extraction_item_count += 1
285
381
 
286
- extracted_text = extractor.extract_text(corpus=corpus, item=item, config=parsed_config)
287
- if extracted_text is None:
288
- skipped_count += 1
382
+ step_results: List[ExtractionStepResult] = []
383
+ step_outputs: List[ExtractionStepOutput] = []
384
+ last_error_type: Optional[str] = None
385
+ last_error_message: Optional[str] = None
386
+
387
+ for step_index, (step, step_extractor, parsed_step_config) in enumerate(
388
+ validated_steps, start=1
389
+ ):
390
+ try:
391
+ extracted_text = step_extractor.extract_text(
392
+ corpus=corpus,
393
+ item=item,
394
+ config=parsed_step_config,
395
+ previous_extractions=step_outputs,
396
+ )
397
+ except Exception as extraction_error:
398
+ if isinstance(extraction_error, ExtractionRunFatalError):
399
+ raise
400
+ last_error_type = extraction_error.__class__.__name__
401
+ last_error_message = str(extraction_error)
402
+ step_results.append(
403
+ ExtractionStepResult(
404
+ step_index=step_index,
405
+ extractor_id=step.extractor_id,
406
+ status="errored",
407
+ text_relpath=None,
408
+ text_characters=0,
409
+ producer_extractor_id=None,
410
+ source_step_index=None,
411
+ error_type=last_error_type,
412
+ error_message=last_error_message,
413
+ )
414
+ )
415
+ continue
416
+
417
+ if extracted_text is None:
418
+ step_results.append(
419
+ ExtractionStepResult(
420
+ step_index=step_index,
421
+ extractor_id=step.extractor_id,
422
+ status="skipped",
423
+ text_relpath=None,
424
+ text_characters=0,
425
+ producer_extractor_id=None,
426
+ source_step_index=None,
427
+ error_type=None,
428
+ error_message=None,
429
+ )
430
+ )
431
+ continue
432
+
433
+ relpath = write_pipeline_step_text_artifact(
434
+ run_dir=run_dir,
435
+ step_index=step_index,
436
+ extractor_id=step.extractor_id,
437
+ item=item,
438
+ text=extracted_text.text,
439
+ )
440
+ text_characters = len(extracted_text.text)
441
+ step_results.append(
442
+ ExtractionStepResult(
443
+ step_index=step_index,
444
+ extractor_id=step.extractor_id,
445
+ status="extracted",
446
+ text_relpath=relpath,
447
+ text_characters=text_characters,
448
+ producer_extractor_id=extracted_text.producer_extractor_id,
449
+ source_step_index=extracted_text.source_step_index,
450
+ error_type=None,
451
+ error_message=None,
452
+ )
453
+ )
454
+ step_outputs.append(
455
+ ExtractionStepOutput(
456
+ step_index=step_index,
457
+ extractor_id=step.extractor_id,
458
+ status="extracted",
459
+ text=extracted_text.text,
460
+ text_characters=text_characters,
461
+ producer_extractor_id=extracted_text.producer_extractor_id,
462
+ source_step_index=extracted_text.source_step_index,
463
+ error_type=None,
464
+ error_message=None,
465
+ )
466
+ )
467
+
468
+ final_output = _final_output_from_steps(step_outputs)
469
+ if final_output is None:
470
+ status = "errored" if last_error_type else "skipped"
471
+ if status == "errored":
472
+ errored_count += 1
473
+ else:
474
+ skipped_count += 1
289
475
  extracted_items.append(
290
476
  ExtractionItemResult(
291
477
  item_id=item.id,
292
- status="skipped",
293
- text_relpath=None,
294
- producer_extractor_id=None,
478
+ status=status,
479
+ final_text_relpath=None,
480
+ final_step_index=None,
481
+ final_step_extractor_id=None,
482
+ final_producer_extractor_id=None,
483
+ final_source_step_index=None,
484
+ error_type=last_error_type if status == "errored" else None,
485
+ error_message=last_error_message if status == "errored" else None,
486
+ step_results=step_results,
295
487
  )
296
488
  )
297
489
  continue
298
490
 
491
+ final_text = final_output.text or ""
492
+ final_text_relpath = write_extracted_text_artifact(
493
+ run_dir=run_dir, item=item, text=final_text
494
+ )
299
495
  extracted_count += 1
300
- stripped_text = extracted_text.text.strip()
301
- if stripped_text:
496
+ if final_text.strip():
302
497
  extracted_nonempty_count += 1
303
498
  if not item_is_text:
304
499
  converted_item_count += 1
305
500
  else:
306
501
  extracted_empty_count += 1
307
502
 
308
- relpath = write_extracted_text_artifact(run_dir=run_dir, item=item, text=extracted_text.text)
309
503
  extracted_items.append(
310
504
  ExtractionItemResult(
311
505
  item_id=item.id,
312
506
  status="extracted",
313
- text_relpath=relpath,
314
- producer_extractor_id=extracted_text.producer_extractor_id,
507
+ final_text_relpath=final_text_relpath,
508
+ final_step_index=final_output.step_index,
509
+ final_step_extractor_id=final_output.extractor_id,
510
+ final_producer_extractor_id=final_output.producer_extractor_id,
511
+ final_source_step_index=final_output.source_step_index,
512
+ error_type=None,
513
+ error_message=None,
514
+ step_results=step_results,
315
515
  )
316
516
  )
317
517
 
@@ -323,6 +523,7 @@ def build_extraction_run(
323
523
  "extracted_nonempty_items": extracted_nonempty_count,
324
524
  "extracted_empty_items": extracted_empty_count,
325
525
  "skipped_items": skipped_count,
526
+ "errored_items": errored_count,
326
527
  "converted_items": converted_item_count,
327
528
  }
328
529
  manifest = manifest.model_copy(update={"items": extracted_items, "stats": stats})
@@ -7,9 +7,15 @@ from __future__ import annotations
7
7
  from typing import Dict
8
8
 
9
9
  from .base import TextExtractor
10
- from .cascade import CascadeExtractor
11
10
  from .metadata_text import MetadataTextExtractor
11
+ from .openai_stt import OpenAiSpeechToTextExtractor
12
12
  from .pass_through_text import PassThroughTextExtractor
13
+ from .pdf_text import PortableDocumentFormatTextExtractor
14
+ from .pipeline import PipelineExtractor
15
+ from .rapidocr_text import RapidOcrExtractor
16
+ from .select_longest_text import SelectLongestTextExtractor
17
+ from .select_text import SelectTextExtractor
18
+ from .unstructured_text import UnstructuredExtractor
13
19
 
14
20
 
15
21
  def get_extractor(extractor_id: str) -> TextExtractor:
@@ -22,11 +28,16 @@ def get_extractor(extractor_id: str) -> TextExtractor:
22
28
  :rtype: TextExtractor
23
29
  :raises KeyError: If the extractor identifier is not known.
24
30
  """
25
-
26
31
  extractors: Dict[str, TextExtractor] = {
27
- CascadeExtractor.extractor_id: CascadeExtractor(),
28
32
  MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
29
33
  PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
34
+ PipelineExtractor.extractor_id: PipelineExtractor(),
35
+ PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
36
+ OpenAiSpeechToTextExtractor.extractor_id: OpenAiSpeechToTextExtractor(),
37
+ RapidOcrExtractor.extractor_id: RapidOcrExtractor(),
38
+ SelectTextExtractor.extractor_id: SelectTextExtractor(),
39
+ SelectLongestTextExtractor.extractor_id: SelectLongestTextExtractor(),
40
+ UnstructuredExtractor.extractor_id: UnstructuredExtractor(),
30
41
  }
31
42
  if extractor_id not in extractors:
32
43
  raise KeyError(f"Unknown extractor: {extractor_id!r}")
@@ -5,12 +5,12 @@ Base interfaces for text extraction plugins.
5
5
  from __future__ import annotations
6
6
 
7
7
  from abc import ABC, abstractmethod
8
- from typing import Any, Dict, Optional
8
+ from typing import Any, Dict, List, Optional
9
9
 
10
10
  from pydantic import BaseModel
11
11
 
12
12
  from ..corpus import Corpus
13
- from ..models import CatalogItem, ExtractedText
13
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
14
14
 
15
15
 
16
16
  class TextExtractor(ABC):
@@ -38,11 +38,17 @@ class TextExtractor(ABC):
38
38
  :rtype: pydantic.BaseModel
39
39
  :raises ValueError: If the configuration is invalid.
40
40
  """
41
-
42
41
  raise NotImplementedError
43
42
 
44
43
  @abstractmethod
45
- def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
44
+ def extract_text(
45
+ self,
46
+ *,
47
+ corpus: Corpus,
48
+ item: CatalogItem,
49
+ config: BaseModel,
50
+ previous_extractions: List[ExtractionStepOutput],
51
+ ) -> Optional[ExtractedText]:
46
52
  """
47
53
  Derive text for a catalog item.
48
54
 
@@ -54,8 +60,9 @@ class TextExtractor(ABC):
54
60
  :type item: CatalogItem
55
61
  :param config: Parsed extractor configuration.
56
62
  :type config: pydantic.BaseModel
63
+ :param previous_extractions: Prior step outputs for this item within the pipeline.
64
+ :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
57
65
  :return: Extracted text payload or None when skipped.
58
66
  :rtype: ExtractedText or None
59
67
  """
60
-
61
68
  raise NotImplementedError
@@ -4,11 +4,11 @@ Metadata-based text extractor plugin.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- from typing import Any, Dict, Optional
7
+ from typing import Any, Dict, List, Optional
8
8
 
9
9
  from pydantic import BaseModel, ConfigDict, Field
10
10
 
11
- from ..models import CatalogItem, ExtractedText
11
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
12
12
  from .base import TextExtractor
13
13
 
14
14
 
@@ -60,10 +60,16 @@ class MetadataTextExtractor(TextExtractor):
60
60
  :return: Parsed config.
61
61
  :rtype: MetadataTextExtractorConfig
62
62
  """
63
-
64
63
  return MetadataTextExtractorConfig.model_validate(config)
65
64
 
66
- def extract_text(self, *, corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
65
+ def extract_text(
66
+ self,
67
+ *,
68
+ corpus,
69
+ item: CatalogItem,
70
+ config: BaseModel,
71
+ previous_extractions: List[ExtractionStepOutput],
72
+ ) -> Optional[ExtractedText]:
67
73
  """
68
74
  Extract a metadata-based text payload for the item.
69
75
 
@@ -73,16 +79,18 @@ class MetadataTextExtractor(TextExtractor):
73
79
  :type item: CatalogItem
74
80
  :param config: Parsed configuration model.
75
81
  :type config: MetadataTextExtractorConfig
82
+ :param previous_extractions: Prior step outputs for this item within the pipeline.
83
+ :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
76
84
  :return: Extracted text payload, or ``None`` if no metadata is available.
77
85
  :rtype: ExtractedText or None
78
86
  """
79
-
80
87
  parsed_config = (
81
88
  config
82
89
  if isinstance(config, MetadataTextExtractorConfig)
83
90
  else MetadataTextExtractorConfig.model_validate(config)
84
91
  )
85
92
  _ = corpus
93
+ _ = previous_extractions
86
94
  lines: list[str] = []
87
95
 
88
96
  if parsed_config.include_title and isinstance(item.title, str) and item.title.strip():