biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +96 -13
  9. biblicus/backends/sqlite_full_text_search.py +74 -14
  10. biblicus/cli.py +126 -19
  11. biblicus/constants.py +2 -0
  12. biblicus/corpus.py +455 -45
  13. biblicus/errors.py +15 -0
  14. biblicus/evaluation.py +4 -8
  15. biblicus/extraction.py +529 -0
  16. biblicus/extractors/__init__.py +44 -0
  17. biblicus/extractors/base.py +68 -0
  18. biblicus/extractors/metadata_text.py +106 -0
  19. biblicus/extractors/openai_stt.py +180 -0
  20. biblicus/extractors/pass_through_text.py +84 -0
  21. biblicus/extractors/pdf_text.py +100 -0
  22. biblicus/extractors/pipeline.py +105 -0
  23. biblicus/extractors/rapidocr_text.py +129 -0
  24. biblicus/extractors/select_longest_text.py +105 -0
  25. biblicus/extractors/select_text.py +100 -0
  26. biblicus/extractors/unstructured_text.py +100 -0
  27. biblicus/frontmatter.py +0 -3
  28. biblicus/hook_logging.py +180 -0
  29. biblicus/hook_manager.py +203 -0
  30. biblicus/hooks.py +261 -0
  31. biblicus/ignore.py +64 -0
  32. biblicus/models.py +107 -0
  33. biblicus/retrieval.py +0 -4
  34. biblicus/sources.py +85 -5
  35. biblicus/time.py +0 -1
  36. biblicus/uris.py +3 -4
  37. biblicus/user_config.py +138 -0
  38. biblicus-0.3.0.dist-info/METADATA +336 -0
  39. biblicus-0.3.0.dist-info/RECORD +44 -0
  40. biblicus-0.1.1.dist-info/METADATA +0 -174
  41. biblicus-0.1.1.dist-info/RECORD +0 -22
  42. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
  43. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
  44. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
  45. {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/errors.py ADDED
@@ -0,0 +1,15 @@
+ """
+ Error types for Biblicus.
+ """
+
+ from __future__ import annotations
+
+
+ class ExtractionRunFatalError(RuntimeError):
+     """
+     Fatal extraction run error that should abort the entire run.
+
+     This exception is used for conditions that indicate a configuration or environment problem
+     rather than a per-item extraction failure. For example, a selection extractor that depends
+     on referenced extraction run manifests treats missing manifests as fatal.
+     """
biblicus/evaluation.py CHANGED
@@ -11,8 +11,8 @@ from typing import Dict, List, Optional
 
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
- from .constants import DATASET_SCHEMA_VERSION
 from .backends import get_backend
+ from .constants import DATASET_SCHEMA_VERSION
 from .corpus import Corpus
 from .models import QueryBudget, RetrievalResult, RetrievalRun
 from .time import utc_now_iso
@@ -45,7 +45,9 @@ class EvaluationQuery(BaseModel):
     @model_validator(mode="after")
     def _require_expectation(self) -> "EvaluationQuery":
         if not self.expected_item_id and not self.expected_source_uri:
-             raise ValueError("Evaluation queries must include expected_item_id or expected_source_uri")
+             raise ValueError(
+                 "Evaluation queries must include expected_item_id or expected_source_uri"
+             )
         return self
 
 
@@ -114,7 +116,6 @@ def load_dataset(path: Path) -> EvaluationDataset:
     :return: Parsed evaluation dataset.
     :rtype: EvaluationDataset
     """
-
     data = json.loads(path.read_text(encoding="utf-8"))
     return EvaluationDataset.model_validate(data)
 
@@ -140,7 +141,6 @@ def evaluate_run(
     :return: Evaluation result bundle.
     :rtype: EvaluationResult
     """
-
     backend = get_backend(run.recipe.backend_id)
     latency_seconds: List[float] = []
     hit_count = 0
@@ -200,7 +200,6 @@ def _expected_rank(result: RetrievalResult, query: EvaluationQuery) -> Optional[
     :return: Rank of the first matching evidence item, or None.
     :rtype: int or None
     """
-
     for evidence in result.evidence:
         if query.expected_item_id and evidence.item_id == query.expected_item_id:
             return evidence.rank
@@ -218,7 +217,6 @@ def _average_latency_milliseconds(latencies: List[float]) -> float:
     :return: Average latency in milliseconds.
     :rtype: float
     """
-
     if not latencies:
         return 0.0
     return sum(latencies) / len(latencies) * 1000.0
@@ -233,7 +231,6 @@ def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
     :return: Percentile 95 latency in milliseconds.
     :rtype: float
     """
-
     if not latencies:
         return 0.0
     sorted_latencies = sorted(latencies)
@@ -252,7 +249,6 @@ def _run_artifact_bytes(corpus: Corpus, run: RetrievalRun) -> int:
     :return: Total artifact bytes.
     :rtype: int
     """
-
     total_bytes = 0
     for artifact_relpath in run.artifact_paths:
         artifact_path = corpus.root / artifact_relpath
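The unchanged body of _average_latency_milliseconds averages the per-query latencies in seconds and only then scales to milliseconds; a quick worked example of that arithmetic, purely illustrative:

# Queries measured at 0.25 s and 0.75 s average to 0.5 s, reported as 500.0 ms.
latencies_seconds = [0.25, 0.75]
average_milliseconds = sum(latencies_seconds) / len(latencies_seconds) * 1000.0
assert average_milliseconds == 500.0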
biblicus/extraction.py ADDED
@@ -0,0 +1,529 @@
+ """
+ Text extraction runs for Biblicus.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+ from uuid import uuid4
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+ from .corpus import Corpus
+ from .errors import ExtractionRunFatalError
+ from .extractors import get_extractor
+ from .extractors.base import TextExtractor
+ from .extractors.pipeline import PipelineExtractorConfig, PipelineStepSpec
+ from .models import CatalogItem, ExtractionStepOutput
+ from .retrieval import hash_text
+ from .time import utc_now_iso
+
+
+ class ExtractionRecipeManifest(BaseModel):
+     """
+     Reproducible configuration for an extraction plugin run.
+
+     :ivar recipe_id: Deterministic recipe identifier.
+     :vartype recipe_id: str
+     :ivar extractor_id: Extractor plugin identifier.
+     :vartype extractor_id: str
+     :ivar name: Human-readable recipe name.
+     :vartype name: str
+     :ivar created_at: International Organization for Standardization 8601 timestamp.
+     :vartype created_at: str
+     :ivar config: Extractor-specific configuration values.
+     :vartype config: dict[str, Any]
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     recipe_id: str
+     extractor_id: str
+     name: str
+     created_at: str
+     config: Dict[str, Any] = Field(default_factory=dict)
+
+
+ class ExtractionStepResult(BaseModel):
+     """
+     Per-item result record for a single pipeline step.
+
+     :ivar step_index: One-based pipeline step index.
+     :vartype step_index: int
+     :ivar extractor_id: Extractor identifier for the step.
+     :vartype extractor_id: str
+     :ivar status: Step status, extracted, skipped, or errored.
+     :vartype status: str
+     :ivar text_relpath: Relative path to the step text artifact, when extracted.
+     :vartype text_relpath: str or None
+     :ivar text_characters: Character count of the extracted text.
+     :vartype text_characters: int
+     :ivar producer_extractor_id: Extractor identifier that produced the text content.
+     :vartype producer_extractor_id: str or None
+     :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
+     :vartype source_step_index: int or None
+     :ivar error_type: Optional error type name for errored steps.
+     :vartype error_type: str or None
+     :ivar error_message: Optional error message for errored steps.
+     :vartype error_message: str or None
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     step_index: int = Field(ge=1)
+     extractor_id: str
+     status: str
+     text_relpath: Optional[str] = None
+     text_characters: int = Field(default=0, ge=0)
+     producer_extractor_id: Optional[str] = None
+     source_step_index: Optional[int] = Field(default=None, ge=1)
+     error_type: Optional[str] = None
+     error_message: Optional[str] = None
+
+
+ class ExtractionItemResult(BaseModel):
+     """
+     Per-item result record for an extraction run.
+
+     :ivar item_id: Item identifier.
+     :vartype item_id: str
+     :ivar status: Final result status, extracted, skipped, or errored.
+     :vartype status: str
+     :ivar final_text_relpath: Relative path to the final extracted text artifact, when extracted.
+     :vartype final_text_relpath: str or None
+     :ivar final_step_index: Pipeline step index that produced the final text.
+     :vartype final_step_index: int or None
+     :ivar final_step_extractor_id: Extractor identifier of the step that produced the final text.
+     :vartype final_step_extractor_id: str or None
+     :ivar final_producer_extractor_id: Extractor identifier that produced the final text content.
+     :vartype final_producer_extractor_id: str or None
+     :ivar final_source_step_index: Optional step index that supplied the final text for selection-style extractors.
+     :vartype final_source_step_index: int or None
+     :ivar error_type: Optional error type name when no extracted text was produced.
+     :vartype error_type: str or None
+     :ivar error_message: Optional error message when no extracted text was produced.
+     :vartype error_message: str or None
+     :ivar step_results: Per-step results recorded for this item.
+     :vartype step_results: list[ExtractionStepResult]
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     item_id: str
+     status: str
+     final_text_relpath: Optional[str] = None
+     final_step_index: Optional[int] = Field(default=None, ge=1)
+     final_step_extractor_id: Optional[str] = None
+     final_producer_extractor_id: Optional[str] = None
+     final_source_step_index: Optional[int] = Field(default=None, ge=1)
+     error_type: Optional[str] = None
+     error_message: Optional[str] = None
+     step_results: List[ExtractionStepResult] = Field(default_factory=list)
+
+
+ class ExtractionRunManifest(BaseModel):
+     """
+     Immutable record describing an extraction run.
+
+     :ivar run_id: Unique run identifier.
+     :vartype run_id: str
+     :ivar recipe: Recipe manifest for this run.
+     :vartype recipe: ExtractionRecipeManifest
+     :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
+     :vartype corpus_uri: str
+     :ivar catalog_generated_at: Catalog timestamp used for the run.
+     :vartype catalog_generated_at: str
+     :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+     :vartype created_at: str
+     :ivar items: Per-item results.
+     :vartype items: list[ExtractionItemResult]
+     :ivar stats: Run statistics.
+     :vartype stats: dict[str, Any]
+     """
+
+     model_config = ConfigDict(extra="forbid")
+
+     run_id: str
+     recipe: ExtractionRecipeManifest
+     corpus_uri: str
+     catalog_generated_at: str
+     created_at: str
+     items: List[ExtractionItemResult] = Field(default_factory=list)
+     stats: Dict[str, Any] = Field(default_factory=dict)
+
+
+ def create_extraction_recipe_manifest(
+     *, extractor_id: str, name: str, config: Dict[str, Any]
+ ) -> ExtractionRecipeManifest:
+     """
+     Create a deterministic extraction recipe manifest.
+
+     :param extractor_id: Extractor plugin identifier.
+     :type extractor_id: str
+     :param name: Human recipe name.
+     :type name: str
+     :param config: Extractor configuration.
+     :type config: dict[str, Any]
+     :return: Recipe manifest.
+     :rtype: ExtractionRecipeManifest
+     """
+     recipe_payload = json.dumps(
+         {"extractor_id": extractor_id, "name": name, "config": config}, sort_keys=True
+     )
+     recipe_id = hash_text(recipe_payload)
+     return ExtractionRecipeManifest(
+         recipe_id=recipe_id,
+         extractor_id=extractor_id,
+         name=name,
+         created_at=utc_now_iso(),
+         config=config,
+     )
+
+
+ def create_extraction_run_manifest(
+     corpus: Corpus, *, recipe: ExtractionRecipeManifest
+ ) -> ExtractionRunManifest:
+     """
+     Create a new extraction run manifest for a corpus.
+
+     :param corpus: Corpus associated with the run.
+     :type corpus: Corpus
+     :param recipe: Recipe manifest.
+     :type recipe: ExtractionRecipeManifest
+     :return: Run manifest.
+     :rtype: ExtractionRunManifest
+     """
+     catalog = corpus.load_catalog()
+     return ExtractionRunManifest(
+         run_id=str(uuid4()),
+         recipe=recipe,
+         corpus_uri=corpus.uri,
+         catalog_generated_at=catalog.generated_at,
+         created_at=utc_now_iso(),
+         items=[],
+         stats={},
+     )
+
+
+ def write_extraction_run_manifest(*, run_dir: Path, manifest: ExtractionRunManifest) -> None:
+     """
+     Persist an extraction run manifest to a run directory.
+
+     :param run_dir: Extraction run directory.
+     :type run_dir: Path
+     :param manifest: Run manifest to write.
+     :type manifest: ExtractionRunManifest
+     :return: None.
+     :rtype: None
+     """
+     manifest_path = run_dir / "manifest.json"
+     manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
+
+
+ def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str) -> str:
+     """
+     Write an extracted text artifact for an item into the run directory.
+
+     :param run_dir: Extraction run directory.
+     :type run_dir: Path
+     :param item: Catalog item being extracted.
+     :type item: CatalogItem
+     :param text: Extracted text.
+     :type text: str
+     :return: Relative path to the stored text artifact.
+     :rtype: str
+     """
+     text_dir = run_dir / "text"
+     text_dir.mkdir(parents=True, exist_ok=True)
+     relpath = str(Path("text") / f"{item.id}.txt")
+     path = run_dir / relpath
+     path.write_text(text, encoding="utf-8")
+     return relpath
+
+
+ def _pipeline_step_dir_name(*, step_index: int, extractor_id: str) -> str:
+     """
+     Build a stable directory name for a pipeline step.
+
+     :param step_index: One-based pipeline step index.
+     :type step_index: int
+     :param extractor_id: Extractor identifier for the step.
+     :type extractor_id: str
+     :return: Directory name for the step.
+     :rtype: str
+     """
+     return f"{step_index:02d}-{extractor_id}"
+
+
+ def write_pipeline_step_text_artifact(
+     *,
+     run_dir: Path,
+     step_index: int,
+     extractor_id: str,
+     item: CatalogItem,
+     text: str,
+ ) -> str:
+     """
+     Write a pipeline step text artifact for an item.
+
+     :param run_dir: Extraction run directory.
+     :type run_dir: Path
+     :param step_index: One-based pipeline step index.
+     :type step_index: int
+     :param extractor_id: Extractor identifier for the step.
+     :type extractor_id: str
+     :param item: Catalog item being extracted.
+     :type item: CatalogItem
+     :param text: Extracted text content.
+     :type text: str
+     :return: Relative path to the stored step text artifact.
+     :rtype: str
+     """
+     step_dir_name = _pipeline_step_dir_name(step_index=step_index, extractor_id=extractor_id)
+     text_dir = run_dir / "steps" / step_dir_name / "text"
+     text_dir.mkdir(parents=True, exist_ok=True)
+     relpath = str(Path("steps") / step_dir_name / "text" / f"{item.id}.txt")
+     (run_dir / relpath).write_text(text, encoding="utf-8")
+     return relpath
+
+
+ def _final_output_from_steps(
+     step_outputs: List[ExtractionStepOutput],
+ ) -> Optional[ExtractionStepOutput]:
+     """
+     Select the final pipeline output for an item.
+
+     The final output is the last extracted step output in pipeline order.
+
+     :param step_outputs: Extracted outputs produced by pipeline steps.
+     :type step_outputs: list[biblicus.models.ExtractionStepOutput]
+     :return: Final step output or None when no steps produced extracted text.
+     :rtype: biblicus.models.ExtractionStepOutput or None
+     """
+     if not step_outputs:
+         return None
+     return step_outputs[-1]
+
+
+ def build_extraction_run(
+     corpus: Corpus,
+     *,
+     extractor_id: str,
+     recipe_name: str,
+     config: Dict[str, Any],
+ ) -> ExtractionRunManifest:
+     """
+     Build an extraction run for a corpus using the pipeline extractor.
+
+     :param corpus: Corpus to extract from.
+     :type corpus: Corpus
+     :param extractor_id: Extractor plugin identifier (must be ``pipeline``).
+     :type extractor_id: str
+     :param recipe_name: Human-readable recipe name.
+     :type recipe_name: str
+     :param config: Extractor configuration mapping.
+     :type config: dict[str, Any]
+     :return: Extraction run manifest describing the build.
+     :rtype: ExtractionRunManifest
+     :raises KeyError: If the extractor identifier is unknown.
+     :raises ValueError: If the extractor configuration is invalid.
+     :raises OSError: If the run directory or artifacts cannot be written.
+     :raises ExtractionRunFatalError: If the extractor is not the pipeline.
+     """
+     extractor = get_extractor(extractor_id)
+     parsed_config = extractor.validate_config(config)
+     recipe = create_extraction_recipe_manifest(
+         extractor_id=extractor_id,
+         name=recipe_name,
+         config=parsed_config.model_dump(),
+     )
+     manifest = create_extraction_run_manifest(corpus, recipe=recipe)
+     run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
+     run_dir.mkdir(parents=True, exist_ok=False)
+
+     catalog = corpus.load_catalog()
+     if extractor_id != "pipeline":
+         raise ExtractionRunFatalError("Extraction runs must use the pipeline extractor")
+
+     pipeline_config = (
+         parsed_config
+         if isinstance(parsed_config, PipelineExtractorConfig)
+         else PipelineExtractorConfig.model_validate(parsed_config)
+     )
+
+     validated_steps: List[Tuple[PipelineStepSpec, TextExtractor, BaseModel]] = []
+     for step in pipeline_config.steps:
+         step_extractor = get_extractor(step.extractor_id)
+         parsed_step_config = step_extractor.validate_config(step.config)
+         validated_steps.append((step, step_extractor, parsed_step_config))
+
+     extracted_items: List[ExtractionItemResult] = []
+     extracted_count = 0
+     skipped_count = 0
+     errored_count = 0
+     extracted_nonempty_count = 0
+     extracted_empty_count = 0
+     already_text_item_count = 0
+     needs_extraction_item_count = 0
+     converted_item_count = 0
+
+     for item in catalog.items.values():
+         media_type = item.media_type
+         item_is_text = media_type == "text/markdown" or media_type.startswith("text/")
+         if item_is_text:
+             already_text_item_count += 1
+         else:
+             needs_extraction_item_count += 1
+
+         step_results: List[ExtractionStepResult] = []
+         step_outputs: List[ExtractionStepOutput] = []
+         last_error_type: Optional[str] = None
+         last_error_message: Optional[str] = None
+
+         for step_index, (step, step_extractor, parsed_step_config) in enumerate(
+             validated_steps, start=1
+         ):
+             try:
+                 extracted_text = step_extractor.extract_text(
+                     corpus=corpus,
+                     item=item,
+                     config=parsed_step_config,
+                     previous_extractions=step_outputs,
+                 )
+             except Exception as extraction_error:
+                 if isinstance(extraction_error, ExtractionRunFatalError):
+                     raise
+                 last_error_type = extraction_error.__class__.__name__
+                 last_error_message = str(extraction_error)
+                 step_results.append(
+                     ExtractionStepResult(
+                         step_index=step_index,
+                         extractor_id=step.extractor_id,
+                         status="errored",
+                         text_relpath=None,
+                         text_characters=0,
+                         producer_extractor_id=None,
+                         source_step_index=None,
+                         error_type=last_error_type,
+                         error_message=last_error_message,
+                     )
+                 )
+                 continue
+
+             if extracted_text is None:
+                 step_results.append(
+                     ExtractionStepResult(
+                         step_index=step_index,
+                         extractor_id=step.extractor_id,
+                         status="skipped",
+                         text_relpath=None,
+                         text_characters=0,
+                         producer_extractor_id=None,
+                         source_step_index=None,
+                         error_type=None,
+                         error_message=None,
+                     )
+                 )
+                 continue
+
+             relpath = write_pipeline_step_text_artifact(
+                 run_dir=run_dir,
+                 step_index=step_index,
+                 extractor_id=step.extractor_id,
+                 item=item,
+                 text=extracted_text.text,
+             )
+             text_characters = len(extracted_text.text)
+             step_results.append(
+                 ExtractionStepResult(
+                     step_index=step_index,
+                     extractor_id=step.extractor_id,
+                     status="extracted",
+                     text_relpath=relpath,
+                     text_characters=text_characters,
+                     producer_extractor_id=extracted_text.producer_extractor_id,
+                     source_step_index=extracted_text.source_step_index,
+                     error_type=None,
+                     error_message=None,
+                 )
+             )
+             step_outputs.append(
+                 ExtractionStepOutput(
+                     step_index=step_index,
+                     extractor_id=step.extractor_id,
+                     status="extracted",
+                     text=extracted_text.text,
+                     text_characters=text_characters,
+                     producer_extractor_id=extracted_text.producer_extractor_id,
+                     source_step_index=extracted_text.source_step_index,
+                     error_type=None,
+                     error_message=None,
+                 )
+             )
+
+         final_output = _final_output_from_steps(step_outputs)
+         if final_output is None:
+             status = "errored" if last_error_type else "skipped"
+             if status == "errored":
+                 errored_count += 1
+             else:
+                 skipped_count += 1
+             extracted_items.append(
+                 ExtractionItemResult(
+                     item_id=item.id,
+                     status=status,
+                     final_text_relpath=None,
+                     final_step_index=None,
+                     final_step_extractor_id=None,
+                     final_producer_extractor_id=None,
+                     final_source_step_index=None,
+                     error_type=last_error_type if status == "errored" else None,
+                     error_message=last_error_message if status == "errored" else None,
+                     step_results=step_results,
+                 )
+             )
+             continue
+
+         final_text = final_output.text or ""
+         final_text_relpath = write_extracted_text_artifact(
+             run_dir=run_dir, item=item, text=final_text
+         )
+         extracted_count += 1
+         if final_text.strip():
+             extracted_nonempty_count += 1
+             if not item_is_text:
+                 converted_item_count += 1
+         else:
+             extracted_empty_count += 1
+
+         extracted_items.append(
+             ExtractionItemResult(
+                 item_id=item.id,
+                 status="extracted",
+                 final_text_relpath=final_text_relpath,
+                 final_step_index=final_output.step_index,
+                 final_step_extractor_id=final_output.extractor_id,
+                 final_producer_extractor_id=final_output.producer_extractor_id,
+                 final_source_step_index=final_output.source_step_index,
+                 error_type=None,
+                 error_message=None,
+                 step_results=step_results,
+             )
+         )
+
+     stats = {
+         "total_items": len(catalog.items),
+         "already_text_items": already_text_item_count,
+         "needs_extraction_items": needs_extraction_item_count,
+         "extracted_items": extracted_count,
+         "extracted_nonempty_items": extracted_nonempty_count,
+         "extracted_empty_items": extracted_empty_count,
+         "skipped_items": skipped_count,
+         "errored_items": errored_count,
+         "converted_items": converted_item_count,
+     }
+     manifest = manifest.model_copy(update={"items": extracted_items, "stats": stats})
+     write_extraction_run_manifest(run_dir=run_dir, manifest=manifest)
+     return manifest
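A minimal usage sketch of build_extraction_run under stated assumptions: the Corpus instance already exists, and the step identifiers "pdf_text" and "pass_through_text" are assumed names for the built-in step extractors (the diff only confirms the top-level "pipeline" identifier and the steps/extractor_id/config shape):

from biblicus.corpus import Corpus
from biblicus.extraction import build_extraction_run


def extract_corpus_text(corpus: Corpus) -> None:
    # The run-level extractor must be "pipeline"; anything else raises ExtractionRunFatalError.
    manifest = build_extraction_run(
        corpus,
        extractor_id="pipeline",
        recipe_name="example pipeline recipe",
        config={
            "steps": [
                {"extractor_id": "pdf_text", "config": {}},  # assumed step identifier
                {"extractor_id": "pass_through_text", "config": {}},  # assumed step identifier
            ]
        },
    )
    # Per-run statistics are written into the manifest at the end of the run.
    print(manifest.stats["extracted_items"], manifest.stats["errored_items"])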
biblicus/extractors/__init__.py ADDED
@@ -0,0 +1,44 @@
+ """
+ Text extraction plugins for Biblicus.
+ """
+
+ from __future__ import annotations
+
+ from typing import Dict
+
+ from .base import TextExtractor
+ from .metadata_text import MetadataTextExtractor
+ from .openai_stt import OpenAiSpeechToTextExtractor
+ from .pass_through_text import PassThroughTextExtractor
+ from .pdf_text import PortableDocumentFormatTextExtractor
+ from .pipeline import PipelineExtractor
+ from .rapidocr_text import RapidOcrExtractor
+ from .select_longest_text import SelectLongestTextExtractor
+ from .select_text import SelectTextExtractor
+ from .unstructured_text import UnstructuredExtractor
+
+
+ def get_extractor(extractor_id: str) -> TextExtractor:
+     """
+     Resolve a built-in text extractor by identifier.
+
+     :param extractor_id: Extractor identifier.
+     :type extractor_id: str
+     :return: Extractor plugin instance.
+     :rtype: TextExtractor
+     :raises KeyError: If the extractor identifier is not known.
+     """
+     extractors: Dict[str, TextExtractor] = {
+         MetadataTextExtractor.extractor_id: MetadataTextExtractor(),
+         PassThroughTextExtractor.extractor_id: PassThroughTextExtractor(),
+         PipelineExtractor.extractor_id: PipelineExtractor(),
+         PortableDocumentFormatTextExtractor.extractor_id: PortableDocumentFormatTextExtractor(),
+         OpenAiSpeechToTextExtractor.extractor_id: OpenAiSpeechToTextExtractor(),
+         RapidOcrExtractor.extractor_id: RapidOcrExtractor(),
+         SelectTextExtractor.extractor_id: SelectTextExtractor(),
+         SelectLongestTextExtractor.extractor_id: SelectLongestTextExtractor(),
+         UnstructuredExtractor.extractor_id: UnstructuredExtractor(),
+     }
+     if extractor_id not in extractors:
+         raise KeyError(f"Unknown extractor: {extractor_id!r}")
+     return extractors[extractor_id]
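Illustrative use of the registry above, assuming the pipeline extractor registers under the "pipeline" identifier that build_extraction_run requires; unknown identifiers raise KeyError:

from biblicus.extractors import get_extractor

# Resolve a known extractor by identifier.
pipeline_extractor = get_extractor("pipeline")
print(type(pipeline_extractor).__name__)

# Unknown identifiers surface as KeyError with the offending identifier in the message.
try:
    get_extractor("no-such-extractor")
except KeyError as lookup_error:
    print(lookup_error)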