biblicus-1.0.0-py3-none-any.whl → biblicus-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
  43. biblicus-1.1.0.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/extraction.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """
2
- Text extraction runs for Biblicus.
2
+ Text extraction snapshots for Biblicus.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Tuple
11
11
  from pydantic import BaseModel, ConfigDict, Field
12
12
 
13
13
  from .corpus import Corpus
14
- from .errors import ExtractionRunFatalError
14
+ from .errors import ExtractionSnapshotFatalError
15
15
  from .extractors import get_extractor
16
16
  from .extractors.base import TextExtractor
17
17
  from .extractors.pipeline import PipelineExtractorConfig, PipelineStepSpec
@@ -20,29 +20,29 @@ from .retrieval import hash_text
20
20
  from .time import utc_now_iso
21
21
 
22
22
 
23
- class ExtractionRecipeManifest(BaseModel):
23
+ class ExtractionConfigurationManifest(BaseModel):
24
24
  """
25
- Reproducible configuration for an extraction plugin run.
25
+ Reproducible configuration for an extraction plugin snapshot.
26
26
 
27
- :ivar recipe_id: Deterministic recipe identifier.
28
- :vartype recipe_id: str
27
+ :ivar configuration_id: Deterministic configuration identifier.
28
+ :vartype configuration_id: str
29
29
  :ivar extractor_id: Extractor plugin identifier.
30
30
  :vartype extractor_id: str
31
- :ivar name: Human-readable recipe name.
31
+ :ivar name: Human-readable configuration name.
32
32
  :vartype name: str
33
33
  :ivar created_at: International Organization for Standardization 8601 timestamp.
34
34
  :vartype created_at: str
35
- :ivar config: Extractor-specific configuration values.
36
- :vartype config: dict[str, Any]
35
+ :ivar configuration: Extractor-specific configuration values.
36
+ :vartype configuration: dict[str, Any]
37
37
  """
38
38
 
39
39
  model_config = ConfigDict(extra="forbid")
40
40
 
41
- recipe_id: str
41
+ configuration_id: str
42
42
  extractor_id: str
43
43
  name: str
44
44
  created_at: str
45
- config: Dict[str, Any] = Field(default_factory=dict)
45
+ configuration: Dict[str, Any] = Field(default_factory=dict)
46
46
 
47
47
 
48
48
  class ExtractionStepResult(BaseModel):
@@ -87,7 +87,7 @@ class ExtractionStepResult(BaseModel):
87
87
 
88
88
  class ExtractionItemResult(BaseModel):
89
89
  """
90
- Per-item result record for an extraction run.
90
+ Per-item result record for an extraction snapshot.
91
91
 
92
92
  :ivar item_id: Item identifier.
93
93
  :vartype item_id: str
@@ -125,30 +125,30 @@ class ExtractionItemResult(BaseModel):
125
125
  step_results: List[ExtractionStepResult] = Field(default_factory=list)
126
126
 
127
127
 
128
- class ExtractionRunManifest(BaseModel):
128
+ class ExtractionSnapshotManifest(BaseModel):
129
129
  """
130
- Immutable record describing an extraction run.
130
+ Immutable record describing an extraction snapshot.
131
131
 
132
- :ivar run_id: Unique run identifier.
133
- :vartype run_id: str
134
- :ivar recipe: Recipe manifest for this run.
135
- :vartype recipe: ExtractionRecipeManifest
132
+ :ivar snapshot_id: Unique snapshot identifier.
133
+ :vartype snapshot_id: str
134
+ :ivar configuration: Configuration manifest for this snapshot.
135
+ :vartype configuration: ExtractionConfigurationManifest
136
136
  :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
137
137
  :vartype corpus_uri: str
138
- :ivar catalog_generated_at: Catalog timestamp used for the run.
138
+ :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
139
139
  :vartype catalog_generated_at: str
140
- :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
140
+ :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
141
141
  :vartype created_at: str
142
142
  :ivar items: Per-item results.
143
143
  :vartype items: list[ExtractionItemResult]
144
- :ivar stats: Run statistics.
144
+ :ivar stats: Snapshot statistics.
145
145
  :vartype stats: dict[str, Any]
146
146
  """
147
147
 
148
148
  model_config = ConfigDict(extra="forbid")
149
149
 
150
- run_id: str
151
- recipe: ExtractionRecipeManifest
150
+ snapshot_id: str
151
+ configuration: ExtractionConfigurationManifest
152
152
  corpus_uri: str
153
153
  catalog_generated_at: str
154
154
  created_at: str
@@ -156,52 +156,53 @@ class ExtractionRunManifest(BaseModel):
156
156
  stats: Dict[str, Any] = Field(default_factory=dict)
157
157
 
158
158
 
159
- def create_extraction_recipe_manifest(
160
- *, extractor_id: str, name: str, config: Dict[str, Any]
161
- ) -> ExtractionRecipeManifest:
159
+ def create_extraction_configuration_manifest(
160
+ *, extractor_id: str, name: str, configuration: Dict[str, Any]
161
+ ) -> ExtractionConfigurationManifest:
162
162
  """
163
- Create a deterministic extraction recipe manifest.
163
+ Create a deterministic extraction configuration manifest.
164
164
 
165
165
  :param extractor_id: Extractor plugin identifier.
166
166
  :type extractor_id: str
167
- :param name: Human recipe name.
167
+ :param name: Human configuration name.
168
168
  :type name: str
169
- :param config: Extractor configuration.
170
- :type config: dict[str, Any]
171
- :return: Recipe manifest.
172
- :rtype: ExtractionRecipeManifest
169
+ :param configuration: Extractor configuration.
170
+ :type configuration: dict[str, Any]
171
+ :return: Configuration manifest.
172
+ :rtype: ExtractionConfigurationManifest
173
173
  """
174
- recipe_payload = json.dumps(
175
- {"extractor_id": extractor_id, "name": name, "config": config}, sort_keys=True
174
+ configuration_payload = json.dumps(
175
+ {"extractor_id": extractor_id, "name": name, "configuration": configuration},
176
+ sort_keys=True,
176
177
  )
177
- recipe_id = hash_text(recipe_payload)
178
- return ExtractionRecipeManifest(
179
- recipe_id=recipe_id,
178
+ configuration_id = hash_text(configuration_payload)
179
+ return ExtractionConfigurationManifest(
180
+ configuration_id=configuration_id,
180
181
  extractor_id=extractor_id,
181
182
  name=name,
182
183
  created_at=utc_now_iso(),
183
- config=config,
184
+ configuration=configuration,
184
185
  )
185
186
 
186
187
 
187
- def create_extraction_run_manifest(
188
- corpus: Corpus, *, recipe: ExtractionRecipeManifest
189
- ) -> ExtractionRunManifest:
188
+ def create_extraction_snapshot_manifest(
189
+ corpus: Corpus, *, configuration: ExtractionConfigurationManifest
190
+ ) -> ExtractionSnapshotManifest:
190
191
  """
191
- Create a new extraction run manifest for a corpus.
192
+ Create a new extraction snapshot manifest for a corpus.
192
193
 
193
- :param corpus: Corpus associated with the run.
194
+ :param corpus: Corpus associated with the snapshot.
194
195
  :type corpus: Corpus
195
- :param recipe: Recipe manifest.
196
- :type recipe: ExtractionRecipeManifest
197
- :return: Run manifest.
198
- :rtype: ExtractionRunManifest
196
+ :param configuration: Configuration manifest.
197
+ :type configuration: ExtractionConfigurationManifest
198
+ :return: Snapshot manifest.
199
+ :rtype: ExtractionSnapshotManifest
199
200
  """
200
201
  catalog = corpus.load_catalog()
201
- run_id = hash_text(f"{recipe.recipe_id}:{catalog.generated_at}")
202
- return ExtractionRunManifest(
203
- run_id=run_id,
204
- recipe=recipe,
202
+ snapshot_id = hash_text(f"{configuration.configuration_id}:{catalog.generated_at}")
203
+ return ExtractionSnapshotManifest(
204
+ snapshot_id=snapshot_id,
205
+ configuration=configuration,
205
206
  corpus_uri=corpus.uri,
206
207
  catalog_generated_at=catalog.generated_at,
207
208
  created_at=utc_now_iso(),
@@ -210,27 +211,29 @@ def create_extraction_run_manifest(
210
211
  )
211
212
 
212
213
 
213
- def write_extraction_run_manifest(*, run_dir: Path, manifest: ExtractionRunManifest) -> None:
214
+ def write_extraction_snapshot_manifest(
215
+ *, snapshot_dir: Path, manifest: ExtractionSnapshotManifest
216
+ ) -> None:
214
217
  """
215
- Persist an extraction run manifest to a run directory.
218
+ Persist an extraction snapshot manifest to a snapshot directory.
216
219
 
217
- :param run_dir: Extraction run directory.
218
- :type run_dir: Path
219
- :param manifest: Run manifest to write.
220
- :type manifest: ExtractionRunManifest
220
+ :param snapshot_dir: Extraction snapshot directory.
221
+ :type snapshot_dir: Path
222
+ :param manifest: Snapshot manifest to write.
223
+ :type manifest: ExtractionSnapshotManifest
221
224
  :return: None.
222
225
  :rtype: None
223
226
  """
224
- manifest_path = run_dir / "manifest.json"
227
+ manifest_path = snapshot_dir / "manifest.json"
225
228
  manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
226
229
 
227
230
 
228
- def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str) -> str:
231
+ def write_extracted_text_artifact(*, snapshot_dir: Path, item: CatalogItem, text: str) -> str:
229
232
  """
230
- Write an extracted text artifact for an item into the run directory.
233
+ Write an extracted text artifact for an item into the snapshot directory.
231
234
 
232
- :param run_dir: Extraction run directory.
233
- :type run_dir: Path
235
+ :param snapshot_dir: Extraction snapshot directory.
236
+ :type snapshot_dir: Path
234
237
  :param item: Catalog item being extracted.
235
238
  :type item: CatalogItem
236
239
  :param text: Extracted text.
@@ -238,10 +241,10 @@ def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str
238
241
  :return: Relative path to the stored text artifact.
239
242
  :rtype: str
240
243
  """
241
- text_dir = run_dir / "text"
244
+ text_dir = snapshot_dir / "text"
242
245
  text_dir.mkdir(parents=True, exist_ok=True)
243
246
  relpath = str(Path("text") / f"{item.id}.txt")
244
- path = run_dir / relpath
247
+ path = snapshot_dir / relpath
245
248
  path.write_text(text, encoding="utf-8")
246
249
  return relpath
247
250
 
@@ -262,7 +265,7 @@ def _pipeline_step_dir_name(*, step_index: int, extractor_id: str) -> str:
262
265
 
263
266
  def write_pipeline_step_text_artifact(
264
267
  *,
265
- run_dir: Path,
268
+ snapshot_dir: Path,
266
269
  step_index: int,
267
270
  extractor_id: str,
268
271
  item: CatalogItem,
@@ -271,8 +274,8 @@ def write_pipeline_step_text_artifact(
271
274
  """
272
275
  Write a pipeline step text artifact for an item.
273
276
 
274
- :param run_dir: Extraction run directory.
275
- :type run_dir: Path
277
+ :param snapshot_dir: Extraction snapshot directory.
278
+ :type snapshot_dir: Path
276
279
  :param step_index: One-based pipeline step index.
277
280
  :type step_index: int
278
281
  :param extractor_id: Extractor identifier for the step.
@@ -285,10 +288,10 @@ def write_pipeline_step_text_artifact(
285
288
  :rtype: str
286
289
  """
287
290
  step_dir_name = _pipeline_step_dir_name(step_index=step_index, extractor_id=extractor_id)
288
- text_dir = run_dir / "steps" / step_dir_name / "text"
291
+ text_dir = snapshot_dir / "steps" / step_dir_name / "text"
289
292
  text_dir.mkdir(parents=True, exist_ok=True)
290
293
  relpath = str(Path("steps") / step_dir_name / "text" / f"{item.id}.txt")
291
- (run_dir / relpath).write_text(text, encoding="utf-8")
294
+ (snapshot_dir / relpath).write_text(text, encoding="utf-8")
292
295
  return relpath
293
296
 
294
297
 
@@ -310,49 +313,51 @@ def _final_output_from_steps(
310
313
  return step_outputs[-1]
311
314
 
312
315
 
313
- def build_extraction_run(
316
+ def build_extraction_snapshot(
314
317
  corpus: Corpus,
315
318
  *,
316
319
  extractor_id: str,
317
- recipe_name: str,
318
- config: Dict[str, Any],
319
- ) -> ExtractionRunManifest:
320
+ configuration_name: str,
321
+ configuration: Dict[str, Any],
322
+ ) -> ExtractionSnapshotManifest:
320
323
  """
321
- Build an extraction run for a corpus using the pipeline extractor.
324
+ Build an extraction snapshot for a corpus using the pipeline extractor.
322
325
 
323
326
  :param corpus: Corpus to extract from.
324
327
  :type corpus: Corpus
325
328
  :param extractor_id: Extractor plugin identifier (must be ``pipeline``).
326
329
  :type extractor_id: str
327
- :param recipe_name: Human-readable recipe name.
328
- :type recipe_name: str
329
- :param config: Extractor configuration mapping.
330
- :type config: dict[str, Any]
331
- :return: Extraction run manifest describing the build.
332
- :rtype: ExtractionRunManifest
330
+ :param configuration_name: Human-readable configuration name.
331
+ :type configuration_name: str
332
+ :param configuration: Extractor configuration mapping.
333
+ :type configuration: dict[str, Any]
334
+ :return: Extraction snapshot manifest describing the build.
335
+ :rtype: ExtractionSnapshotManifest
333
336
  :raises KeyError: If the extractor identifier is unknown.
334
337
  :raises ValueError: If the extractor configuration is invalid.
335
- :raises OSError: If the run directory or artifacts cannot be written.
336
- :raises ExtractionRunFatalError: If the extractor is not the pipeline.
338
+ :raises OSError: If the snapshot directory or artifacts cannot be written.
339
+ :raises ExtractionSnapshotFatalError: If the extractor is not the pipeline.
337
340
  """
338
341
  extractor = get_extractor(extractor_id)
339
- parsed_config = extractor.validate_config(config)
340
- recipe = create_extraction_recipe_manifest(
342
+ parsed_config = extractor.validate_config(configuration)
343
+ config_manifest = create_extraction_configuration_manifest(
341
344
  extractor_id=extractor_id,
342
- name=recipe_name,
343
- config=parsed_config.model_dump(),
345
+ name=configuration_name,
346
+ configuration=parsed_config.model_dump(),
344
347
  )
345
- manifest = create_extraction_run_manifest(corpus, recipe=recipe)
346
- run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
347
- if run_dir.exists():
348
- return corpus.load_extraction_run_manifest(
349
- extractor_id=extractor_id, run_id=manifest.run_id
348
+ manifest = create_extraction_snapshot_manifest(corpus, configuration=config_manifest)
349
+ snapshot_dir = corpus.extraction_snapshot_dir(
350
+ extractor_id=extractor_id, snapshot_id=manifest.snapshot_id
351
+ )
352
+ if snapshot_dir.exists():
353
+ return corpus.load_extraction_snapshot_manifest(
354
+ extractor_id=extractor_id, snapshot_id=manifest.snapshot_id
350
355
  )
351
- run_dir.mkdir(parents=True, exist_ok=False)
356
+ snapshot_dir.mkdir(parents=True, exist_ok=False)
352
357
 
353
358
  catalog = corpus.load_catalog()
354
359
  if extractor_id != "pipeline":
355
- raise ExtractionRunFatalError("Extraction runs must use the pipeline extractor")
360
+ raise ExtractionSnapshotFatalError("Extraction snapshots must use the pipeline extractor")
356
361
 
357
362
  pipeline_config = (
358
363
  parsed_config
@@ -363,7 +368,7 @@ def build_extraction_run(
363
368
  validated_steps: List[Tuple[PipelineStepSpec, TextExtractor, BaseModel]] = []
364
369
  for step in pipeline_config.steps:
365
370
  step_extractor = get_extractor(step.extractor_id)
366
- parsed_step_config = step_extractor.validate_config(step.config)
371
+ parsed_step_config = step_extractor.validate_config(step.configuration)
367
372
  validated_steps.append((step, step_extractor, parsed_step_config))
368
373
 
369
374
  extracted_items: List[ExtractionItemResult] = []
@@ -400,7 +405,7 @@ def build_extraction_run(
400
405
  previous_extractions=step_outputs,
401
406
  )
402
407
  except Exception as extraction_error:
403
- if isinstance(extraction_error, ExtractionRunFatalError):
408
+ if isinstance(extraction_error, ExtractionSnapshotFatalError):
404
409
  raise
405
410
  last_error_type = extraction_error.__class__.__name__
406
411
  last_error_message = str(extraction_error)
@@ -436,7 +441,7 @@ def build_extraction_run(
436
441
  continue
437
442
 
438
443
  relpath = write_pipeline_step_text_artifact(
439
- run_dir=run_dir,
444
+ snapshot_dir=snapshot_dir,
440
445
  step_index=step_index,
441
446
  extractor_id=step.extractor_id,
442
447
  item=item,
@@ -497,7 +502,7 @@ def build_extraction_run(
497
502
 
498
503
  final_text = final_output.text or ""
499
504
  final_text_relpath = write_extracted_text_artifact(
500
- run_dir=run_dir, item=item, text=final_text
505
+ snapshot_dir=snapshot_dir, item=item, text=final_text
501
506
  )
502
507
  extracted_count += 1
503
508
  if final_text.strip():
@@ -534,5 +539,5 @@ def build_extraction_run(
534
539
  "converted_items": converted_item_count,
535
540
  }
536
541
  manifest = manifest.model_copy(update={"items": extracted_items, "stats": stats})
537
- write_extraction_run_manifest(run_dir=run_dir, manifest=manifest)
542
+ write_extraction_snapshot_manifest(snapshot_dir=snapshot_dir, manifest=manifest)
538
543
  return manifest
biblicus/extraction_evaluation.py CHANGED
@@ -13,7 +13,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
13
13
 
14
14
  from .constants import EXTRACTION_DATASET_SCHEMA_VERSION
15
15
  from .corpus import Corpus
16
- from .extraction import ExtractionRunManifest
16
+ from .extraction import ExtractionSnapshotManifest
17
17
  from .models import CatalogItem
18
18
  from .time import utc_now_iso
19
19
 
@@ -118,12 +118,12 @@ class ExtractionEvaluationResult(BaseModel):
118
118
  :vartype dataset: dict[str, object]
119
119
  :ivar extractor_id: Extractor identifier.
120
120
  :vartype extractor_id: str
121
- :ivar run_id: Extraction run identifier.
122
- :vartype run_id: str
123
- :ivar recipe_id: Extraction recipe identifier.
124
- :vartype recipe_id: str
125
- :ivar recipe_name: Extraction recipe name.
126
- :vartype recipe_name: str
121
+ :ivar snapshot_id: Extraction snapshot identifier.
122
+ :vartype snapshot_id: str
123
+ :ivar configuration_id: Extraction configuration identifier.
124
+ :vartype configuration_id: str
125
+ :ivar configuration_name: Extraction configuration name.
126
+ :vartype configuration_name: str
127
127
  :ivar evaluated_at: International Organization for Standardization 8601 timestamp.
128
128
  :vartype evaluated_at: str
129
129
  :ivar metrics: Evaluation metrics for coverage and accuracy.
@@ -136,9 +136,9 @@ class ExtractionEvaluationResult(BaseModel):
136
136
 
137
137
  dataset: Dict[str, object]
138
138
  extractor_id: str
139
- run_id: str
140
- recipe_id: str
141
- recipe_name: str
139
+ snapshot_id: str
140
+ configuration_id: str
141
+ configuration_name: str
142
142
  evaluated_at: str
143
143
  metrics: Dict[str, float]
144
144
  items: List[ExtractionEvaluationItemReport]
@@ -160,21 +160,21 @@ def load_extraction_dataset(path: Path) -> ExtractionEvaluationDataset:
160
160
  return ExtractionEvaluationDataset.model_validate(data)
161
161
 
162
162
 
163
- def evaluate_extraction_run(
163
+ def evaluate_extraction_snapshot(
164
164
  *,
165
165
  corpus: Corpus,
166
- run: ExtractionRunManifest,
166
+ snapshot: ExtractionSnapshotManifest,
167
167
  extractor_id: str,
168
168
  dataset: ExtractionEvaluationDataset,
169
169
  ) -> ExtractionEvaluationResult:
170
170
  """
171
- Evaluate an extraction run against a dataset.
171
+ Evaluate an extraction snapshot against a dataset.
172
172
 
173
- :param corpus: Corpus associated with the run.
173
+ :param corpus: Corpus associated with the snapshot.
174
174
  :type corpus: Corpus
175
- :param run: Extraction run manifest.
176
- :type run: ExtractionRunManifest
177
- :param extractor_id: Extractor identifier for the run.
175
+ :param snapshot: Extraction snapshot manifest.
176
+ :type snapshot: ExtractionSnapshotManifest
177
+ :param extractor_id: Extractor identifier for the snapshot.
178
178
  :type extractor_id: str
179
179
  :param dataset: Extraction evaluation dataset.
180
180
  :type dataset: ExtractionEvaluationDataset
@@ -182,7 +182,7 @@ def evaluate_extraction_run(
182
182
  :rtype: ExtractionEvaluationResult
183
183
  """
184
184
  catalog = corpus.load_catalog()
185
- item_index = {item.item_id: item for item in run.items}
185
+ item_index = {item.item_id: item for item in snapshot.items}
186
186
  coverage_present = 0
187
187
  coverage_empty = 0
188
188
  coverage_missing = 0
@@ -201,7 +201,7 @@ def evaluate_extraction_run(
201
201
  processable += 1
202
202
 
203
203
  extracted_text = corpus.read_extracted_text(
204
- extractor_id=extractor_id, run_id=run.run_id, item_id=item_id
204
+ extractor_id=extractor_id, snapshot_id=snapshot.snapshot_id, item_id=item_id
205
205
  )
206
206
  coverage_status = _coverage_status(extracted_text)
207
207
  if coverage_status == "present":
@@ -245,9 +245,9 @@ def evaluate_extraction_run(
245
245
  return ExtractionEvaluationResult(
246
246
  dataset=dataset_meta,
247
247
  extractor_id=extractor_id,
248
- run_id=run.run_id,
249
- recipe_id=run.recipe.recipe_id,
250
- recipe_name=run.recipe.name,
248
+ snapshot_id=snapshot.snapshot_id,
249
+ configuration_id=snapshot.configuration.configuration_id,
250
+ configuration_name=snapshot.configuration.name,
251
251
  evaluated_at=utc_now_iso(),
252
252
  metrics=metrics,
253
253
  items=item_reports,
@@ -255,21 +255,21 @@ def evaluate_extraction_run(
255
255
 
256
256
 
257
257
  def write_extraction_evaluation_result(
258
- *, corpus: Corpus, run_id: str, result: ExtractionEvaluationResult
258
+ *, corpus: Corpus, snapshot_id: str, result: ExtractionEvaluationResult
259
259
  ) -> Path:
260
260
  """
261
261
  Persist extraction evaluation output under the corpus.
262
262
 
263
263
  :param corpus: Corpus associated with the evaluation.
264
264
  :type corpus: Corpus
265
- :param run_id: Extraction run identifier.
266
- :type run_id: str
265
+ :param snapshot_id: Extraction snapshot identifier.
266
+ :type snapshot_id: str
267
267
  :param result: Evaluation result to write.
268
268
  :type result: ExtractionEvaluationResult
269
269
  :return: Output path.
270
270
  :rtype: Path
271
271
  """
272
- output_dir = corpus.runs_dir / "evaluation" / "extraction" / run_id
272
+ output_dir = corpus.snapshots_dir / "evaluation" / "extraction" / snapshot_id
273
273
  output_dir.mkdir(parents=True, exist_ok=True)
274
274
  output_path = output_dir / "output.json"
275
275
  output_path.write_text(result.model_dump_json(indent=2) + "\n", encoding="utf-8")
biblicus/extractors/deepgram_stt.py CHANGED
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
  from pydantic import BaseModel, ConfigDict, Field
12
12
 
13
13
  from ..corpus import Corpus
14
- from ..errors import ExtractionRunFatalError
14
+ from ..errors import ExtractionSnapshotFatalError
15
15
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
16
  from ..user_config import resolve_deepgram_api_key
17
17
  from .base import TextExtractor
@@ -66,19 +66,19 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
66
66
  :type config: dict[str, Any]
67
67
  :return: Parsed configuration model.
68
68
  :rtype: DeepgramSpeechToTextExtractorConfig
69
- :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
69
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
70
70
  """
71
71
  try:
72
72
  from deepgram import DeepgramClient # noqa: F401
73
73
  except ImportError as import_error:
74
- raise ExtractionRunFatalError(
74
+ raise ExtractionSnapshotFatalError(
75
75
  "Deepgram speech to text extractor requires an optional dependency. "
76
76
  'Install it with pip install "biblicus[deepgram]".'
77
77
  ) from import_error
78
78
 
79
79
  api_key = resolve_deepgram_api_key()
80
80
  if api_key is None:
81
- raise ExtractionRunFatalError(
81
+ raise ExtractionSnapshotFatalError(
82
82
  "Deepgram speech to text extractor requires a Deepgram API key. "
83
83
  "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
84
84
  "deepgram.api_key."
@@ -107,7 +107,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
107
107
  :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
108
108
  :return: Extracted text payload, or None when the item is not audio.
109
109
  :rtype: ExtractedText or None
110
- :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
110
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
111
111
  """
112
112
  _ = previous_extractions
113
113
  if not item.media_type.startswith("audio/"):
@@ -121,7 +121,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
121
121
 
122
122
  api_key = resolve_deepgram_api_key()
123
123
  if api_key is None:
124
- raise ExtractionRunFatalError(
124
+ raise ExtractionSnapshotFatalError(
125
125
  "Deepgram speech to text extractor requires a Deepgram API key. "
126
126
  "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
127
127
  "deepgram.api_key."
@@ -130,7 +130,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
130
130
  try:
131
131
  from deepgram import DeepgramClient
132
132
  except ImportError as import_error:
133
- raise ExtractionRunFatalError(
133
+ raise ExtractionSnapshotFatalError(
134
134
  "Deepgram speech to text extractor requires an optional dependency. "
135
135
  'Install it with pip install "biblicus[deepgram]".'
136
136
  ) from import_error
biblicus/extractors/docling_granite_text.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -40,14 +40,14 @@ class DoclingGraniteExtractorConfig(BaseModel):
40
40
 
41
41
  :ivar output_format: Output format for extracted content (markdown, text, or html).
42
42
  :vartype output_format: str
43
- :ivar backend: Inference backend (mlx or transformers).
44
- :vartype backend: str
43
+ :ivar retriever: Inference retriever (mlx or transformers).
44
+ :vartype retriever: str
45
45
  """
46
46
 
47
- model_config = ConfigDict(extra="forbid")
47
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
48
48
 
49
49
  output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
50
- backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
50
+ retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
51
51
 
52
52
 
53
53
  class DoclingGraniteExtractor(TextExtractor):
@@ -71,7 +71,7 @@ class DoclingGraniteExtractor(TextExtractor):
71
71
  :type config: dict[str, Any]
72
72
  :return: Parsed config.
73
73
  :rtype: DoclingGraniteExtractorConfig
74
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
74
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
75
75
  """
76
76
  parsed = DoclingGraniteExtractorConfig.model_validate(config)
77
77
 
@@ -82,19 +82,19 @@ class DoclingGraniteExtractor(TextExtractor):
82
82
  vlm_model_specs,
83
83
  )
84
84
  except ImportError as import_error:
85
- raise ExtractionRunFatalError(
85
+ raise ExtractionSnapshotFatalError(
86
86
  "DoclingGranite extractor requires an optional dependency. "
87
87
  'Install it with pip install "biblicus[docling]".'
88
88
  ) from import_error
89
89
 
90
- if parsed.backend == "mlx":
90
+ if parsed.retriever == "mlx":
91
91
  try:
92
92
  from docling.pipeline_options import vlm_model_specs
93
93
 
94
94
  _ = vlm_model_specs.GRANITE_DOCLING_MLX
95
95
  except (ImportError, AttributeError) as exc:
96
- raise ExtractionRunFatalError(
97
- "DoclingGranite extractor with MLX backend requires MLX support. "
96
+ raise ExtractionSnapshotFatalError(
97
+ "DoclingGranite extractor with MLX retriever requires MLX support. "
98
98
  'Install it with pip install "biblicus[docling-mlx]".'
99
99
  ) from exc
100
100
 
@@ -167,7 +167,7 @@ class DoclingGraniteExtractor(TextExtractor):
167
167
  from docling.format_options import InputFormat, PdfFormatOption
168
168
  from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
169
169
 
170
- if config.backend == "mlx":
170
+ if config.retriever == "mlx":
171
171
  vlm_options = vlm_model_specs.GRANITE_DOCLING_MLX
172
172
  else:
173
173
  vlm_options = vlm_model_specs.GRANITE_DOCLING_TRANSFORMERS