biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/extraction.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Text extraction
|
|
2
|
+
Text extraction snapshots for Biblicus.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field
|
|
12
12
|
|
|
13
13
|
from .corpus import Corpus
|
|
14
|
-
from .errors import
|
|
14
|
+
from .errors import ExtractionSnapshotFatalError
|
|
15
15
|
from .extractors import get_extractor
|
|
16
16
|
from .extractors.base import TextExtractor
|
|
17
17
|
from .extractors.pipeline import PipelineExtractorConfig, PipelineStepSpec
|
|
@@ -20,29 +20,29 @@ from .retrieval import hash_text
|
|
|
20
20
|
from .time import utc_now_iso
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
class
|
|
23
|
+
class ExtractionConfigurationManifest(BaseModel):
|
|
24
24
|
"""
|
|
25
|
-
Reproducible configuration for an extraction plugin
|
|
25
|
+
Reproducible configuration for an extraction plugin snapshot.
|
|
26
26
|
|
|
27
|
-
:ivar
|
|
28
|
-
:vartype
|
|
27
|
+
:ivar configuration_id: Deterministic configuration identifier.
|
|
28
|
+
:vartype configuration_id: str
|
|
29
29
|
:ivar extractor_id: Extractor plugin identifier.
|
|
30
30
|
:vartype extractor_id: str
|
|
31
|
-
:ivar name: Human-readable
|
|
31
|
+
:ivar name: Human-readable configuration name.
|
|
32
32
|
:vartype name: str
|
|
33
33
|
:ivar created_at: International Organization for Standardization 8601 timestamp.
|
|
34
34
|
:vartype created_at: str
|
|
35
|
-
:ivar
|
|
36
|
-
:vartype
|
|
35
|
+
:ivar configuration: Extractor-specific configuration values.
|
|
36
|
+
:vartype configuration: dict[str, Any]
|
|
37
37
|
"""
|
|
38
38
|
|
|
39
39
|
model_config = ConfigDict(extra="forbid")
|
|
40
40
|
|
|
41
|
-
|
|
41
|
+
configuration_id: str
|
|
42
42
|
extractor_id: str
|
|
43
43
|
name: str
|
|
44
44
|
created_at: str
|
|
45
|
-
|
|
45
|
+
configuration: Dict[str, Any] = Field(default_factory=dict)
|
|
46
46
|
|
|
47
47
|
|
|
48
48
|
class ExtractionStepResult(BaseModel):
|
|
@@ -87,7 +87,7 @@ class ExtractionStepResult(BaseModel):
|
|
|
87
87
|
|
|
88
88
|
class ExtractionItemResult(BaseModel):
|
|
89
89
|
"""
|
|
90
|
-
Per-item result record for an extraction
|
|
90
|
+
Per-item result record for an extraction snapshot.
|
|
91
91
|
|
|
92
92
|
:ivar item_id: Item identifier.
|
|
93
93
|
:vartype item_id: str
|
|
@@ -125,30 +125,30 @@ class ExtractionItemResult(BaseModel):
|
|
|
125
125
|
step_results: List[ExtractionStepResult] = Field(default_factory=list)
|
|
126
126
|
|
|
127
127
|
|
|
128
|
-
class
|
|
128
|
+
class ExtractionSnapshotManifest(BaseModel):
|
|
129
129
|
"""
|
|
130
|
-
Immutable record describing an extraction
|
|
130
|
+
Immutable record describing an extraction snapshot.
|
|
131
131
|
|
|
132
|
-
:ivar
|
|
133
|
-
:vartype
|
|
134
|
-
:ivar
|
|
135
|
-
:vartype
|
|
132
|
+
:ivar snapshot_id: Unique snapshot identifier.
|
|
133
|
+
:vartype snapshot_id: str
|
|
134
|
+
:ivar configuration: Configuration manifest for this snapshot.
|
|
135
|
+
:vartype configuration: ExtractionConfigurationManifest
|
|
136
136
|
:ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
|
|
137
137
|
:vartype corpus_uri: str
|
|
138
|
-
:ivar catalog_generated_at: Catalog timestamp used for the
|
|
138
|
+
:ivar catalog_generated_at: Catalog timestamp used for the snapshot.
|
|
139
139
|
:vartype catalog_generated_at: str
|
|
140
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
140
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
|
|
141
141
|
:vartype created_at: str
|
|
142
142
|
:ivar items: Per-item results.
|
|
143
143
|
:vartype items: list[ExtractionItemResult]
|
|
144
|
-
:ivar stats:
|
|
144
|
+
:ivar stats: Snapshot statistics.
|
|
145
145
|
:vartype stats: dict[str, Any]
|
|
146
146
|
"""
|
|
147
147
|
|
|
148
148
|
model_config = ConfigDict(extra="forbid")
|
|
149
149
|
|
|
150
|
-
|
|
151
|
-
|
|
150
|
+
snapshot_id: str
|
|
151
|
+
configuration: ExtractionConfigurationManifest
|
|
152
152
|
corpus_uri: str
|
|
153
153
|
catalog_generated_at: str
|
|
154
154
|
created_at: str
|
|
@@ -156,52 +156,53 @@ class ExtractionRunManifest(BaseModel):
|
|
|
156
156
|
stats: Dict[str, Any] = Field(default_factory=dict)
|
|
157
157
|
|
|
158
158
|
|
|
159
|
-
def
|
|
160
|
-
*, extractor_id: str, name: str,
|
|
161
|
-
) ->
|
|
159
|
+
def create_extraction_configuration_manifest(
|
|
160
|
+
*, extractor_id: str, name: str, configuration: Dict[str, Any]
|
|
161
|
+
) -> ExtractionConfigurationManifest:
|
|
162
162
|
"""
|
|
163
|
-
Create a deterministic extraction
|
|
163
|
+
Create a deterministic extraction configuration manifest.
|
|
164
164
|
|
|
165
165
|
:param extractor_id: Extractor plugin identifier.
|
|
166
166
|
:type extractor_id: str
|
|
167
|
-
:param name: Human
|
|
167
|
+
:param name: Human configuration name.
|
|
168
168
|
:type name: str
|
|
169
|
-
:param
|
|
170
|
-
:type
|
|
171
|
-
:return:
|
|
172
|
-
:rtype:
|
|
169
|
+
:param configuration: Extractor configuration.
|
|
170
|
+
:type configuration: dict[str, Any]
|
|
171
|
+
:return: Configuration manifest.
|
|
172
|
+
:rtype: ExtractionConfigurationManifest
|
|
173
173
|
"""
|
|
174
|
-
|
|
175
|
-
{"extractor_id": extractor_id, "name": name, "
|
|
174
|
+
configuration_payload = json.dumps(
|
|
175
|
+
{"extractor_id": extractor_id, "name": name, "configuration": configuration},
|
|
176
|
+
sort_keys=True,
|
|
176
177
|
)
|
|
177
|
-
|
|
178
|
-
return
|
|
179
|
-
|
|
178
|
+
configuration_id = hash_text(configuration_payload)
|
|
179
|
+
return ExtractionConfigurationManifest(
|
|
180
|
+
configuration_id=configuration_id,
|
|
180
181
|
extractor_id=extractor_id,
|
|
181
182
|
name=name,
|
|
182
183
|
created_at=utc_now_iso(),
|
|
183
|
-
|
|
184
|
+
configuration=configuration,
|
|
184
185
|
)
|
|
185
186
|
|
|
186
187
|
|
|
187
|
-
def
|
|
188
|
-
corpus: Corpus, *,
|
|
189
|
-
) ->
|
|
188
|
+
def create_extraction_snapshot_manifest(
|
|
189
|
+
corpus: Corpus, *, configuration: ExtractionConfigurationManifest
|
|
190
|
+
) -> ExtractionSnapshotManifest:
|
|
190
191
|
"""
|
|
191
|
-
Create a new extraction
|
|
192
|
+
Create a new extraction snapshot manifest for a corpus.
|
|
192
193
|
|
|
193
|
-
:param corpus: Corpus associated with the
|
|
194
|
+
:param corpus: Corpus associated with the snapshot.
|
|
194
195
|
:type corpus: Corpus
|
|
195
|
-
:param
|
|
196
|
-
:type
|
|
197
|
-
:return:
|
|
198
|
-
:rtype:
|
|
196
|
+
:param configuration: Configuration manifest.
|
|
197
|
+
:type configuration: ExtractionConfigurationManifest
|
|
198
|
+
:return: Snapshot manifest.
|
|
199
|
+
:rtype: ExtractionSnapshotManifest
|
|
199
200
|
"""
|
|
200
201
|
catalog = corpus.load_catalog()
|
|
201
|
-
|
|
202
|
-
return
|
|
203
|
-
|
|
204
|
-
|
|
202
|
+
snapshot_id = hash_text(f"{configuration.configuration_id}:{catalog.generated_at}")
|
|
203
|
+
return ExtractionSnapshotManifest(
|
|
204
|
+
snapshot_id=snapshot_id,
|
|
205
|
+
configuration=configuration,
|
|
205
206
|
corpus_uri=corpus.uri,
|
|
206
207
|
catalog_generated_at=catalog.generated_at,
|
|
207
208
|
created_at=utc_now_iso(),
|
|
@@ -210,27 +211,29 @@ def create_extraction_run_manifest(
|
|
|
210
211
|
)
|
|
211
212
|
|
|
212
213
|
|
|
213
|
-
def
|
|
214
|
+
def write_extraction_snapshot_manifest(
|
|
215
|
+
*, snapshot_dir: Path, manifest: ExtractionSnapshotManifest
|
|
216
|
+
) -> None:
|
|
214
217
|
"""
|
|
215
|
-
Persist an extraction
|
|
218
|
+
Persist an extraction snapshot manifest to a snapshot directory.
|
|
216
219
|
|
|
217
|
-
:param
|
|
218
|
-
:type
|
|
219
|
-
:param manifest:
|
|
220
|
-
:type manifest:
|
|
220
|
+
:param snapshot_dir: Extraction snapshot directory.
|
|
221
|
+
:type snapshot_dir: Path
|
|
222
|
+
:param manifest: Snapshot manifest to write.
|
|
223
|
+
:type manifest: ExtractionSnapshotManifest
|
|
221
224
|
:return: None.
|
|
222
225
|
:rtype: None
|
|
223
226
|
"""
|
|
224
|
-
manifest_path =
|
|
227
|
+
manifest_path = snapshot_dir / "manifest.json"
|
|
225
228
|
manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
226
229
|
|
|
227
230
|
|
|
228
|
-
def write_extracted_text_artifact(*,
|
|
231
|
+
def write_extracted_text_artifact(*, snapshot_dir: Path, item: CatalogItem, text: str) -> str:
|
|
229
232
|
"""
|
|
230
|
-
Write an extracted text artifact for an item into the
|
|
233
|
+
Write an extracted text artifact for an item into the snapshot directory.
|
|
231
234
|
|
|
232
|
-
:param
|
|
233
|
-
:type
|
|
235
|
+
:param snapshot_dir: Extraction snapshot directory.
|
|
236
|
+
:type snapshot_dir: Path
|
|
234
237
|
:param item: Catalog item being extracted.
|
|
235
238
|
:type item: CatalogItem
|
|
236
239
|
:param text: Extracted text.
|
|
@@ -238,10 +241,10 @@ def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str
|
|
|
238
241
|
:return: Relative path to the stored text artifact.
|
|
239
242
|
:rtype: str
|
|
240
243
|
"""
|
|
241
|
-
text_dir =
|
|
244
|
+
text_dir = snapshot_dir / "text"
|
|
242
245
|
text_dir.mkdir(parents=True, exist_ok=True)
|
|
243
246
|
relpath = str(Path("text") / f"{item.id}.txt")
|
|
244
|
-
path =
|
|
247
|
+
path = snapshot_dir / relpath
|
|
245
248
|
path.write_text(text, encoding="utf-8")
|
|
246
249
|
return relpath
|
|
247
250
|
|
|
@@ -262,7 +265,7 @@ def _pipeline_step_dir_name(*, step_index: int, extractor_id: str) -> str:
|
|
|
262
265
|
|
|
263
266
|
def write_pipeline_step_text_artifact(
|
|
264
267
|
*,
|
|
265
|
-
|
|
268
|
+
snapshot_dir: Path,
|
|
266
269
|
step_index: int,
|
|
267
270
|
extractor_id: str,
|
|
268
271
|
item: CatalogItem,
|
|
@@ -271,8 +274,8 @@ def write_pipeline_step_text_artifact(
|
|
|
271
274
|
"""
|
|
272
275
|
Write a pipeline step text artifact for an item.
|
|
273
276
|
|
|
274
|
-
:param
|
|
275
|
-
:type
|
|
277
|
+
:param snapshot_dir: Extraction snapshot directory.
|
|
278
|
+
:type snapshot_dir: Path
|
|
276
279
|
:param step_index: One-based pipeline step index.
|
|
277
280
|
:type step_index: int
|
|
278
281
|
:param extractor_id: Extractor identifier for the step.
|
|
@@ -285,10 +288,10 @@ def write_pipeline_step_text_artifact(
|
|
|
285
288
|
:rtype: str
|
|
286
289
|
"""
|
|
287
290
|
step_dir_name = _pipeline_step_dir_name(step_index=step_index, extractor_id=extractor_id)
|
|
288
|
-
text_dir =
|
|
291
|
+
text_dir = snapshot_dir / "steps" / step_dir_name / "text"
|
|
289
292
|
text_dir.mkdir(parents=True, exist_ok=True)
|
|
290
293
|
relpath = str(Path("steps") / step_dir_name / "text" / f"{item.id}.txt")
|
|
291
|
-
(
|
|
294
|
+
(snapshot_dir / relpath).write_text(text, encoding="utf-8")
|
|
292
295
|
return relpath
|
|
293
296
|
|
|
294
297
|
|
|
@@ -310,49 +313,51 @@ def _final_output_from_steps(
|
|
|
310
313
|
return step_outputs[-1]
|
|
311
314
|
|
|
312
315
|
|
|
313
|
-
def
|
|
316
|
+
def build_extraction_snapshot(
|
|
314
317
|
corpus: Corpus,
|
|
315
318
|
*,
|
|
316
319
|
extractor_id: str,
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
) ->
|
|
320
|
+
configuration_name: str,
|
|
321
|
+
configuration: Dict[str, Any],
|
|
322
|
+
) -> ExtractionSnapshotManifest:
|
|
320
323
|
"""
|
|
321
|
-
Build an extraction
|
|
324
|
+
Build an extraction snapshot for a corpus using the pipeline extractor.
|
|
322
325
|
|
|
323
326
|
:param corpus: Corpus to extract from.
|
|
324
327
|
:type corpus: Corpus
|
|
325
328
|
:param extractor_id: Extractor plugin identifier (must be ``pipeline``).
|
|
326
329
|
:type extractor_id: str
|
|
327
|
-
:param
|
|
328
|
-
:type
|
|
329
|
-
:param
|
|
330
|
-
:type
|
|
331
|
-
:return: Extraction
|
|
332
|
-
:rtype:
|
|
330
|
+
:param configuration_name: Human-readable configuration name.
|
|
331
|
+
:type configuration_name: str
|
|
332
|
+
:param configuration: Extractor configuration mapping.
|
|
333
|
+
:type configuration: dict[str, Any]
|
|
334
|
+
:return: Extraction snapshot manifest describing the build.
|
|
335
|
+
:rtype: ExtractionSnapshotManifest
|
|
333
336
|
:raises KeyError: If the extractor identifier is unknown.
|
|
334
337
|
:raises ValueError: If the extractor configuration is invalid.
|
|
335
|
-
:raises OSError: If the
|
|
336
|
-
:raises
|
|
338
|
+
:raises OSError: If the snapshot directory or artifacts cannot be written.
|
|
339
|
+
:raises ExtractionSnapshotFatalError: If the extractor is not the pipeline.
|
|
337
340
|
"""
|
|
338
341
|
extractor = get_extractor(extractor_id)
|
|
339
|
-
parsed_config = extractor.validate_config(
|
|
340
|
-
|
|
342
|
+
parsed_config = extractor.validate_config(configuration)
|
|
343
|
+
config_manifest = create_extraction_configuration_manifest(
|
|
341
344
|
extractor_id=extractor_id,
|
|
342
|
-
name=
|
|
343
|
-
|
|
345
|
+
name=configuration_name,
|
|
346
|
+
configuration=parsed_config.model_dump(),
|
|
344
347
|
)
|
|
345
|
-
manifest =
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
348
|
+
manifest = create_extraction_snapshot_manifest(corpus, configuration=config_manifest)
|
|
349
|
+
snapshot_dir = corpus.extraction_snapshot_dir(
|
|
350
|
+
extractor_id=extractor_id, snapshot_id=manifest.snapshot_id
|
|
351
|
+
)
|
|
352
|
+
if snapshot_dir.exists():
|
|
353
|
+
return corpus.load_extraction_snapshot_manifest(
|
|
354
|
+
extractor_id=extractor_id, snapshot_id=manifest.snapshot_id
|
|
350
355
|
)
|
|
351
|
-
|
|
356
|
+
snapshot_dir.mkdir(parents=True, exist_ok=False)
|
|
352
357
|
|
|
353
358
|
catalog = corpus.load_catalog()
|
|
354
359
|
if extractor_id != "pipeline":
|
|
355
|
-
raise
|
|
360
|
+
raise ExtractionSnapshotFatalError("Extraction snapshots must use the pipeline extractor")
|
|
356
361
|
|
|
357
362
|
pipeline_config = (
|
|
358
363
|
parsed_config
|
|
@@ -363,7 +368,7 @@ def build_extraction_run(
|
|
|
363
368
|
validated_steps: List[Tuple[PipelineStepSpec, TextExtractor, BaseModel]] = []
|
|
364
369
|
for step in pipeline_config.steps:
|
|
365
370
|
step_extractor = get_extractor(step.extractor_id)
|
|
366
|
-
parsed_step_config = step_extractor.validate_config(step.
|
|
371
|
+
parsed_step_config = step_extractor.validate_config(step.configuration)
|
|
367
372
|
validated_steps.append((step, step_extractor, parsed_step_config))
|
|
368
373
|
|
|
369
374
|
extracted_items: List[ExtractionItemResult] = []
|
|
@@ -400,7 +405,7 @@ def build_extraction_run(
|
|
|
400
405
|
previous_extractions=step_outputs,
|
|
401
406
|
)
|
|
402
407
|
except Exception as extraction_error:
|
|
403
|
-
if isinstance(extraction_error,
|
|
408
|
+
if isinstance(extraction_error, ExtractionSnapshotFatalError):
|
|
404
409
|
raise
|
|
405
410
|
last_error_type = extraction_error.__class__.__name__
|
|
406
411
|
last_error_message = str(extraction_error)
|
|
@@ -436,7 +441,7 @@ def build_extraction_run(
|
|
|
436
441
|
continue
|
|
437
442
|
|
|
438
443
|
relpath = write_pipeline_step_text_artifact(
|
|
439
|
-
|
|
444
|
+
snapshot_dir=snapshot_dir,
|
|
440
445
|
step_index=step_index,
|
|
441
446
|
extractor_id=step.extractor_id,
|
|
442
447
|
item=item,
|
|
@@ -497,7 +502,7 @@ def build_extraction_run(
|
|
|
497
502
|
|
|
498
503
|
final_text = final_output.text or ""
|
|
499
504
|
final_text_relpath = write_extracted_text_artifact(
|
|
500
|
-
|
|
505
|
+
snapshot_dir=snapshot_dir, item=item, text=final_text
|
|
501
506
|
)
|
|
502
507
|
extracted_count += 1
|
|
503
508
|
if final_text.strip():
|
|
@@ -534,5 +539,5 @@ def build_extraction_run(
|
|
|
534
539
|
"converted_items": converted_item_count,
|
|
535
540
|
}
|
|
536
541
|
manifest = manifest.model_copy(update={"items": extracted_items, "stats": stats})
|
|
537
|
-
|
|
542
|
+
write_extraction_snapshot_manifest(snapshot_dir=snapshot_dir, manifest=manifest)
|
|
538
543
|
return manifest
|
|
@@ -13,7 +13,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
|
13
13
|
|
|
14
14
|
from .constants import EXTRACTION_DATASET_SCHEMA_VERSION
|
|
15
15
|
from .corpus import Corpus
|
|
16
|
-
from .extraction import
|
|
16
|
+
from .extraction import ExtractionSnapshotManifest
|
|
17
17
|
from .models import CatalogItem
|
|
18
18
|
from .time import utc_now_iso
|
|
19
19
|
|
|
@@ -118,12 +118,12 @@ class ExtractionEvaluationResult(BaseModel):
|
|
|
118
118
|
:vartype dataset: dict[str, object]
|
|
119
119
|
:ivar extractor_id: Extractor identifier.
|
|
120
120
|
:vartype extractor_id: str
|
|
121
|
-
:ivar
|
|
122
|
-
:vartype
|
|
123
|
-
:ivar
|
|
124
|
-
:vartype
|
|
125
|
-
:ivar
|
|
126
|
-
:vartype
|
|
121
|
+
:ivar snapshot_id: Extraction snapshot identifier.
|
|
122
|
+
:vartype snapshot_id: str
|
|
123
|
+
:ivar configuration_id: Extraction configuration identifier.
|
|
124
|
+
:vartype configuration_id: str
|
|
125
|
+
:ivar configuration_name: Extraction configuration name.
|
|
126
|
+
:vartype configuration_name: str
|
|
127
127
|
:ivar evaluated_at: International Organization for Standardization 8601 timestamp.
|
|
128
128
|
:vartype evaluated_at: str
|
|
129
129
|
:ivar metrics: Evaluation metrics for coverage and accuracy.
|
|
@@ -136,9 +136,9 @@ class ExtractionEvaluationResult(BaseModel):
|
|
|
136
136
|
|
|
137
137
|
dataset: Dict[str, object]
|
|
138
138
|
extractor_id: str
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
139
|
+
snapshot_id: str
|
|
140
|
+
configuration_id: str
|
|
141
|
+
configuration_name: str
|
|
142
142
|
evaluated_at: str
|
|
143
143
|
metrics: Dict[str, float]
|
|
144
144
|
items: List[ExtractionEvaluationItemReport]
|
|
@@ -160,21 +160,21 @@ def load_extraction_dataset(path: Path) -> ExtractionEvaluationDataset:
|
|
|
160
160
|
return ExtractionEvaluationDataset.model_validate(data)
|
|
161
161
|
|
|
162
162
|
|
|
163
|
-
def
|
|
163
|
+
def evaluate_extraction_snapshot(
|
|
164
164
|
*,
|
|
165
165
|
corpus: Corpus,
|
|
166
|
-
|
|
166
|
+
snapshot: ExtractionSnapshotManifest,
|
|
167
167
|
extractor_id: str,
|
|
168
168
|
dataset: ExtractionEvaluationDataset,
|
|
169
169
|
) -> ExtractionEvaluationResult:
|
|
170
170
|
"""
|
|
171
|
-
Evaluate an extraction
|
|
171
|
+
Evaluate an extraction snapshot against a dataset.
|
|
172
172
|
|
|
173
|
-
:param corpus: Corpus associated with the
|
|
173
|
+
:param corpus: Corpus associated with the snapshot.
|
|
174
174
|
:type corpus: Corpus
|
|
175
|
-
:param
|
|
176
|
-
:type
|
|
177
|
-
:param extractor_id: Extractor identifier for the
|
|
175
|
+
:param snapshot: Extraction snapshot manifest.
|
|
176
|
+
:type snapshot: ExtractionSnapshotManifest
|
|
177
|
+
:param extractor_id: Extractor identifier for the snapshot.
|
|
178
178
|
:type extractor_id: str
|
|
179
179
|
:param dataset: Extraction evaluation dataset.
|
|
180
180
|
:type dataset: ExtractionEvaluationDataset
|
|
@@ -182,7 +182,7 @@ def evaluate_extraction_run(
|
|
|
182
182
|
:rtype: ExtractionEvaluationResult
|
|
183
183
|
"""
|
|
184
184
|
catalog = corpus.load_catalog()
|
|
185
|
-
item_index = {item.item_id: item for item in
|
|
185
|
+
item_index = {item.item_id: item for item in snapshot.items}
|
|
186
186
|
coverage_present = 0
|
|
187
187
|
coverage_empty = 0
|
|
188
188
|
coverage_missing = 0
|
|
@@ -201,7 +201,7 @@ def evaluate_extraction_run(
|
|
|
201
201
|
processable += 1
|
|
202
202
|
|
|
203
203
|
extracted_text = corpus.read_extracted_text(
|
|
204
|
-
extractor_id=extractor_id,
|
|
204
|
+
extractor_id=extractor_id, snapshot_id=snapshot.snapshot_id, item_id=item_id
|
|
205
205
|
)
|
|
206
206
|
coverage_status = _coverage_status(extracted_text)
|
|
207
207
|
if coverage_status == "present":
|
|
@@ -245,9 +245,9 @@ def evaluate_extraction_run(
|
|
|
245
245
|
return ExtractionEvaluationResult(
|
|
246
246
|
dataset=dataset_meta,
|
|
247
247
|
extractor_id=extractor_id,
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
248
|
+
snapshot_id=snapshot.snapshot_id,
|
|
249
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
250
|
+
configuration_name=snapshot.configuration.name,
|
|
251
251
|
evaluated_at=utc_now_iso(),
|
|
252
252
|
metrics=metrics,
|
|
253
253
|
items=item_reports,
|
|
@@ -255,21 +255,21 @@ def evaluate_extraction_run(
|
|
|
255
255
|
|
|
256
256
|
|
|
257
257
|
def write_extraction_evaluation_result(
|
|
258
|
-
*, corpus: Corpus,
|
|
258
|
+
*, corpus: Corpus, snapshot_id: str, result: ExtractionEvaluationResult
|
|
259
259
|
) -> Path:
|
|
260
260
|
"""
|
|
261
261
|
Persist extraction evaluation output under the corpus.
|
|
262
262
|
|
|
263
263
|
:param corpus: Corpus associated with the evaluation.
|
|
264
264
|
:type corpus: Corpus
|
|
265
|
-
:param
|
|
266
|
-
:type
|
|
265
|
+
:param snapshot_id: Extraction snapshot identifier.
|
|
266
|
+
:type snapshot_id: str
|
|
267
267
|
:param result: Evaluation result to write.
|
|
268
268
|
:type result: ExtractionEvaluationResult
|
|
269
269
|
:return: Output path.
|
|
270
270
|
:rtype: Path
|
|
271
271
|
"""
|
|
272
|
-
output_dir = corpus.
|
|
272
|
+
output_dir = corpus.snapshots_dir / "evaluation" / "extraction" / snapshot_id
|
|
273
273
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
274
274
|
output_path = output_dir / "output.json"
|
|
275
275
|
output_path.write_text(result.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
|
-
from ..errors import
|
|
14
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
15
15
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
16
|
from ..user_config import resolve_deepgram_api_key
|
|
17
17
|
from .base import TextExtractor
|
|
@@ -66,19 +66,19 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
66
66
|
:type config: dict[str, Any]
|
|
67
67
|
:return: Parsed configuration model.
|
|
68
68
|
:rtype: DeepgramSpeechToTextExtractorConfig
|
|
69
|
-
:raises
|
|
69
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
|
|
70
70
|
"""
|
|
71
71
|
try:
|
|
72
72
|
from deepgram import DeepgramClient # noqa: F401
|
|
73
73
|
except ImportError as import_error:
|
|
74
|
-
raise
|
|
74
|
+
raise ExtractionSnapshotFatalError(
|
|
75
75
|
"Deepgram speech to text extractor requires an optional dependency. "
|
|
76
76
|
'Install it with pip install "biblicus[deepgram]".'
|
|
77
77
|
) from import_error
|
|
78
78
|
|
|
79
79
|
api_key = resolve_deepgram_api_key()
|
|
80
80
|
if api_key is None:
|
|
81
|
-
raise
|
|
81
|
+
raise ExtractionSnapshotFatalError(
|
|
82
82
|
"Deepgram speech to text extractor requires a Deepgram API key. "
|
|
83
83
|
"Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
84
84
|
"deepgram.api_key."
|
|
@@ -107,7 +107,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
107
107
|
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
108
108
|
:return: Extracted text payload, or None when the item is not audio.
|
|
109
109
|
:rtype: ExtractedText or None
|
|
110
|
-
:raises
|
|
110
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
|
|
111
111
|
"""
|
|
112
112
|
_ = previous_extractions
|
|
113
113
|
if not item.media_type.startswith("audio/"):
|
|
@@ -121,7 +121,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
121
121
|
|
|
122
122
|
api_key = resolve_deepgram_api_key()
|
|
123
123
|
if api_key is None:
|
|
124
|
-
raise
|
|
124
|
+
raise ExtractionSnapshotFatalError(
|
|
125
125
|
"Deepgram speech to text extractor requires a Deepgram API key. "
|
|
126
126
|
"Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
127
127
|
"deepgram.api_key."
|
|
@@ -130,7 +130,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
130
130
|
try:
|
|
131
131
|
from deepgram import DeepgramClient
|
|
132
132
|
except ImportError as import_error:
|
|
133
|
-
raise
|
|
133
|
+
raise ExtractionSnapshotFatalError(
|
|
134
134
|
"Deepgram speech to text extractor requires an optional dependency. "
|
|
135
135
|
'Install it with pip install "biblicus[deepgram]".'
|
|
136
136
|
) from import_error
|
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -40,14 +40,14 @@ class DoclingGraniteExtractorConfig(BaseModel):
|
|
|
40
40
|
|
|
41
41
|
:ivar output_format: Output format for extracted content (markdown, text, or html).
|
|
42
42
|
:vartype output_format: str
|
|
43
|
-
:ivar
|
|
44
|
-
:vartype
|
|
43
|
+
:ivar retriever: Inference retriever (mlx or transformers).
|
|
44
|
+
:vartype retriever: str
|
|
45
45
|
"""
|
|
46
46
|
|
|
47
|
-
model_config = ConfigDict(extra="forbid")
|
|
47
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
48
48
|
|
|
49
49
|
output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
|
|
50
|
-
|
|
50
|
+
retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
class DoclingGraniteExtractor(TextExtractor):
|
|
@@ -71,7 +71,7 @@ class DoclingGraniteExtractor(TextExtractor):
|
|
|
71
71
|
:type config: dict[str, Any]
|
|
72
72
|
:return: Parsed config.
|
|
73
73
|
:rtype: DoclingGraniteExtractorConfig
|
|
74
|
-
:raises
|
|
74
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
75
75
|
"""
|
|
76
76
|
parsed = DoclingGraniteExtractorConfig.model_validate(config)
|
|
77
77
|
|
|
@@ -82,19 +82,19 @@ class DoclingGraniteExtractor(TextExtractor):
|
|
|
82
82
|
vlm_model_specs,
|
|
83
83
|
)
|
|
84
84
|
except ImportError as import_error:
|
|
85
|
-
raise
|
|
85
|
+
raise ExtractionSnapshotFatalError(
|
|
86
86
|
"DoclingGranite extractor requires an optional dependency. "
|
|
87
87
|
'Install it with pip install "biblicus[docling]".'
|
|
88
88
|
) from import_error
|
|
89
89
|
|
|
90
|
-
if parsed.
|
|
90
|
+
if parsed.retriever == "mlx":
|
|
91
91
|
try:
|
|
92
92
|
from docling.pipeline_options import vlm_model_specs
|
|
93
93
|
|
|
94
94
|
_ = vlm_model_specs.GRANITE_DOCLING_MLX
|
|
95
95
|
except (ImportError, AttributeError) as exc:
|
|
96
|
-
raise
|
|
97
|
-
"DoclingGranite extractor with MLX
|
|
96
|
+
raise ExtractionSnapshotFatalError(
|
|
97
|
+
"DoclingGranite extractor with MLX retriever requires MLX support. "
|
|
98
98
|
'Install it with pip install "biblicus[docling-mlx]".'
|
|
99
99
|
) from exc
|
|
100
100
|
|
|
@@ -167,7 +167,7 @@ class DoclingGraniteExtractor(TextExtractor):
|
|
|
167
167
|
from docling.format_options import InputFormat, PdfFormatOption
|
|
168
168
|
from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
|
|
169
169
|
|
|
170
|
-
if config.
|
|
170
|
+
if config.retriever == "mlx":
|
|
171
171
|
vlm_options = vlm_model_specs.GRANITE_DOCLING_MLX
|
|
172
172
|
else:
|
|
173
173
|
vlm_options = vlm_model_specs.GRANITE_DOCLING_TRANSFORMERS
|