biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +33 -49
- biblicus/corpus.py +39 -58
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +276 -77
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +87 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
|
@@ -13,6 +13,7 @@ from pathlib import Path
|
|
|
13
13
|
from typing import Any, Dict, List, Optional, Sequence
|
|
14
14
|
|
|
15
15
|
import yaml
|
|
16
|
+
from pydantic import ValidationError
|
|
16
17
|
|
|
17
18
|
from .constants import (
|
|
18
19
|
CORPUS_DIR_NAME,
|
|
@@ -23,15 +24,13 @@ from .constants import (
|
|
|
23
24
|
SIDECAR_SUFFIX,
|
|
24
25
|
)
|
|
25
26
|
from .frontmatter import parse_front_matter, render_front_matter
|
|
26
|
-
from pydantic import ValidationError
|
|
27
|
-
|
|
28
27
|
from .hook_manager import HookManager
|
|
29
28
|
from .hooks import HookPoint
|
|
30
29
|
from .ignore import load_corpus_ignore_spec
|
|
31
30
|
from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
|
|
32
31
|
from .sources import load_source
|
|
33
32
|
from .time import utc_now_iso
|
|
34
|
-
from .uris import
|
|
33
|
+
from .uris import corpus_ref_to_path, normalize_corpus_uri
|
|
35
34
|
|
|
36
35
|
|
|
37
36
|
def _sha256_bytes(data: bytes) -> str:
|
|
@@ -43,11 +42,12 @@ def _sha256_bytes(data: bytes) -> str:
|
|
|
43
42
|
:return: Secure Hash Algorithm 256 hex digest.
|
|
44
43
|
:rtype: str
|
|
45
44
|
"""
|
|
46
|
-
|
|
47
45
|
return hashlib.sha256(data).hexdigest()
|
|
48
46
|
|
|
49
47
|
|
|
50
|
-
def _write_stream_and_hash(
|
|
48
|
+
def _write_stream_and_hash(
|
|
49
|
+
stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
|
|
50
|
+
) -> Dict[str, object]:
|
|
51
51
|
"""
|
|
52
52
|
Write a binary stream to disk while computing a digest.
|
|
53
53
|
|
|
@@ -61,7 +61,6 @@ def _write_stream_and_hash(stream, destination_path: Path, *, chunk_size: int =
|
|
|
61
61
|
:rtype: dict[str, object]
|
|
62
62
|
:raises OSError: If the destination cannot be written.
|
|
63
63
|
"""
|
|
64
|
-
|
|
65
64
|
hasher = hashlib.sha256()
|
|
66
65
|
bytes_written = 0
|
|
67
66
|
with destination_path.open("wb") as destination_handle:
|
|
@@ -84,7 +83,6 @@ def _sanitize_filename(name: str) -> str:
|
|
|
84
83
|
:return: Sanitized filename.
|
|
85
84
|
:rtype: str
|
|
86
85
|
"""
|
|
87
|
-
|
|
88
86
|
allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
|
|
89
87
|
sanitized_name = "".join(
|
|
90
88
|
(character if character in allowed_characters else "_") for character in name
|
|
@@ -101,9 +99,9 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
|
|
|
101
99
|
:return: Preferred extension or None.
|
|
102
100
|
:rtype: str or None
|
|
103
101
|
"""
|
|
104
|
-
|
|
105
102
|
media_type_overrides = {
|
|
106
103
|
"image/jpeg": ".jpg",
|
|
104
|
+
"audio/ogg": ".ogg",
|
|
107
105
|
}
|
|
108
106
|
if media_type in media_type_overrides:
|
|
109
107
|
return media_type_overrides[media_type]
|
|
@@ -121,7 +119,6 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
121
119
|
:return: Filename with a compatible extension.
|
|
122
120
|
:rtype: str
|
|
123
121
|
"""
|
|
124
|
-
|
|
125
122
|
raw_name = filename.strip()
|
|
126
123
|
|
|
127
124
|
if media_type == "text/markdown":
|
|
@@ -129,11 +126,12 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
129
126
|
return raw_name
|
|
130
127
|
return raw_name + ".md"
|
|
131
128
|
|
|
129
|
+
if Path(raw_name).suffix:
|
|
130
|
+
return raw_name
|
|
131
|
+
|
|
132
132
|
ext = _preferred_extension_for_media_type(media_type)
|
|
133
133
|
if not ext:
|
|
134
134
|
return raw_name
|
|
135
|
-
if raw_name.lower().endswith(ext.lower()):
|
|
136
|
-
return raw_name
|
|
137
135
|
return raw_name + ext
|
|
138
136
|
|
|
139
137
|
|
|
@@ -148,7 +146,6 @@ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
|
|
|
148
146
|
:return: Deduplicated tag list preserving order.
|
|
149
147
|
:rtype: list[str]
|
|
150
148
|
"""
|
|
151
|
-
|
|
152
149
|
merged_tags: List[str] = []
|
|
153
150
|
|
|
154
151
|
for explicit_tag in explicit:
|
|
@@ -181,7 +178,6 @@ def _sidecar_path_for(content_path: Path) -> Path:
|
|
|
181
178
|
:return: Sidecar path.
|
|
182
179
|
:rtype: Path
|
|
183
180
|
"""
|
|
184
|
-
|
|
185
181
|
return content_path.with_name(content_path.name + SIDECAR_SUFFIX)
|
|
186
182
|
|
|
187
183
|
|
|
@@ -195,7 +191,6 @@ def _load_sidecar(content_path: Path) -> Dict[str, Any]:
|
|
|
195
191
|
:rtype: dict[str, Any]
|
|
196
192
|
:raises ValueError: If the sidecar content is not a mapping.
|
|
197
193
|
"""
|
|
198
|
-
|
|
199
194
|
path = _sidecar_path_for(content_path)
|
|
200
195
|
if not path.is_file():
|
|
201
196
|
return {}
|
|
@@ -226,7 +221,9 @@ def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
|
|
|
226
221
|
path.write_text(text + "\n", encoding="utf-8")
|
|
227
222
|
|
|
228
223
|
|
|
229
|
-
def _ensure_biblicus_block(
|
|
224
|
+
def _ensure_biblicus_block(
|
|
225
|
+
metadata: Dict[str, Any], *, item_id: str, source_uri: str
|
|
226
|
+
) -> Dict[str, Any]:
|
|
230
227
|
"""
|
|
231
228
|
Ensure the biblicus metadata block exists and is populated.
|
|
232
229
|
|
|
@@ -324,7 +321,6 @@ class Corpus:
|
|
|
324
321
|
:param root: Corpus root directory.
|
|
325
322
|
:type root: Path
|
|
326
323
|
"""
|
|
327
|
-
|
|
328
324
|
self.root = root
|
|
329
325
|
self.meta_dir = self.root / CORPUS_DIR_NAME
|
|
330
326
|
self.raw_dir = self.root / DEFAULT_RAW_DIR
|
|
@@ -339,7 +335,6 @@ class Corpus:
|
|
|
339
335
|
:return: Corpus uniform resource identifier.
|
|
340
336
|
:rtype: str
|
|
341
337
|
"""
|
|
342
|
-
|
|
343
338
|
return self.root.as_uri()
|
|
344
339
|
|
|
345
340
|
def _load_config(self) -> Optional[CorpusConfig]:
|
|
@@ -350,7 +345,6 @@ class Corpus:
|
|
|
350
345
|
:rtype: CorpusConfig or None
|
|
351
346
|
:raises ValueError: If the config schema is invalid.
|
|
352
347
|
"""
|
|
353
|
-
|
|
354
348
|
path = self.meta_dir / "config.json"
|
|
355
349
|
if not path.is_file():
|
|
356
350
|
return None
|
|
@@ -359,7 +353,9 @@ class Corpus:
|
|
|
359
353
|
return CorpusConfig.model_validate(data)
|
|
360
354
|
except ValidationError as exc:
|
|
361
355
|
has_hook_error = any(
|
|
362
|
-
isinstance(error.get("loc"), tuple)
|
|
356
|
+
isinstance(error.get("loc"), tuple)
|
|
357
|
+
and error.get("loc")
|
|
358
|
+
and error.get("loc")[0] == "hooks"
|
|
363
359
|
for error in exc.errors()
|
|
364
360
|
)
|
|
365
361
|
if has_hook_error:
|
|
@@ -374,7 +370,6 @@ class Corpus:
|
|
|
374
370
|
:rtype: HookManager or None
|
|
375
371
|
:raises ValueError: If hook specifications are invalid.
|
|
376
372
|
"""
|
|
377
|
-
|
|
378
373
|
if self.config is None or not self.config.hooks:
|
|
379
374
|
return None
|
|
380
375
|
return HookManager.from_config(
|
|
@@ -394,7 +389,6 @@ class Corpus:
|
|
|
394
389
|
:rtype: Corpus
|
|
395
390
|
:raises FileNotFoundError: If no corpus config is found.
|
|
396
391
|
"""
|
|
397
|
-
|
|
398
392
|
start = start.resolve()
|
|
399
393
|
for candidate in [start, *start.parents]:
|
|
400
394
|
if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
|
|
@@ -413,7 +407,6 @@ class Corpus:
|
|
|
413
407
|
:return: Opened corpus instance.
|
|
414
408
|
:rtype: Corpus
|
|
415
409
|
"""
|
|
416
|
-
|
|
417
410
|
return cls.find(corpus_ref_to_path(ref))
|
|
418
411
|
|
|
419
412
|
@classmethod
|
|
@@ -429,7 +422,6 @@ class Corpus:
|
|
|
429
422
|
:rtype: Corpus
|
|
430
423
|
:raises FileExistsError: If the corpus already exists and force is False.
|
|
431
424
|
"""
|
|
432
|
-
|
|
433
425
|
root = root.resolve()
|
|
434
426
|
corpus = cls(root)
|
|
435
427
|
|
|
@@ -459,7 +451,6 @@ class Corpus:
|
|
|
459
451
|
:return: Catalog file path.
|
|
460
452
|
:rtype: Path
|
|
461
453
|
"""
|
|
462
|
-
|
|
463
454
|
return self.meta_dir / "catalog.json"
|
|
464
455
|
|
|
465
456
|
def _init_catalog(self) -> None:
|
|
@@ -469,7 +460,6 @@ class Corpus:
|
|
|
469
460
|
:return: None.
|
|
470
461
|
:rtype: None
|
|
471
462
|
"""
|
|
472
|
-
|
|
473
463
|
if self.catalog_path.exists():
|
|
474
464
|
return
|
|
475
465
|
catalog = CorpusCatalog(
|
|
@@ -492,7 +482,6 @@ class Corpus:
|
|
|
492
482
|
:raises FileNotFoundError: If the catalog file does not exist.
|
|
493
483
|
:raises ValueError: If the catalog schema is invalid.
|
|
494
484
|
"""
|
|
495
|
-
|
|
496
485
|
if not self.catalog_path.is_file():
|
|
497
486
|
raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
|
|
498
487
|
catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
|
|
@@ -507,7 +496,6 @@ class Corpus:
|
|
|
507
496
|
:raises FileNotFoundError: If the catalog file does not exist.
|
|
508
497
|
:raises ValueError: If the catalog schema is invalid.
|
|
509
498
|
"""
|
|
510
|
-
|
|
511
499
|
return self._load_catalog()
|
|
512
500
|
|
|
513
501
|
def _write_catalog(self, catalog: CorpusCatalog) -> None:
|
|
@@ -519,7 +507,6 @@ class Corpus:
|
|
|
519
507
|
:return: None.
|
|
520
508
|
:rtype: None
|
|
521
509
|
"""
|
|
522
|
-
|
|
523
510
|
temp_path = self.catalog_path.with_suffix(".json.tmp")
|
|
524
511
|
temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
525
512
|
temp_path.replace(self.catalog_path)
|
|
@@ -532,7 +519,6 @@ class Corpus:
|
|
|
532
519
|
:return: Path to the runs directory.
|
|
533
520
|
:rtype: Path
|
|
534
521
|
"""
|
|
535
|
-
|
|
536
522
|
return self.meta_dir / RUNS_DIR_NAME
|
|
537
523
|
|
|
538
524
|
@property
|
|
@@ -543,7 +529,6 @@ class Corpus:
|
|
|
543
529
|
:return: Path to the extraction runs directory.
|
|
544
530
|
:rtype: Path
|
|
545
531
|
"""
|
|
546
|
-
|
|
547
532
|
return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
|
|
548
533
|
|
|
549
534
|
def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
|
|
@@ -557,7 +542,6 @@ class Corpus:
|
|
|
557
542
|
:return: Extraction run directory.
|
|
558
543
|
:rtype: Path
|
|
559
544
|
"""
|
|
560
|
-
|
|
561
545
|
return self.extraction_runs_dir / extractor_id / run_id
|
|
562
546
|
|
|
563
547
|
def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
|
|
@@ -574,8 +558,11 @@ class Corpus:
|
|
|
574
558
|
:rtype: str or None
|
|
575
559
|
:raises OSError: If the file exists but cannot be read.
|
|
576
560
|
"""
|
|
577
|
-
|
|
578
|
-
|
|
561
|
+
path = (
|
|
562
|
+
self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
|
|
563
|
+
/ "text"
|
|
564
|
+
/ f"{item_id}.txt"
|
|
565
|
+
)
|
|
579
566
|
if not path.is_file():
|
|
580
567
|
return None
|
|
581
568
|
return path.read_text(encoding="utf-8")
|
|
@@ -587,7 +574,6 @@ class Corpus:
|
|
|
587
574
|
:return: None.
|
|
588
575
|
:rtype: None
|
|
589
576
|
"""
|
|
590
|
-
|
|
591
577
|
self.runs_dir.mkdir(parents=True, exist_ok=True)
|
|
592
578
|
|
|
593
579
|
def write_run(self, run: RetrievalRun) -> None:
|
|
@@ -599,7 +585,6 @@ class Corpus:
|
|
|
599
585
|
:return: None.
|
|
600
586
|
:rtype: None
|
|
601
587
|
"""
|
|
602
|
-
|
|
603
588
|
self._ensure_runs_dir()
|
|
604
589
|
path = self.runs_dir / f"{run.run_id}.json"
|
|
605
590
|
path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
@@ -618,7 +603,6 @@ class Corpus:
|
|
|
618
603
|
:rtype: RetrievalRun
|
|
619
604
|
:raises FileNotFoundError: If the run manifest does not exist.
|
|
620
605
|
"""
|
|
621
|
-
|
|
622
606
|
path = self.runs_dir / f"{run_id}.json"
|
|
623
607
|
if not path.is_file():
|
|
624
608
|
raise FileNotFoundError(f"Missing run manifest: {path}")
|
|
@@ -633,7 +617,6 @@ class Corpus:
|
|
|
633
617
|
:return: Latest run identifier or None.
|
|
634
618
|
:rtype: str or None
|
|
635
619
|
"""
|
|
636
|
-
|
|
637
620
|
return self._load_catalog().latest_run_id
|
|
638
621
|
|
|
639
622
|
def _upsert_catalog_item(self, item: CatalogItem) -> None:
|
|
@@ -645,7 +628,6 @@ class Corpus:
|
|
|
645
628
|
:return: None.
|
|
646
629
|
:rtype: None
|
|
647
630
|
"""
|
|
648
|
-
|
|
649
631
|
self._init_catalog()
|
|
650
632
|
catalog = self._load_catalog()
|
|
651
633
|
catalog.items[item.id] = item
|
|
@@ -693,7 +675,6 @@ class Corpus:
|
|
|
693
675
|
:rtype: IngestResult
|
|
694
676
|
:raises ValueError: If markdown is not Unicode Transformation Format 8.
|
|
695
677
|
"""
|
|
696
|
-
|
|
697
678
|
item_id = str(uuid.uuid4())
|
|
698
679
|
safe_filename = _sanitize_filename(filename) if filename else ""
|
|
699
680
|
|
|
@@ -741,7 +722,9 @@ class Corpus:
|
|
|
741
722
|
try:
|
|
742
723
|
markdown_text = data.decode("utf-8")
|
|
743
724
|
except UnicodeDecodeError as decode_error:
|
|
744
|
-
raise ValueError(
|
|
725
|
+
raise ValueError(
|
|
726
|
+
"Markdown must be Unicode Transformation Format 8"
|
|
727
|
+
) from decode_error
|
|
745
728
|
|
|
746
729
|
parsed_document = parse_front_matter(markdown_text)
|
|
747
730
|
frontmatter = dict(parsed_document.metadata)
|
|
@@ -760,7 +743,9 @@ class Corpus:
|
|
|
760
743
|
if isinstance(title_value, str) and title_value.strip():
|
|
761
744
|
resolved_title = title_value.strip()
|
|
762
745
|
|
|
763
|
-
frontmatter = _ensure_biblicus_block(
|
|
746
|
+
frontmatter = _ensure_biblicus_block(
|
|
747
|
+
frontmatter, item_id=item_id, source_uri=source_uri
|
|
748
|
+
)
|
|
764
749
|
rendered_document = render_front_matter(frontmatter, parsed_document.body)
|
|
765
750
|
data_to_write = rendered_document.encode("utf-8")
|
|
766
751
|
else:
|
|
@@ -807,7 +792,9 @@ class Corpus:
|
|
|
807
792
|
sidecar_metadata["media_type"] = media_type
|
|
808
793
|
sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
|
|
809
794
|
_write_sidecar(output_path, sidecar_metadata)
|
|
810
|
-
frontmatter = _merge_metadata(
|
|
795
|
+
frontmatter = _merge_metadata(
|
|
796
|
+
frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
|
|
797
|
+
)
|
|
811
798
|
|
|
812
799
|
created_at = utc_now_iso()
|
|
813
800
|
item_record = CatalogItem(
|
|
@@ -858,7 +845,6 @@ class Corpus:
|
|
|
858
845
|
:rtype: IngestResult
|
|
859
846
|
:raises ValueError: If the media_type is text/markdown.
|
|
860
847
|
"""
|
|
861
|
-
|
|
862
848
|
if media_type == "text/markdown":
|
|
863
849
|
raise ValueError("Stream ingestion is not supported for Markdown")
|
|
864
850
|
|
|
@@ -972,7 +958,6 @@ class Corpus:
|
|
|
972
958
|
:return: Ingestion result summary.
|
|
973
959
|
:rtype: IngestResult
|
|
974
960
|
"""
|
|
975
|
-
|
|
976
961
|
data = text.encode("utf-8")
|
|
977
962
|
return self.ingest_item(
|
|
978
963
|
data,
|
|
@@ -1003,7 +988,6 @@ class Corpus:
|
|
|
1003
988
|
:return: Ingestion result summary.
|
|
1004
989
|
:rtype: IngestResult
|
|
1005
990
|
"""
|
|
1006
|
-
|
|
1007
991
|
candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
|
|
1008
992
|
if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
|
|
1009
993
|
path = source if isinstance(source, Path) else candidate_path
|
|
@@ -1061,7 +1045,6 @@ class Corpus:
|
|
|
1061
1045
|
:raises FileNotFoundError: If the source_root does not exist.
|
|
1062
1046
|
:raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
|
|
1063
1047
|
"""
|
|
1064
|
-
|
|
1065
1048
|
source_root = source_root.resolve()
|
|
1066
1049
|
if not source_root.is_dir():
|
|
1067
1050
|
raise FileNotFoundError(f"Import source root does not exist: {source_root}")
|
|
@@ -1111,9 +1094,10 @@ class Corpus:
|
|
|
1111
1094
|
:rtype: None
|
|
1112
1095
|
:raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
|
|
1113
1096
|
"""
|
|
1114
|
-
|
|
1115
1097
|
item_id = str(uuid.uuid4())
|
|
1116
|
-
destination_relpath = str(
|
|
1098
|
+
destination_relpath = str(
|
|
1099
|
+
Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
|
|
1100
|
+
)
|
|
1117
1101
|
destination_path = (self.root / destination_relpath).resolve()
|
|
1118
1102
|
destination_path.parent.mkdir(parents=True, exist_ok=True)
|
|
1119
1103
|
|
|
@@ -1176,11 +1160,8 @@ class Corpus:
|
|
|
1176
1160
|
:return: Catalog items ordered by recency.
|
|
1177
1161
|
:rtype: list[CatalogItem]
|
|
1178
1162
|
"""
|
|
1179
|
-
|
|
1180
1163
|
catalog = self._load_catalog()
|
|
1181
|
-
ordered_ids = (
|
|
1182
|
-
catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
|
|
1183
|
-
)
|
|
1164
|
+
ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
|
|
1184
1165
|
collected_items: List[CatalogItem] = []
|
|
1185
1166
|
for item_id in ordered_ids:
|
|
1186
1167
|
item = catalog.items.get(item_id)
|
|
@@ -1198,7 +1179,6 @@ class Corpus:
|
|
|
1198
1179
|
:rtype: CatalogItem
|
|
1199
1180
|
:raises KeyError: If the item identifier is unknown.
|
|
1200
1181
|
"""
|
|
1201
|
-
|
|
1202
1182
|
catalog = self._load_catalog()
|
|
1203
1183
|
item = catalog.items.get(item_id)
|
|
1204
1184
|
if item is None:
|
|
@@ -1216,7 +1196,6 @@ class Corpus:
|
|
|
1216
1196
|
:rtype: dict[str, int]
|
|
1217
1197
|
:raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
|
|
1218
1198
|
"""
|
|
1219
|
-
|
|
1220
1199
|
self._init_catalog()
|
|
1221
1200
|
existing_catalog = self._load_catalog()
|
|
1222
1201
|
stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}
|
|
@@ -1291,7 +1270,9 @@ class Corpus:
|
|
|
1291
1270
|
|
|
1292
1271
|
previous_item = existing_catalog.items.get(item_id)
|
|
1293
1272
|
created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
|
|
1294
|
-
source_uri = source_uri or (
|
|
1273
|
+
source_uri = source_uri or (
|
|
1274
|
+
previous_item.source_uri if previous_item is not None else None
|
|
1275
|
+
)
|
|
1295
1276
|
|
|
1296
1277
|
if previous_item is None:
|
|
1297
1278
|
stats["inserted"] += 1
|
|
@@ -1338,7 +1319,6 @@ class Corpus:
|
|
|
1338
1319
|
:return: Corpus name.
|
|
1339
1320
|
:rtype: str
|
|
1340
1321
|
"""
|
|
1341
|
-
|
|
1342
1322
|
return self.root.name
|
|
1343
1323
|
|
|
1344
1324
|
def purge(self, *, confirm: str) -> None:
|
|
@@ -1351,10 +1331,11 @@ class Corpus:
|
|
|
1351
1331
|
:rtype: None
|
|
1352
1332
|
:raises ValueError: If the confirmation does not match.
|
|
1353
1333
|
"""
|
|
1354
|
-
|
|
1355
1334
|
expected = self.name
|
|
1356
1335
|
if confirm != expected:
|
|
1357
|
-
raise ValueError(
|
|
1336
|
+
raise ValueError(
|
|
1337
|
+
f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
|
|
1338
|
+
)
|
|
1358
1339
|
|
|
1359
1340
|
if self.raw_dir.exists():
|
|
1360
1341
|
shutil.rmtree(self.raw_dir)
|
biblicus/errors.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Error types for Biblicus.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ExtractionRunFatalError(RuntimeError):
|
|
9
|
+
"""
|
|
10
|
+
Fatal extraction run error that should abort the entire run.
|
|
11
|
+
|
|
12
|
+
This exception is used for conditions that indicate a configuration or environment problem
|
|
13
|
+
rather than a per-item extraction failure. For example, a selection extractor that depends
|
|
14
|
+
on referenced extraction run manifests treats missing manifests as fatal.
|
|
15
|
+
"""
|
biblicus/evaluation.py
CHANGED
|
@@ -11,8 +11,8 @@ from typing import Dict, List, Optional
|
|
|
11
11
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
13
13
|
|
|
14
|
-
from .constants import DATASET_SCHEMA_VERSION
|
|
15
14
|
from .backends import get_backend
|
|
15
|
+
from .constants import DATASET_SCHEMA_VERSION
|
|
16
16
|
from .corpus import Corpus
|
|
17
17
|
from .models import QueryBudget, RetrievalResult, RetrievalRun
|
|
18
18
|
from .time import utc_now_iso
|
|
@@ -45,7 +45,9 @@ class EvaluationQuery(BaseModel):
|
|
|
45
45
|
@model_validator(mode="after")
|
|
46
46
|
def _require_expectation(self) -> "EvaluationQuery":
|
|
47
47
|
if not self.expected_item_id and not self.expected_source_uri:
|
|
48
|
-
raise ValueError(
|
|
48
|
+
raise ValueError(
|
|
49
|
+
"Evaluation queries must include expected_item_id or expected_source_uri"
|
|
50
|
+
)
|
|
49
51
|
return self
|
|
50
52
|
|
|
51
53
|
|
|
@@ -114,7 +116,6 @@ def load_dataset(path: Path) -> EvaluationDataset:
|
|
|
114
116
|
:return: Parsed evaluation dataset.
|
|
115
117
|
:rtype: EvaluationDataset
|
|
116
118
|
"""
|
|
117
|
-
|
|
118
119
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
119
120
|
return EvaluationDataset.model_validate(data)
|
|
120
121
|
|
|
@@ -140,7 +141,6 @@ def evaluate_run(
|
|
|
140
141
|
:return: Evaluation result bundle.
|
|
141
142
|
:rtype: EvaluationResult
|
|
142
143
|
"""
|
|
143
|
-
|
|
144
144
|
backend = get_backend(run.recipe.backend_id)
|
|
145
145
|
latency_seconds: List[float] = []
|
|
146
146
|
hit_count = 0
|
|
@@ -200,7 +200,6 @@ def _expected_rank(result: RetrievalResult, query: EvaluationQuery) -> Optional[
|
|
|
200
200
|
:return: Rank of the first matching evidence item, or None.
|
|
201
201
|
:rtype: int or None
|
|
202
202
|
"""
|
|
203
|
-
|
|
204
203
|
for evidence in result.evidence:
|
|
205
204
|
if query.expected_item_id and evidence.item_id == query.expected_item_id:
|
|
206
205
|
return evidence.rank
|
|
@@ -218,7 +217,6 @@ def _average_latency_milliseconds(latencies: List[float]) -> float:
|
|
|
218
217
|
:return: Average latency in milliseconds.
|
|
219
218
|
:rtype: float
|
|
220
219
|
"""
|
|
221
|
-
|
|
222
220
|
if not latencies:
|
|
223
221
|
return 0.0
|
|
224
222
|
return sum(latencies) / len(latencies) * 1000.0
|
|
@@ -233,7 +231,6 @@ def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
|
|
|
233
231
|
:return: Percentile 95 latency in milliseconds.
|
|
234
232
|
:rtype: float
|
|
235
233
|
"""
|
|
236
|
-
|
|
237
234
|
if not latencies:
|
|
238
235
|
return 0.0
|
|
239
236
|
sorted_latencies = sorted(latencies)
|
|
@@ -252,7 +249,6 @@ def _run_artifact_bytes(corpus: Corpus, run: RetrievalRun) -> int:
|
|
|
252
249
|
:return: Total artifact bytes.
|
|
253
250
|
:rtype: int
|
|
254
251
|
"""
|
|
255
|
-
|
|
256
252
|
total_bytes = 0
|
|
257
253
|
for artifact_relpath in run.artifact_paths:
|
|
258
254
|
artifact_path = corpus.root / artifact_relpath
|