biblicus-1.0.0-py3-none-any.whl → biblicus-1.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
- biblicus-1.1.1.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
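The most visible change in this release is the rename of the biblicus/backends package to biblicus/retrievers (and of recipes.py to configuration.py). A minimal sketch of what that likely means for downstream imports, assuming the new modules are importable under the same top-level package; the evaluation.py diff below shows get_backend being replaced by get_retriever:

    # biblicus 1.0.0 (old import path)
    from biblicus.backends import get_backend

    # biblicus 1.1.1 (new import path)
    from biblicus.retrievers import get_retriever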
biblicus/corpus.py
CHANGED
@@ -20,10 +20,10 @@ from .constants import (
     ANALYSIS_RUNS_DIR_NAME,
     CORPUS_DIR_NAME,
     DEFAULT_RAW_DIR,
-    …
-    RUNS_DIR_NAME,
+    EXTRACTION_SNAPSHOTS_DIR_NAME,
     SCHEMA_VERSION,
     SIDECAR_SUFFIX,
+    SNAPSHOTS_DIR_NAME,
 )
 from .errors import IngestCollisionError
 from .frontmatter import parse_front_matter, render_front_matter
@@ -34,10 +34,10 @@ from .models import (
     CatalogItem,
     CorpusCatalog,
     CorpusConfig,
-    …
-    …
+    ExtractionSnapshotListEntry,
+    ExtractionSnapshotReference,
     IngestResult,
-    …
+    RetrievalSnapshot,
 )
 from .sources import load_source
 from .time import utc_now_iso
@@ -539,7 +539,7 @@ class Corpus:
             generated_at=utc_now_iso(),
             corpus_uri=normalize_corpus_uri(self.root),
             raw_dir=DEFAULT_RAW_DIR,
-            …
+            latest_snapshot_id=None,
             items={},
             order=[],
         )
@@ -602,69 +602,71 @@ class Corpus:
         return None

     @property
-    def …
+    def snapshots_dir(self) -> Path:
         """
-        Location of retrieval …
+        Location of retrieval snapshot manifests.

-        :return: Path to the …
+        :return: Path to the snapshots directory.
         :rtype: Path
         """
-        return self.meta_dir / …
+        return self.meta_dir / SNAPSHOTS_DIR_NAME

     @property
-    def …
+    def extraction_snapshots_dir(self) -> Path:
         """
-        Location of extraction …
+        Location of extraction snapshot artifacts.

-        :return: Path to the extraction …
+        :return: Path to the extraction snapshots directory.
         :rtype: Path
         """
-        return self.…
+        return self.snapshots_dir / EXTRACTION_SNAPSHOTS_DIR_NAME

     @property
     def analysis_runs_dir(self) -> Path:
         """
-        Location of analysis …
+        Location of analysis snapshot artifacts.

-        :return: Path to the analysis …
+        :return: Path to the analysis snapshots directory.
         :rtype: Path
         """
-        return self.…
+        return self.snapshots_dir / ANALYSIS_RUNS_DIR_NAME

-    def …
+    def extraction_snapshot_dir(self, *, extractor_id: str, snapshot_id: str) -> Path:
         """
-        Resolve an extraction …
+        Resolve an extraction snapshot directory.

         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param …
-        :type …
-        :return: Extraction …
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
+        :return: Extraction snapshot directory.
         :rtype: Path
         """
-        return self.…
+        return self.extraction_snapshots_dir / extractor_id / snapshot_id

-    def analysis_run_dir(self, *, analysis_id: str, …
+    def analysis_run_dir(self, *, analysis_id: str, snapshot_id: str) -> Path:
         """
-        Resolve an analysis …
+        Resolve an analysis snapshot directory.

         :param analysis_id: Analysis backend identifier.
         :type analysis_id: str
-        :param …
-        :type …
-        :return: Analysis …
+        :param snapshot_id: Analysis snapshot identifier.
+        :type snapshot_id: str
+        :return: Analysis snapshot directory.
         :rtype: Path
         """
-        return self.analysis_runs_dir / analysis_id / …
+        return self.analysis_runs_dir / analysis_id / snapshot_id

-    def read_extracted_text(…
+    def read_extracted_text(
+        self, *, extractor_id: str, snapshot_id: str, item_id: str
+    ) -> Optional[str]:
         """
-        Read extracted text for an item from an extraction …
+        Read extracted text for an item from an extraction snapshot, when present.

         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param …
-        :type …
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
         :param item_id: Item identifier.
         :type item_id: str
         :return: Extracted text or None if the artifact does not exist.
@@ -672,7 +674,7 @@ class Corpus:
         :raises OSError: If the file exists but cannot be read.
         """
         path = (
-            self.…
+            self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
             / "text"
             / f"{item_id}.txt"
         )
@@ -680,72 +682,73 @@ class Corpus:
             return None
         return path.read_text(encoding="utf-8")

-    def …
+    def load_extraction_snapshot_manifest(self, *, extractor_id: str, snapshot_id: str):
         """
-        Load an extraction …
+        Load an extraction snapshot manifest from the corpus.

         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param …
-        :type …
-        :return: Parsed extraction …
-        :rtype: biblicus.extraction.…
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
+        :return: Parsed extraction snapshot manifest.
+        :rtype: biblicus.extraction.ExtractionSnapshotManifest
         :raises FileNotFoundError: If the manifest file does not exist.
         :raises ValueError: If the manifest data is invalid.
         """
-        from .extraction import …
+        from .extraction import ExtractionSnapshotManifest

         manifest_path = (
-            self.…
+            self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
+            / "manifest.json"
         )
         if not manifest_path.is_file():
-            raise FileNotFoundError(f"Missing extraction …
+            raise FileNotFoundError(f"Missing extraction snapshot manifest: {manifest_path}")
         data = json.loads(manifest_path.read_text(encoding="utf-8"))
-        return …
+        return ExtractionSnapshotManifest.model_validate(data)

-    def …
+    def list_extraction_snapshots(
         self, *, extractor_id: Optional[str] = None
-    ) -> List[…
+    ) -> List[ExtractionSnapshotListEntry]:
         """
-        List extraction …
+        List extraction snapshots stored under the corpus.

         :param extractor_id: Optional extractor identifier filter.
         :type extractor_id: str or None
-        :return: Summary list entries for each …
-        :rtype: list[biblicus.models.…
+        :return: Summary list entries for each snapshot.
+        :rtype: list[biblicus.models.ExtractionSnapshotListEntry]
         """
-        …
-        if not …
+        snapshots_root = self.extraction_snapshots_dir
+        if not snapshots_root.is_dir():
            return []

         extractor_dirs: List[Path]
         if extractor_id is None:
-            extractor_dirs = [path for path in sorted(…
+            extractor_dirs = [path for path in sorted(snapshots_root.iterdir()) if path.is_dir()]
         else:
-            extractor_path = …
+            extractor_path = snapshots_root / extractor_id
             extractor_dirs = [extractor_path] if extractor_path.is_dir() else []

-        entries: List[…
+        entries: List[ExtractionSnapshotListEntry] = []
         for extractor_dir in extractor_dirs:
-            for …
-                if not …
+            for snapshot_dir in sorted(extractor_dir.iterdir()):
+                if not snapshot_dir.is_dir():
                     continue
-                manifest_path = …
+                manifest_path = snapshot_dir / "manifest.json"
                 if not manifest_path.is_file():
                     continue
                 try:
-                    manifest = self.…
+                    manifest = self.load_extraction_snapshot_manifest(
                         extractor_id=extractor_dir.name,
-                        …
+                        snapshot_id=snapshot_dir.name,
                     )
                 except (FileNotFoundError, ValueError):
                     continue
                 entries.append(
-                    …
+                    ExtractionSnapshotListEntry(
                         extractor_id=extractor_dir.name,
-                        …
-                        …
-                        …
+                        snapshot_id=snapshot_dir.name,
+                        configuration_id=manifest.configuration.configuration_id,
+                        configuration_name=manifest.configuration.name,
                         catalog_generated_at=manifest.catalog_generated_at,
                         created_at=manifest.created_at,
                         stats=dict(manifest.stats),
@@ -753,95 +756,100 @@ class Corpus:
                 )

         entries.sort(
-            key=lambda entry: (entry.created_at, entry.extractor_id, entry.…
+            key=lambda entry: (entry.created_at, entry.extractor_id, entry.snapshot_id),
+            reverse=True,
         )
         return entries

-    def …
+    def latest_extraction_snapshot_reference(
         self, *, extractor_id: Optional[str] = None
-    ) -> Optional[…
+    ) -> Optional[ExtractionSnapshotReference]:
         """
-        Return the most recent extraction …
+        Return the most recent extraction snapshot reference.

         :param extractor_id: Optional extractor identifier filter.
         :type extractor_id: str or None
-        :return: Latest extraction …
-        :rtype: biblicus.models.…
+        :return: Latest extraction snapshot reference or None when no snapshots exist.
+        :rtype: biblicus.models.ExtractionSnapshotReference or None
         """
-        entries = self.…
+        entries = self.list_extraction_snapshots(extractor_id=extractor_id)
         if not entries:
             return None
         latest = entries[0]
-        return …
+        return ExtractionSnapshotReference(
+            extractor_id=latest.extractor_id, snapshot_id=latest.snapshot_id
+        )

-    def …
+    def delete_extraction_snapshot(self, *, extractor_id: str, snapshot_id: str) -> None:
         """
-        Delete an extraction …
+        Delete an extraction snapshot directory and its derived artifacts.

         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param …
-        :type …
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
         :return: None.
         :rtype: None
-        :raises FileNotFoundError: If the extraction …
+        :raises FileNotFoundError: If the extraction snapshot directory does not exist.
         """
-        …
-        …
-        …
-        …
+        snapshot_dir = self.extraction_snapshot_dir(
+            extractor_id=extractor_id, snapshot_id=snapshot_id
+        )
+        if not snapshot_dir.is_dir():
+            raise FileNotFoundError(f"Missing extraction snapshot directory: {snapshot_dir}")
+        shutil.rmtree(snapshot_dir)

-    def …
+    def _ensure_snapshots_dir(self) -> None:
         """
-        Ensure the retrieval …
+        Ensure the retrieval snapshots directory exists.

         :return: None.
         :rtype: None
         """
-        self.…
+        self.snapshots_dir.mkdir(parents=True, exist_ok=True)

-    def …
+    def write_snapshot(self, snapshot: RetrievalSnapshot) -> None:
         """
-        Persist a retrieval …
+        Persist a retrieval snapshot manifest and update the catalog pointer.

-        :param …
-        :type …
+        :param snapshot: Snapshot manifest to persist.
+        :type snapshot: RetrievalSnapshot
         :return: None.
         :rtype: None
         """
-        self.…
-        path = self.…
-        path.write_text(…
+        self._ensure_snapshots_dir()
+        path = self.snapshots_dir / f"{snapshot.snapshot_id}.json"
+        path.write_text(snapshot.model_dump_json(indent=2) + "\n", encoding="utf-8")
         catalog = self._load_catalog()
-        catalog.…
+        catalog.latest_snapshot_id = snapshot.snapshot_id
         catalog.generated_at = utc_now_iso()
         self._write_catalog(catalog)

-    def …
+    def load_snapshot(self, snapshot_id: str) -> RetrievalSnapshot:
         """
-        Load a retrieval …
+        Load a retrieval snapshot manifest by identifier.

-        :param …
-        :type …
-        :return: Parsed …
-        :rtype: …
-        :raises FileNotFoundError: If the …
+        :param snapshot_id: Snapshot identifier.
+        :type snapshot_id: str
+        :return: Parsed snapshot manifest.
+        :rtype: RetrievalSnapshot
+        :raises FileNotFoundError: If the snapshot manifest does not exist.
         """
-        path = self.…
+        path = self.snapshots_dir / f"{snapshot_id}.json"
         if not path.is_file():
-            raise FileNotFoundError(f"Missing …
+            raise FileNotFoundError(f"Missing snapshot manifest: {path}")
         data = json.loads(path.read_text(encoding="utf-8"))
-        return …
+        return RetrievalSnapshot.model_validate(data)

     @property
-    def …
+    def latest_snapshot_id(self) -> Optional[str]:
         """
-        Latest retrieval …
+        Latest retrieval snapshot identifier recorded in the catalog.

-        :return: Latest …
+        :return: Latest snapshot identifier or None.
         :rtype: str or None
         """
-        return self._load_catalog().…
+        return self._load_catalog().latest_snapshot_id

     def _upsert_catalog_item(self, item: CatalogItem) -> None:
         """
@@ -860,7 +868,7 @@ class Corpus:
         ordered_ids.insert(0, item.id)
         catalog.order = ordered_ids
         catalog.generated_at = utc_now_iso()
-        catalog.…
+        catalog.latest_snapshot_id = None

         self._write_catalog(catalog)

@@ -1621,7 +1629,7 @@ class Corpus:
             generated_at=utc_now_iso(),
             corpus_uri=normalize_corpus_uri(self.root),
             raw_dir=DEFAULT_RAW_DIR,
-            …
+            latest_snapshot_id=None,
             items=new_items,
             order=order,
         )
@@ -1673,7 +1681,7 @@ class Corpus:
             generated_at=utc_now_iso(),
             corpus_uri=normalize_corpus_uri(self.root),
             raw_dir=DEFAULT_RAW_DIR,
-            …
+            latest_snapshot_id=None,
             items={},
             order=[],
         )
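Taken together, these hunks replace the 1.0.0 "run" vocabulary with "snapshot" throughout the Corpus API. A minimal usage sketch against the new method names shown above; how the corpus object itself is constructed is not part of this diff, so that step and the item_id value are assumptions:

    # assumes an existing Corpus instance named `corpus`
    reference = corpus.latest_extraction_snapshot_reference()
    if reference is not None:
        text = corpus.read_extracted_text(
            extractor_id=reference.extractor_id,
            snapshot_id=reference.snapshot_id,
            item_id="some-item-id",  # assumed identifier, for illustration only
        )

    # retrieval snapshots are keyed by the catalog's latest_snapshot_id
    snapshot_id = corpus.latest_snapshot_id
    if snapshot_id is not None:
        snapshot = corpus.load_snapshot(snapshot_id)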
biblicus/errors.py
CHANGED
@@ -5,13 +5,13 @@ Error types for Biblicus.
 from __future__ import annotations


-class …
+class ExtractionSnapshotFatalError(RuntimeError):
     """
-    Fatal extraction …
+    Fatal extraction snapshot error that should abort the entire snapshot.

     This exception is used for conditions that indicate a configuration or environment problem
     rather than a per-item extraction failure. For example, a selection extractor that depends
-    on referenced extraction …
+    on referenced extraction snapshot manifests treats missing manifests as fatal.
     """


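errors.py keeps a single public exception but renames it to the snapshot vocabulary; it still derives from RuntimeError. A hedged sketch of how calling code might treat it; build_extraction_snapshot is a hypothetical stand-in, since the real call site is not part of this diff:

    from biblicus.errors import ExtractionSnapshotFatalError

    try:
        build_extraction_snapshot()  # hypothetical entry point, not shown in this diff
    except ExtractionSnapshotFatalError as error:
        # configuration or environment problem: abort the whole snapshot rather than skip items
        raise SystemExit(f"extraction snapshot aborted: {error}")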
biblicus/evaluation.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Evaluation utilities for Biblicus retrieval …
+Evaluation utilities for Biblicus retrieval snapshots.
 """

 from __future__ import annotations
@@ -11,10 +11,10 @@ from typing import Dict, List, Optional

 from pydantic import BaseModel, ConfigDict, Field, model_validator

-from .backends import get_backend
 from .constants import DATASET_SCHEMA_VERSION
 from .corpus import Corpus
-from .models import QueryBudget, RetrievalResult, …
+from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
+from .retrievers import get_retriever
 from .time import utc_now_iso


@@ -85,10 +85,10 @@ class EvaluationResult(BaseModel):

     :ivar dataset: Dataset metadata.
     :vartype dataset: dict[str, object]
-    :ivar …
-    :vartype …
-    :ivar …
-    :vartype …
+    :ivar retriever_id: Retriever identifier.
+    :vartype retriever_id: str
+    :ivar snapshot_id: Retrieval snapshot identifier.
+    :vartype snapshot_id: str
     :ivar evaluated_at: International Organization for Standardization 8601 evaluation timestamp.
     :vartype evaluated_at: str
     :ivar metrics: Quality metrics for retrieval.
@@ -100,8 +100,8 @@ class EvaluationResult(BaseModel):
     model_config = ConfigDict(extra="forbid")

     dataset: Dict[str, object]
-    …
-    …
+    retriever_id: str
+    snapshot_id: str
     evaluated_at: str
     metrics: Dict[str, float]
     system: Dict[str, float]
@@ -120,20 +120,20 @@ def load_dataset(path: Path) -> EvaluationDataset:
     return EvaluationDataset.model_validate(data)


-def …
+def evaluate_snapshot(
     *,
     corpus: Corpus,
-    …
+    snapshot: RetrievalSnapshot,
     dataset: EvaluationDataset,
     budget: QueryBudget,
 ) -> EvaluationResult:
     """
-    Evaluate a retrieval …
+    Evaluate a retrieval snapshot against a dataset.

-    :param corpus: Corpus associated with the …
+    :param corpus: Corpus associated with the snapshot.
     :type corpus: Corpus
-    :param …
-    :type …
+    :param snapshot: Retrieval snapshot manifest.
+    :type snapshot: RetrievalSnapshot
     :param dataset: Evaluation dataset.
     :type dataset: EvaluationDataset
     :param budget: Evidence selection budget.
@@ -141,14 +141,16 @@ def evaluate_run(
     :return: Evaluation result bundle.
     :rtype: EvaluationResult
     """
-    …
+    retriever = get_retriever(snapshot.configuration.retriever_id)
     latency_seconds: List[float] = []
     hit_count = 0
     reciprocal_ranks: List[float] = []

     for query in dataset.queries:
         timer_start = time.perf_counter()
-        result = …
+        result = retriever.query(
+            corpus, snapshot=snapshot, query_text=query.query_text, budget=budget
+        )
         elapsed_seconds = time.perf_counter() - timer_start
         latency_seconds.append(elapsed_seconds)
         expected_rank = _expected_rank(result, query)
@@ -172,7 +174,7 @@
     system = {
         "average_latency_milliseconds": _average_latency_milliseconds(latency_seconds),
         "percentile_95_latency_milliseconds": _percentile_95_latency_milliseconds(latency_seconds),
-        "index_bytes": float(…
+        "index_bytes": float(_snapshot_artifact_bytes(corpus, snapshot)),
     }
     dataset_meta = {
         "name": dataset.name,
@@ -181,8 +183,8 @@
     }
     return EvaluationResult(
         dataset=dataset_meta,
-        …
-        …
+        retriever_id=snapshot.configuration.retriever_id,
+        snapshot_id=snapshot.snapshot_id,
         evaluated_at=utc_now_iso(),
         metrics=metrics,
         system=system,
@@ -238,19 +240,19 @@ def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
     return sorted_latencies[percentile_index] * 1000.0


-def …
+def _snapshot_artifact_bytes(corpus: Corpus, snapshot: RetrievalSnapshot) -> int:
     """
-    Sum artifact sizes for a retrieval …
+    Sum artifact sizes for a retrieval snapshot.

     :param corpus: Corpus that owns the artifacts.
     :type corpus: Corpus
-    :param …
-    :type …
+    :param snapshot: Retrieval snapshot manifest.
+    :type snapshot: RetrievalSnapshot
     :return: Total artifact bytes.
     :rtype: int
     """
     total_bytes = 0
-    for artifact_relpath in …
+    for artifact_relpath in snapshot.snapshot_artifacts:
         artifact_path = corpus.root / artifact_relpath
         if artifact_path.exists():
             total_bytes += artifact_path.stat().st_size