biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
  43. biblicus-1.1.1.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
biblicus/corpus.py CHANGED
@@ -20,10 +20,10 @@ from .constants import (
20
20
  ANALYSIS_RUNS_DIR_NAME,
21
21
  CORPUS_DIR_NAME,
22
22
  DEFAULT_RAW_DIR,
23
- EXTRACTION_RUNS_DIR_NAME,
24
- RUNS_DIR_NAME,
23
+ EXTRACTION_SNAPSHOTS_DIR_NAME,
25
24
  SCHEMA_VERSION,
26
25
  SIDECAR_SUFFIX,
26
+ SNAPSHOTS_DIR_NAME,
27
27
  )
28
28
  from .errors import IngestCollisionError
29
29
  from .frontmatter import parse_front_matter, render_front_matter
@@ -34,10 +34,10 @@ from .models import (
34
34
  CatalogItem,
35
35
  CorpusCatalog,
36
36
  CorpusConfig,
37
- ExtractionRunListEntry,
38
- ExtractionRunReference,
37
+ ExtractionSnapshotListEntry,
38
+ ExtractionSnapshotReference,
39
39
  IngestResult,
40
- RetrievalRun,
40
+ RetrievalSnapshot,
41
41
  )
42
42
  from .sources import load_source
43
43
  from .time import utc_now_iso
@@ -539,7 +539,7 @@ class Corpus:
539
539
  generated_at=utc_now_iso(),
540
540
  corpus_uri=normalize_corpus_uri(self.root),
541
541
  raw_dir=DEFAULT_RAW_DIR,
542
- latest_run_id=None,
542
+ latest_snapshot_id=None,
543
543
  items={},
544
544
  order=[],
545
545
  )
@@ -602,69 +602,71 @@ class Corpus:
602
602
  return None
603
603
 
604
604
  @property
605
- def runs_dir(self) -> Path:
605
+ def snapshots_dir(self) -> Path:
606
606
  """
607
- Location of retrieval run manifests.
607
+ Location of retrieval snapshot manifests.
608
608
 
609
- :return: Path to the runs directory.
609
+ :return: Path to the snapshots directory.
610
610
  :rtype: Path
611
611
  """
612
- return self.meta_dir / RUNS_DIR_NAME
612
+ return self.meta_dir / SNAPSHOTS_DIR_NAME
613
613
 
614
614
  @property
615
- def extraction_runs_dir(self) -> Path:
615
+ def extraction_snapshots_dir(self) -> Path:
616
616
  """
617
- Location of extraction run artifacts.
617
+ Location of extraction snapshot artifacts.
618
618
 
619
- :return: Path to the extraction runs directory.
619
+ :return: Path to the extraction snapshots directory.
620
620
  :rtype: Path
621
621
  """
622
- return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
622
+ return self.snapshots_dir / EXTRACTION_SNAPSHOTS_DIR_NAME
623
623
 
624
624
  @property
625
625
  def analysis_runs_dir(self) -> Path:
626
626
  """
627
- Location of analysis run artifacts.
627
+ Location of analysis snapshot artifacts.
628
628
 
629
- :return: Path to the analysis runs directory.
629
+ :return: Path to the analysis snapshots directory.
630
630
  :rtype: Path
631
631
  """
632
- return self.runs_dir / ANALYSIS_RUNS_DIR_NAME
632
+ return self.snapshots_dir / ANALYSIS_RUNS_DIR_NAME
633
633
 
634
- def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
634
+ def extraction_snapshot_dir(self, *, extractor_id: str, snapshot_id: str) -> Path:
635
635
  """
636
- Resolve an extraction run directory.
636
+ Resolve an extraction snapshot directory.
637
637
 
638
638
  :param extractor_id: Extractor plugin identifier.
639
639
  :type extractor_id: str
640
- :param run_id: Extraction run identifier.
641
- :type run_id: str
642
- :return: Extraction run directory.
640
+ :param snapshot_id: Extraction snapshot identifier.
641
+ :type snapshot_id: str
642
+ :return: Extraction snapshot directory.
643
643
  :rtype: Path
644
644
  """
645
- return self.extraction_runs_dir / extractor_id / run_id
645
+ return self.extraction_snapshots_dir / extractor_id / snapshot_id
646
646
 
647
- def analysis_run_dir(self, *, analysis_id: str, run_id: str) -> Path:
647
+ def analysis_run_dir(self, *, analysis_id: str, snapshot_id: str) -> Path:
648
648
  """
649
- Resolve an analysis run directory.
649
+ Resolve an analysis snapshot directory.
650
650
 
651
651
  :param analysis_id: Analysis backend identifier.
652
652
  :type analysis_id: str
653
- :param run_id: Analysis run identifier.
654
- :type run_id: str
655
- :return: Analysis run directory.
653
+ :param snapshot_id: Analysis snapshot identifier.
654
+ :type snapshot_id: str
655
+ :return: Analysis snapshot directory.
656
656
  :rtype: Path
657
657
  """
658
- return self.analysis_runs_dir / analysis_id / run_id
658
+ return self.analysis_runs_dir / analysis_id / snapshot_id
659
659
 
660
- def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
660
+ def read_extracted_text(
661
+ self, *, extractor_id: str, snapshot_id: str, item_id: str
662
+ ) -> Optional[str]:
661
663
  """
662
- Read extracted text for an item from an extraction run, when present.
664
+ Read extracted text for an item from an extraction snapshot, when present.
663
665
 
664
666
  :param extractor_id: Extractor plugin identifier.
665
667
  :type extractor_id: str
666
- :param run_id: Extraction run identifier.
667
- :type run_id: str
668
+ :param snapshot_id: Extraction snapshot identifier.
669
+ :type snapshot_id: str
668
670
  :param item_id: Item identifier.
669
671
  :type item_id: str
670
672
  :return: Extracted text or None if the artifact does not exist.
@@ -672,7 +674,7 @@ class Corpus:
672
674
  :raises OSError: If the file exists but cannot be read.
673
675
  """
674
676
  path = (
675
- self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
677
+ self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
676
678
  / "text"
677
679
  / f"{item_id}.txt"
678
680
  )
@@ -680,72 +682,73 @@ class Corpus:
680
682
  return None
681
683
  return path.read_text(encoding="utf-8")
682
684
 
683
- def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
685
+ def load_extraction_snapshot_manifest(self, *, extractor_id: str, snapshot_id: str):
684
686
  """
685
- Load an extraction run manifest from the corpus.
687
+ Load an extraction snapshot manifest from the corpus.
686
688
 
687
689
  :param extractor_id: Extractor plugin identifier.
688
690
  :type extractor_id: str
689
- :param run_id: Extraction run identifier.
690
- :type run_id: str
691
- :return: Parsed extraction run manifest.
692
- :rtype: biblicus.extraction.ExtractionRunManifest
691
+ :param snapshot_id: Extraction snapshot identifier.
692
+ :type snapshot_id: str
693
+ :return: Parsed extraction snapshot manifest.
694
+ :rtype: biblicus.extraction.ExtractionSnapshotManifest
693
695
  :raises FileNotFoundError: If the manifest file does not exist.
694
696
  :raises ValueError: If the manifest data is invalid.
695
697
  """
696
- from .extraction import ExtractionRunManifest
698
+ from .extraction import ExtractionSnapshotManifest
697
699
 
698
700
  manifest_path = (
699
- self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
701
+ self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
702
+ / "manifest.json"
700
703
  )
701
704
  if not manifest_path.is_file():
702
- raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
705
+ raise FileNotFoundError(f"Missing extraction snapshot manifest: {manifest_path}")
703
706
  data = json.loads(manifest_path.read_text(encoding="utf-8"))
704
- return ExtractionRunManifest.model_validate(data)
707
+ return ExtractionSnapshotManifest.model_validate(data)
705
708
 
706
- def list_extraction_runs(
709
+ def list_extraction_snapshots(
707
710
  self, *, extractor_id: Optional[str] = None
708
- ) -> List[ExtractionRunListEntry]:
711
+ ) -> List[ExtractionSnapshotListEntry]:
709
712
  """
710
- List extraction runs stored under the corpus.
713
+ List extraction snapshots stored under the corpus.
711
714
 
712
715
  :param extractor_id: Optional extractor identifier filter.
713
716
  :type extractor_id: str or None
714
- :return: Summary list entries for each run.
715
- :rtype: list[biblicus.models.ExtractionRunListEntry]
717
+ :return: Summary list entries for each snapshot.
718
+ :rtype: list[biblicus.models.ExtractionSnapshotListEntry]
716
719
  """
717
- runs_root = self.extraction_runs_dir
718
- if not runs_root.is_dir():
720
+ snapshots_root = self.extraction_snapshots_dir
721
+ if not snapshots_root.is_dir():
719
722
  return []
720
723
 
721
724
  extractor_dirs: List[Path]
722
725
  if extractor_id is None:
723
- extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
726
+ extractor_dirs = [path for path in sorted(snapshots_root.iterdir()) if path.is_dir()]
724
727
  else:
725
- extractor_path = runs_root / extractor_id
728
+ extractor_path = snapshots_root / extractor_id
726
729
  extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
727
730
 
728
- entries: List[ExtractionRunListEntry] = []
731
+ entries: List[ExtractionSnapshotListEntry] = []
729
732
  for extractor_dir in extractor_dirs:
730
- for run_dir in sorted(extractor_dir.iterdir()):
731
- if not run_dir.is_dir():
733
+ for snapshot_dir in sorted(extractor_dir.iterdir()):
734
+ if not snapshot_dir.is_dir():
732
735
  continue
733
- manifest_path = run_dir / "manifest.json"
736
+ manifest_path = snapshot_dir / "manifest.json"
734
737
  if not manifest_path.is_file():
735
738
  continue
736
739
  try:
737
- manifest = self.load_extraction_run_manifest(
740
+ manifest = self.load_extraction_snapshot_manifest(
738
741
  extractor_id=extractor_dir.name,
739
- run_id=run_dir.name,
742
+ snapshot_id=snapshot_dir.name,
740
743
  )
741
744
  except (FileNotFoundError, ValueError):
742
745
  continue
743
746
  entries.append(
744
- ExtractionRunListEntry(
747
+ ExtractionSnapshotListEntry(
745
748
  extractor_id=extractor_dir.name,
746
- run_id=run_dir.name,
747
- recipe_id=manifest.recipe.recipe_id,
748
- recipe_name=manifest.recipe.name,
749
+ snapshot_id=snapshot_dir.name,
750
+ configuration_id=manifest.configuration.configuration_id,
751
+ configuration_name=manifest.configuration.name,
749
752
  catalog_generated_at=manifest.catalog_generated_at,
750
753
  created_at=manifest.created_at,
751
754
  stats=dict(manifest.stats),
@@ -753,95 +756,100 @@ class Corpus:
753
756
  )
754
757
 
755
758
  entries.sort(
756
- key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True
759
+ key=lambda entry: (entry.created_at, entry.extractor_id, entry.snapshot_id),
760
+ reverse=True,
757
761
  )
758
762
  return entries
759
763
 
760
- def latest_extraction_run_reference(
764
+ def latest_extraction_snapshot_reference(
761
765
  self, *, extractor_id: Optional[str] = None
762
- ) -> Optional[ExtractionRunReference]:
766
+ ) -> Optional[ExtractionSnapshotReference]:
763
767
  """
764
- Return the most recent extraction run reference.
768
+ Return the most recent extraction snapshot reference.
765
769
 
766
770
  :param extractor_id: Optional extractor identifier filter.
767
771
  :type extractor_id: str or None
768
- :return: Latest extraction run reference or None when no runs exist.
769
- :rtype: biblicus.models.ExtractionRunReference or None
772
+ :return: Latest extraction snapshot reference or None when no snapshots exist.
773
+ :rtype: biblicus.models.ExtractionSnapshotReference or None
770
774
  """
771
- entries = self.list_extraction_runs(extractor_id=extractor_id)
775
+ entries = self.list_extraction_snapshots(extractor_id=extractor_id)
772
776
  if not entries:
773
777
  return None
774
778
  latest = entries[0]
775
- return ExtractionRunReference(extractor_id=latest.extractor_id, run_id=latest.run_id)
779
+ return ExtractionSnapshotReference(
780
+ extractor_id=latest.extractor_id, snapshot_id=latest.snapshot_id
781
+ )
776
782
 
777
- def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
783
+ def delete_extraction_snapshot(self, *, extractor_id: str, snapshot_id: str) -> None:
778
784
  """
779
- Delete an extraction run directory and its derived artifacts.
785
+ Delete an extraction snapshot directory and its derived artifacts.
780
786
 
781
787
  :param extractor_id: Extractor plugin identifier.
782
788
  :type extractor_id: str
783
- :param run_id: Extraction run identifier.
784
- :type run_id: str
789
+ :param snapshot_id: Extraction snapshot identifier.
790
+ :type snapshot_id: str
785
791
  :return: None.
786
792
  :rtype: None
787
- :raises FileNotFoundError: If the extraction run directory does not exist.
793
+ :raises FileNotFoundError: If the extraction snapshot directory does not exist.
788
794
  """
789
- run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
790
- if not run_dir.is_dir():
791
- raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
792
- shutil.rmtree(run_dir)
795
+ snapshot_dir = self.extraction_snapshot_dir(
796
+ extractor_id=extractor_id, snapshot_id=snapshot_id
797
+ )
798
+ if not snapshot_dir.is_dir():
799
+ raise FileNotFoundError(f"Missing extraction snapshot directory: {snapshot_dir}")
800
+ shutil.rmtree(snapshot_dir)
793
801
 
794
- def _ensure_runs_dir(self) -> None:
802
+ def _ensure_snapshots_dir(self) -> None:
795
803
  """
796
- Ensure the retrieval runs directory exists.
804
+ Ensure the retrieval snapshots directory exists.
797
805
 
798
806
  :return: None.
799
807
  :rtype: None
800
808
  """
801
- self.runs_dir.mkdir(parents=True, exist_ok=True)
809
+ self.snapshots_dir.mkdir(parents=True, exist_ok=True)
802
810
 
803
- def write_run(self, run: RetrievalRun) -> None:
811
+ def write_snapshot(self, snapshot: RetrievalSnapshot) -> None:
804
812
  """
805
- Persist a retrieval run manifest and update the catalog pointer.
813
+ Persist a retrieval snapshot manifest and update the catalog pointer.
806
814
 
807
- :param run: Run manifest to persist.
808
- :type run: RetrievalRun
815
+ :param snapshot: Snapshot manifest to persist.
816
+ :type snapshot: RetrievalSnapshot
809
817
  :return: None.
810
818
  :rtype: None
811
819
  """
812
- self._ensure_runs_dir()
813
- path = self.runs_dir / f"{run.run_id}.json"
814
- path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
820
+ self._ensure_snapshots_dir()
821
+ path = self.snapshots_dir / f"{snapshot.snapshot_id}.json"
822
+ path.write_text(snapshot.model_dump_json(indent=2) + "\n", encoding="utf-8")
815
823
  catalog = self._load_catalog()
816
- catalog.latest_run_id = run.run_id
824
+ catalog.latest_snapshot_id = snapshot.snapshot_id
817
825
  catalog.generated_at = utc_now_iso()
818
826
  self._write_catalog(catalog)
819
827
 
820
- def load_run(self, run_id: str) -> RetrievalRun:
828
+ def load_snapshot(self, snapshot_id: str) -> RetrievalSnapshot:
821
829
  """
822
- Load a retrieval run manifest by identifier.
830
+ Load a retrieval snapshot manifest by identifier.
823
831
 
824
- :param run_id: Run identifier.
825
- :type run_id: str
826
- :return: Parsed run manifest.
827
- :rtype: RetrievalRun
828
- :raises FileNotFoundError: If the run manifest does not exist.
832
+ :param snapshot_id: Snapshot identifier.
833
+ :type snapshot_id: str
834
+ :return: Parsed snapshot manifest.
835
+ :rtype: RetrievalSnapshot
836
+ :raises FileNotFoundError: If the snapshot manifest does not exist.
829
837
  """
830
- path = self.runs_dir / f"{run_id}.json"
838
+ path = self.snapshots_dir / f"{snapshot_id}.json"
831
839
  if not path.is_file():
832
- raise FileNotFoundError(f"Missing run manifest: {path}")
840
+ raise FileNotFoundError(f"Missing snapshot manifest: {path}")
833
841
  data = json.loads(path.read_text(encoding="utf-8"))
834
- return RetrievalRun.model_validate(data)
842
+ return RetrievalSnapshot.model_validate(data)
835
843
 
836
844
  @property
837
- def latest_run_id(self) -> Optional[str]:
845
+ def latest_snapshot_id(self) -> Optional[str]:
838
846
  """
839
- Latest retrieval run identifier recorded in the catalog.
847
+ Latest retrieval snapshot identifier recorded in the catalog.
840
848
 
841
- :return: Latest run identifier or None.
849
+ :return: Latest snapshot identifier or None.
842
850
  :rtype: str or None
843
851
  """
844
- return self._load_catalog().latest_run_id
852
+ return self._load_catalog().latest_snapshot_id
845
853
 
846
854
  def _upsert_catalog_item(self, item: CatalogItem) -> None:
847
855
  """
@@ -860,7 +868,7 @@ class Corpus:
860
868
  ordered_ids.insert(0, item.id)
861
869
  catalog.order = ordered_ids
862
870
  catalog.generated_at = utc_now_iso()
863
- catalog.latest_run_id = None
871
+ catalog.latest_snapshot_id = None
864
872
 
865
873
  self._write_catalog(catalog)
866
874
 
@@ -1621,7 +1629,7 @@ class Corpus:
1621
1629
  generated_at=utc_now_iso(),
1622
1630
  corpus_uri=normalize_corpus_uri(self.root),
1623
1631
  raw_dir=DEFAULT_RAW_DIR,
1624
- latest_run_id=None,
1632
+ latest_snapshot_id=None,
1625
1633
  items=new_items,
1626
1634
  order=order,
1627
1635
  )
@@ -1673,7 +1681,7 @@ class Corpus:
1673
1681
  generated_at=utc_now_iso(),
1674
1682
  corpus_uri=normalize_corpus_uri(self.root),
1675
1683
  raw_dir=DEFAULT_RAW_DIR,
1676
- latest_run_id=None,
1684
+ latest_snapshot_id=None,
1677
1685
  items={},
1678
1686
  order=[],
1679
1687
  )
biblicus/errors.py CHANGED
@@ -5,13 +5,13 @@ Error types for Biblicus.
5
5
  from __future__ import annotations
6
6
 
7
7
 
8
- class ExtractionRunFatalError(RuntimeError):
8
+ class ExtractionSnapshotFatalError(RuntimeError):
9
9
  """
10
- Fatal extraction run error that should abort the entire run.
10
+ Fatal extraction snapshot error that should abort the entire snapshot.
11
11
 
12
12
  This exception is used for conditions that indicate a configuration or environment problem
13
13
  rather than a per-item extraction failure. For example, a selection extractor that depends
14
- on referenced extraction run manifests treats missing manifests as fatal.
14
+ on referenced extraction snapshot manifests treats missing manifests as fatal.
15
15
  """
16
16
 
17
17
 
biblicus/evaluation.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """
2
- Evaluation utilities for Biblicus retrieval runs.
2
+ Evaluation utilities for Biblicus retrieval snapshots.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -11,10 +11,10 @@ from typing import Dict, List, Optional
11
11
 
12
12
  from pydantic import BaseModel, ConfigDict, Field, model_validator
13
13
 
14
- from .backends import get_backend
15
14
  from .constants import DATASET_SCHEMA_VERSION
16
15
  from .corpus import Corpus
17
- from .models import QueryBudget, RetrievalResult, RetrievalRun
16
+ from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
17
+ from .retrievers import get_retriever
18
18
  from .time import utc_now_iso
19
19
 
20
20
 
@@ -85,10 +85,10 @@ class EvaluationResult(BaseModel):
85
85
 
86
86
  :ivar dataset: Dataset metadata.
87
87
  :vartype dataset: dict[str, object]
88
- :ivar backend_id: Backend identifier.
89
- :vartype backend_id: str
90
- :ivar run_id: Retrieval run identifier.
91
- :vartype run_id: str
88
+ :ivar retriever_id: Retriever identifier.
89
+ :vartype retriever_id: str
90
+ :ivar snapshot_id: Retrieval snapshot identifier.
91
+ :vartype snapshot_id: str
92
92
  :ivar evaluated_at: International Organization for Standardization 8601 evaluation timestamp.
93
93
  :vartype evaluated_at: str
94
94
  :ivar metrics: Quality metrics for retrieval.
@@ -100,8 +100,8 @@ class EvaluationResult(BaseModel):
100
100
  model_config = ConfigDict(extra="forbid")
101
101
 
102
102
  dataset: Dict[str, object]
103
- backend_id: str
104
- run_id: str
103
+ retriever_id: str
104
+ snapshot_id: str
105
105
  evaluated_at: str
106
106
  metrics: Dict[str, float]
107
107
  system: Dict[str, float]
@@ -120,20 +120,20 @@ def load_dataset(path: Path) -> EvaluationDataset:
120
120
  return EvaluationDataset.model_validate(data)
121
121
 
122
122
 
123
- def evaluate_run(
123
+ def evaluate_snapshot(
124
124
  *,
125
125
  corpus: Corpus,
126
- run: RetrievalRun,
126
+ snapshot: RetrievalSnapshot,
127
127
  dataset: EvaluationDataset,
128
128
  budget: QueryBudget,
129
129
  ) -> EvaluationResult:
130
130
  """
131
- Evaluate a retrieval run against a dataset.
131
+ Evaluate a retrieval snapshot against a dataset.
132
132
 
133
- :param corpus: Corpus associated with the run.
133
+ :param corpus: Corpus associated with the snapshot.
134
134
  :type corpus: Corpus
135
- :param run: Retrieval run manifest.
136
- :type run: RetrievalRun
135
+ :param snapshot: Retrieval snapshot manifest.
136
+ :type snapshot: RetrievalSnapshot
137
137
  :param dataset: Evaluation dataset.
138
138
  :type dataset: EvaluationDataset
139
139
  :param budget: Evidence selection budget.
@@ -141,14 +141,16 @@ def evaluate_run(
141
141
  :return: Evaluation result bundle.
142
142
  :rtype: EvaluationResult
143
143
  """
144
- backend = get_backend(run.recipe.backend_id)
144
+ retriever = get_retriever(snapshot.configuration.retriever_id)
145
145
  latency_seconds: List[float] = []
146
146
  hit_count = 0
147
147
  reciprocal_ranks: List[float] = []
148
148
 
149
149
  for query in dataset.queries:
150
150
  timer_start = time.perf_counter()
151
- result = backend.query(corpus, run=run, query_text=query.query_text, budget=budget)
151
+ result = retriever.query(
152
+ corpus, snapshot=snapshot, query_text=query.query_text, budget=budget
153
+ )
152
154
  elapsed_seconds = time.perf_counter() - timer_start
153
155
  latency_seconds.append(elapsed_seconds)
154
156
  expected_rank = _expected_rank(result, query)
@@ -172,7 +174,7 @@ def evaluate_run(
172
174
  system = {
173
175
  "average_latency_milliseconds": _average_latency_milliseconds(latency_seconds),
174
176
  "percentile_95_latency_milliseconds": _percentile_95_latency_milliseconds(latency_seconds),
175
- "index_bytes": float(_run_artifact_bytes(corpus, run)),
177
+ "index_bytes": float(_snapshot_artifact_bytes(corpus, snapshot)),
176
178
  }
177
179
  dataset_meta = {
178
180
  "name": dataset.name,
@@ -181,8 +183,8 @@ def evaluate_run(
181
183
  }
182
184
  return EvaluationResult(
183
185
  dataset=dataset_meta,
184
- backend_id=run.recipe.backend_id,
185
- run_id=run.run_id,
186
+ retriever_id=snapshot.configuration.retriever_id,
187
+ snapshot_id=snapshot.snapshot_id,
186
188
  evaluated_at=utc_now_iso(),
187
189
  metrics=metrics,
188
190
  system=system,
@@ -238,19 +240,19 @@ def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
238
240
  return sorted_latencies[percentile_index] * 1000.0
239
241
 
240
242
 
241
- def _run_artifact_bytes(corpus: Corpus, run: RetrievalRun) -> int:
243
+ def _snapshot_artifact_bytes(corpus: Corpus, snapshot: RetrievalSnapshot) -> int:
242
244
  """
243
- Sum artifact sizes for a retrieval run.
245
+ Sum artifact sizes for a retrieval snapshot.
244
246
 
245
247
  :param corpus: Corpus that owns the artifacts.
246
248
  :type corpus: Corpus
247
- :param run: Retrieval run manifest.
248
- :type run: RetrievalRun
249
+ :param snapshot: Retrieval snapshot manifest.
250
+ :type snapshot: RetrievalSnapshot
249
251
  :return: Total artifact bytes.
250
252
  :rtype: int
251
253
  """
252
254
  total_bytes = 0
253
- for artifact_relpath in run.artifact_paths:
255
+ for artifact_relpath in snapshot.snapshot_artifacts:
254
256
  artifact_path = corpus.root / artifact_relpath
255
257
  if artifact_path.exists():
256
258
  total_bytes += artifact_path.stat().st_size