biblicus-0.16.0-py3-none-any.whl → biblicus-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py CHANGED
@@ -11,6 +11,7 @@ import shutil
11
11
  import uuid
12
12
  from pathlib import Path
13
13
  from typing import Any, Dict, List, Optional, Sequence
14
+ from urllib.parse import quote, unquote, urlparse
14
15
 
15
16
  import yaml
16
17
  from pydantic import ValidationError
@@ -19,11 +20,12 @@ from .constants import (
19
20
  ANALYSIS_RUNS_DIR_NAME,
20
21
  CORPUS_DIR_NAME,
21
22
  DEFAULT_RAW_DIR,
22
- EXTRACTION_RUNS_DIR_NAME,
23
- RUNS_DIR_NAME,
23
+ EXTRACTION_SNAPSHOTS_DIR_NAME,
24
24
  SCHEMA_VERSION,
25
25
  SIDECAR_SUFFIX,
26
+ SNAPSHOTS_DIR_NAME,
26
27
  )
28
+ from .errors import IngestCollisionError
27
29
  from .frontmatter import parse_front_matter, render_front_matter
28
30
  from .hook_manager import HookManager
29
31
  from .hooks import HookPoint
@@ -32,10 +34,10 @@ from .models import (
32
34
  CatalogItem,
33
35
  CorpusCatalog,
34
36
  CorpusConfig,
35
- ExtractionRunListEntry,
36
- ExtractionRunReference,
37
+ ExtractionSnapshotListEntry,
38
+ ExtractionSnapshotReference,
37
39
  IngestResult,
38
- RetrievalRun,
40
+ RetrievalSnapshot,
39
41
  )
40
42
  from .sources import load_source
41
43
  from .time import utc_now_iso
@@ -110,7 +112,10 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
110
112
  """
111
113
  media_type_overrides = {
112
114
  "image/jpeg": ".jpg",
115
+ "audio/mpeg": ".mp3",
113
116
  "audio/ogg": ".ogg",
117
+ "audio/wav": ".wav",
118
+ "audio/x-wav": ".wav",
114
119
  }
115
120
  if media_type in media_type_overrides:
116
121
  return media_type_overrides[media_type]
@@ -136,7 +141,16 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
136
141
  return raw_name + ".md"
137
142
 
138
143
  if Path(raw_name).suffix:
139
- return raw_name
144
+ if "%2F" in raw_name or "%3A" in raw_name:
145
+ decoded = unquote(raw_name)
146
+ parsed = urlparse(decoded)
147
+ decoded_path = parsed.path if parsed.scheme else decoded
148
+ if not Path(decoded_path).suffix:
149
+ pass
150
+ else:
151
+ return raw_name
152
+ else:
153
+ return raw_name
140
154
 
141
155
  ext = _preferred_extension_for_media_type(media_type)
142
156
  if not ext:
@@ -144,6 +158,55 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
144
158
  return raw_name + ext
145
159
 
146
160
 
161
+ def _encode_source_uri_for_filename(source_uri: str) -> str:
162
+ """
163
+ Percent-encode a source uniform resource identifier for filename use.
164
+
165
+ :param source_uri: Source uniform resource identifier to encode.
166
+ :type source_uri: str
167
+ :return: Percent-encoded uniform resource identifier safe for filenames.
168
+ :rtype: str
169
+ """
170
+ return quote(source_uri, safe="")
171
+
172
+
173
+ def _storage_filename_for_ingest(
174
+ *, filename: Optional[str], media_type: str, source_uri: Optional[str]
175
+ ) -> str:
176
+ """
177
+ Derive a collision-safe filename for corpus storage.
178
+
179
+ If a source uniform resource identifier is provided, the full uniform resource identifier is
180
+ percent-encoded to namespace the stored file, preventing collisions between identical basenames
181
+ from different sources. When no uniform resource identifier is available, fall back to a
182
+ sanitized filename.
183
+
184
+ :param filename: Optional filename hint from the caller.
185
+ :type filename: str or None
186
+ :param media_type: Media type of the payload.
187
+ :type media_type: str
188
+ :param source_uri: Optional source uniform resource identifier for provenance.
189
+ :type source_uri: str or None
190
+ :return: Storage filename with an appropriate extension, or an empty string when no hint exists.
191
+ :rtype: str
192
+ """
193
+ base_name = ""
194
+ if source_uri:
195
+ base_name = _encode_source_uri_for_filename(source_uri)
196
+ if filename and not source_uri.startswith("file:"):
197
+ sanitized = _sanitize_filename(filename)
198
+ if sanitized:
199
+ base_name = f"{base_name}--{sanitized}"
200
+ if not base_name and filename:
201
+ base_name = _sanitize_filename(filename)
202
+ if not base_name:
203
+ return ""
204
+ if len(base_name) > 180:
205
+ digest = hashlib.sha256(base_name.encode("utf-8")).hexdigest()
206
+ base_name = f"hash-{digest}"
207
+ return _ensure_filename_extension(base_name, media_type=media_type)
208
+
209
+
147
210
  def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
148
211
  """
149
212
  Merge tags from explicit input and front matter values.
@@ -476,7 +539,7 @@ class Corpus:
476
539
  generated_at=utc_now_iso(),
477
540
  corpus_uri=normalize_corpus_uri(self.root),
478
541
  raw_dir=DEFAULT_RAW_DIR,
479
- latest_run_id=None,
542
+ latest_snapshot_id=None,
480
543
  items={},
481
544
  order=[],
482
545
  )
@@ -520,70 +583,90 @@ class Corpus:
520
583
  temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
521
584
  temp_path.replace(self.catalog_path)
522
585
 
586
+ def _find_item_by_source_uri(self, source_uri: str) -> Optional[CatalogItem]:
587
+ """
588
+ Locate an existing catalog item by source uniform resource identifier.
589
+
590
+ :param source_uri: Source uniform resource identifier to search for.
591
+ :type source_uri: str
592
+ :return: Matching catalog item or None.
593
+ :rtype: CatalogItem or None
594
+ """
595
+ if not source_uri:
596
+ return None
597
+ self._init_catalog()
598
+ catalog = self._load_catalog()
599
+ for item in catalog.items.values():
600
+ if item.source_uri == source_uri:
601
+ return item
602
+ return None
603
+
523
604
  @property
524
- def runs_dir(self) -> Path:
605
+ def snapshots_dir(self) -> Path:
525
606
  """
526
- Location of retrieval run manifests.
607
+ Location of retrieval snapshot manifests.
527
608
 
528
- :return: Path to the runs directory.
609
+ :return: Path to the snapshots directory.
529
610
  :rtype: Path
530
611
  """
531
- return self.meta_dir / RUNS_DIR_NAME
612
+ return self.meta_dir / SNAPSHOTS_DIR_NAME
532
613
 
533
614
  @property
534
- def extraction_runs_dir(self) -> Path:
615
+ def extraction_snapshots_dir(self) -> Path:
535
616
  """
536
- Location of extraction run artifacts.
617
+ Location of extraction snapshot artifacts.
537
618
 
538
- :return: Path to the extraction runs directory.
619
+ :return: Path to the extraction snapshots directory.
539
620
  :rtype: Path
540
621
  """
541
- return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
622
+ return self.snapshots_dir / EXTRACTION_SNAPSHOTS_DIR_NAME
542
623
 
543
624
  @property
544
625
  def analysis_runs_dir(self) -> Path:
545
626
  """
546
- Location of analysis run artifacts.
627
+ Location of analysis snapshot artifacts.
547
628
 
548
- :return: Path to the analysis runs directory.
629
+ :return: Path to the analysis snapshots directory.
549
630
  :rtype: Path
550
631
  """
551
- return self.runs_dir / ANALYSIS_RUNS_DIR_NAME
632
+ return self.snapshots_dir / ANALYSIS_RUNS_DIR_NAME
552
633
 
553
- def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
634
+ def extraction_snapshot_dir(self, *, extractor_id: str, snapshot_id: str) -> Path:
554
635
  """
555
- Resolve an extraction run directory.
636
+ Resolve an extraction snapshot directory.
556
637
 
557
638
  :param extractor_id: Extractor plugin identifier.
558
639
  :type extractor_id: str
559
- :param run_id: Extraction run identifier.
560
- :type run_id: str
561
- :return: Extraction run directory.
640
+ :param snapshot_id: Extraction snapshot identifier.
641
+ :type snapshot_id: str
642
+ :return: Extraction snapshot directory.
562
643
  :rtype: Path
563
644
  """
564
- return self.extraction_runs_dir / extractor_id / run_id
645
+ return self.extraction_snapshots_dir / extractor_id / snapshot_id
565
646
 
566
- def analysis_run_dir(self, *, analysis_id: str, run_id: str) -> Path:
647
+ def analysis_run_dir(self, *, analysis_id: str, snapshot_id: str) -> Path:
567
648
  """
568
- Resolve an analysis run directory.
649
+ Resolve an analysis snapshot directory.
569
650
 
570
651
  :param analysis_id: Analysis backend identifier.
571
652
  :type analysis_id: str
572
- :param run_id: Analysis run identifier.
573
- :type run_id: str
574
- :return: Analysis run directory.
653
+ :param snapshot_id: Analysis snapshot identifier.
654
+ :type snapshot_id: str
655
+ :return: Analysis snapshot directory.
575
656
  :rtype: Path
576
657
  """
577
- return self.analysis_runs_dir / analysis_id / run_id
658
+ return self.analysis_runs_dir / analysis_id / snapshot_id
578
659
 
579
- def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
660
+ def read_extracted_text(
661
+ self, *, extractor_id: str, snapshot_id: str, item_id: str
662
+ ) -> Optional[str]:
580
663
  """
581
- Read extracted text for an item from an extraction run, when present.
664
+ Read extracted text for an item from an extraction snapshot, when present.
582
665
 
583
666
  :param extractor_id: Extractor plugin identifier.
584
667
  :type extractor_id: str
585
- :param run_id: Extraction run identifier.
586
- :type run_id: str
668
+ :param snapshot_id: Extraction snapshot identifier.
669
+ :type snapshot_id: str
587
670
  :param item_id: Item identifier.
588
671
  :type item_id: str
589
672
  :return: Extracted text or None if the artifact does not exist.
@@ -591,7 +674,7 @@ class Corpus:
591
674
  :raises OSError: If the file exists but cannot be read.
592
675
  """
593
676
  path = (
594
- self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
677
+ self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
595
678
  / "text"
596
679
  / f"{item_id}.txt"
597
680
  )
@@ -599,72 +682,73 @@ class Corpus:
599
682
  return None
600
683
  return path.read_text(encoding="utf-8")
601
684
 
602
- def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
685
+ def load_extraction_snapshot_manifest(self, *, extractor_id: str, snapshot_id: str):
603
686
  """
604
- Load an extraction run manifest from the corpus.
687
+ Load an extraction snapshot manifest from the corpus.
605
688
 
606
689
  :param extractor_id: Extractor plugin identifier.
607
690
  :type extractor_id: str
608
- :param run_id: Extraction run identifier.
609
- :type run_id: str
610
- :return: Parsed extraction run manifest.
611
- :rtype: biblicus.extraction.ExtractionRunManifest
691
+ :param snapshot_id: Extraction snapshot identifier.
692
+ :type snapshot_id: str
693
+ :return: Parsed extraction snapshot manifest.
694
+ :rtype: biblicus.extraction.ExtractionSnapshotManifest
612
695
  :raises FileNotFoundError: If the manifest file does not exist.
613
696
  :raises ValueError: If the manifest data is invalid.
614
697
  """
615
- from .extraction import ExtractionRunManifest
698
+ from .extraction import ExtractionSnapshotManifest
616
699
 
617
700
  manifest_path = (
618
- self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
701
+ self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
702
+ / "manifest.json"
619
703
  )
620
704
  if not manifest_path.is_file():
621
- raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
705
+ raise FileNotFoundError(f"Missing extraction snapshot manifest: {manifest_path}")
622
706
  data = json.loads(manifest_path.read_text(encoding="utf-8"))
623
- return ExtractionRunManifest.model_validate(data)
707
+ return ExtractionSnapshotManifest.model_validate(data)
624
708
 
625
- def list_extraction_runs(
709
+ def list_extraction_snapshots(
626
710
  self, *, extractor_id: Optional[str] = None
627
- ) -> List[ExtractionRunListEntry]:
711
+ ) -> List[ExtractionSnapshotListEntry]:
628
712
  """
629
- List extraction runs stored under the corpus.
713
+ List extraction snapshots stored under the corpus.
630
714
 
631
715
  :param extractor_id: Optional extractor identifier filter.
632
716
  :type extractor_id: str or None
633
- :return: Summary list entries for each run.
634
- :rtype: list[biblicus.models.ExtractionRunListEntry]
717
+ :return: Summary list entries for each snapshot.
718
+ :rtype: list[biblicus.models.ExtractionSnapshotListEntry]
635
719
  """
636
- runs_root = self.extraction_runs_dir
637
- if not runs_root.is_dir():
720
+ snapshots_root = self.extraction_snapshots_dir
721
+ if not snapshots_root.is_dir():
638
722
  return []
639
723
 
640
724
  extractor_dirs: List[Path]
641
725
  if extractor_id is None:
642
- extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
726
+ extractor_dirs = [path for path in sorted(snapshots_root.iterdir()) if path.is_dir()]
643
727
  else:
644
- extractor_path = runs_root / extractor_id
728
+ extractor_path = snapshots_root / extractor_id
645
729
  extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
646
730
 
647
- entries: List[ExtractionRunListEntry] = []
731
+ entries: List[ExtractionSnapshotListEntry] = []
648
732
  for extractor_dir in extractor_dirs:
649
- for run_dir in sorted(extractor_dir.iterdir()):
650
- if not run_dir.is_dir():
733
+ for snapshot_dir in sorted(extractor_dir.iterdir()):
734
+ if not snapshot_dir.is_dir():
651
735
  continue
652
- manifest_path = run_dir / "manifest.json"
736
+ manifest_path = snapshot_dir / "manifest.json"
653
737
  if not manifest_path.is_file():
654
738
  continue
655
739
  try:
656
- manifest = self.load_extraction_run_manifest(
740
+ manifest = self.load_extraction_snapshot_manifest(
657
741
  extractor_id=extractor_dir.name,
658
- run_id=run_dir.name,
742
+ snapshot_id=snapshot_dir.name,
659
743
  )
660
744
  except (FileNotFoundError, ValueError):
661
745
  continue
662
746
  entries.append(
663
- ExtractionRunListEntry(
747
+ ExtractionSnapshotListEntry(
664
748
  extractor_id=extractor_dir.name,
665
- run_id=run_dir.name,
666
- recipe_id=manifest.recipe.recipe_id,
667
- recipe_name=manifest.recipe.name,
749
+ snapshot_id=snapshot_dir.name,
750
+ configuration_id=manifest.configuration.configuration_id,
751
+ configuration_name=manifest.configuration.name,
668
752
  catalog_generated_at=manifest.catalog_generated_at,
669
753
  created_at=manifest.created_at,
670
754
  stats=dict(manifest.stats),
@@ -672,95 +756,100 @@ class Corpus:
672
756
  )
673
757
 
674
758
  entries.sort(
675
- key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True
759
+ key=lambda entry: (entry.created_at, entry.extractor_id, entry.snapshot_id),
760
+ reverse=True,
676
761
  )
677
762
  return entries
678
763
 
679
- def latest_extraction_run_reference(
764
+ def latest_extraction_snapshot_reference(
680
765
  self, *, extractor_id: Optional[str] = None
681
- ) -> Optional[ExtractionRunReference]:
766
+ ) -> Optional[ExtractionSnapshotReference]:
682
767
  """
683
- Return the most recent extraction run reference.
768
+ Return the most recent extraction snapshot reference.
684
769
 
685
770
  :param extractor_id: Optional extractor identifier filter.
686
771
  :type extractor_id: str or None
687
- :return: Latest extraction run reference or None when no runs exist.
688
- :rtype: biblicus.models.ExtractionRunReference or None
772
+ :return: Latest extraction snapshot reference or None when no snapshots exist.
773
+ :rtype: biblicus.models.ExtractionSnapshotReference or None
689
774
  """
690
- entries = self.list_extraction_runs(extractor_id=extractor_id)
775
+ entries = self.list_extraction_snapshots(extractor_id=extractor_id)
691
776
  if not entries:
692
777
  return None
693
778
  latest = entries[0]
694
- return ExtractionRunReference(extractor_id=latest.extractor_id, run_id=latest.run_id)
779
+ return ExtractionSnapshotReference(
780
+ extractor_id=latest.extractor_id, snapshot_id=latest.snapshot_id
781
+ )
695
782
 
696
- def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
783
+ def delete_extraction_snapshot(self, *, extractor_id: str, snapshot_id: str) -> None:
697
784
  """
698
- Delete an extraction run directory and its derived artifacts.
785
+ Delete an extraction snapshot directory and its derived artifacts.
699
786
 
700
787
  :param extractor_id: Extractor plugin identifier.
701
788
  :type extractor_id: str
702
- :param run_id: Extraction run identifier.
703
- :type run_id: str
789
+ :param snapshot_id: Extraction snapshot identifier.
790
+ :type snapshot_id: str
704
791
  :return: None.
705
792
  :rtype: None
706
- :raises FileNotFoundError: If the extraction run directory does not exist.
793
+ :raises FileNotFoundError: If the extraction snapshot directory does not exist.
707
794
  """
708
- run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
709
- if not run_dir.is_dir():
710
- raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
711
- shutil.rmtree(run_dir)
795
+ snapshot_dir = self.extraction_snapshot_dir(
796
+ extractor_id=extractor_id, snapshot_id=snapshot_id
797
+ )
798
+ if not snapshot_dir.is_dir():
799
+ raise FileNotFoundError(f"Missing extraction snapshot directory: {snapshot_dir}")
800
+ shutil.rmtree(snapshot_dir)
712
801
 
713
- def _ensure_runs_dir(self) -> None:
802
+ def _ensure_snapshots_dir(self) -> None:
714
803
  """
715
- Ensure the retrieval runs directory exists.
804
+ Ensure the retrieval snapshots directory exists.
716
805
 
717
806
  :return: None.
718
807
  :rtype: None
719
808
  """
720
- self.runs_dir.mkdir(parents=True, exist_ok=True)
809
+ self.snapshots_dir.mkdir(parents=True, exist_ok=True)
721
810
 
722
- def write_run(self, run: RetrievalRun) -> None:
811
+ def write_snapshot(self, snapshot: RetrievalSnapshot) -> None:
723
812
  """
724
- Persist a retrieval run manifest and update the catalog pointer.
813
+ Persist a retrieval snapshot manifest and update the catalog pointer.
725
814
 
726
- :param run: Run manifest to persist.
727
- :type run: RetrievalRun
815
+ :param snapshot: Snapshot manifest to persist.
816
+ :type snapshot: RetrievalSnapshot
728
817
  :return: None.
729
818
  :rtype: None
730
819
  """
731
- self._ensure_runs_dir()
732
- path = self.runs_dir / f"{run.run_id}.json"
733
- path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
820
+ self._ensure_snapshots_dir()
821
+ path = self.snapshots_dir / f"{snapshot.snapshot_id}.json"
822
+ path.write_text(snapshot.model_dump_json(indent=2) + "\n", encoding="utf-8")
734
823
  catalog = self._load_catalog()
735
- catalog.latest_run_id = run.run_id
824
+ catalog.latest_snapshot_id = snapshot.snapshot_id
736
825
  catalog.generated_at = utc_now_iso()
737
826
  self._write_catalog(catalog)
738
827
 
739
- def load_run(self, run_id: str) -> RetrievalRun:
828
+ def load_snapshot(self, snapshot_id: str) -> RetrievalSnapshot:
740
829
  """
741
- Load a retrieval run manifest by identifier.
830
+ Load a retrieval snapshot manifest by identifier.
742
831
 
743
- :param run_id: Run identifier.
744
- :type run_id: str
745
- :return: Parsed run manifest.
746
- :rtype: RetrievalRun
747
- :raises FileNotFoundError: If the run manifest does not exist.
832
+ :param snapshot_id: Snapshot identifier.
833
+ :type snapshot_id: str
834
+ :return: Parsed snapshot manifest.
835
+ :rtype: RetrievalSnapshot
836
+ :raises FileNotFoundError: If the snapshot manifest does not exist.
748
837
  """
749
- path = self.runs_dir / f"{run_id}.json"
838
+ path = self.snapshots_dir / f"{snapshot_id}.json"
750
839
  if not path.is_file():
751
- raise FileNotFoundError(f"Missing run manifest: {path}")
840
+ raise FileNotFoundError(f"Missing snapshot manifest: {path}")
752
841
  data = json.loads(path.read_text(encoding="utf-8"))
753
- return RetrievalRun.model_validate(data)
842
+ return RetrievalSnapshot.model_validate(data)
754
843
 
755
844
  @property
756
- def latest_run_id(self) -> Optional[str]:
845
+ def latest_snapshot_id(self) -> Optional[str]:
757
846
  """
758
- Latest retrieval run identifier recorded in the catalog.
847
+ Latest retrieval snapshot identifier recorded in the catalog.
759
848
 
760
- :return: Latest run identifier or None.
849
+ :return: Latest snapshot identifier or None.
761
850
  :rtype: str or None
762
851
  """
763
- return self._load_catalog().latest_run_id
852
+ return self._load_catalog().latest_snapshot_id
764
853
 
765
854
  def _upsert_catalog_item(self, item: CatalogItem) -> None:
766
855
  """
@@ -779,7 +868,7 @@ class Corpus:
779
868
  ordered_ids.insert(0, item.id)
780
869
  catalog.order = ordered_ids
781
870
  catalog.generated_at = utc_now_iso()
782
- catalog.latest_run_id = None
871
+ catalog.latest_snapshot_id = None
783
872
 
784
873
  self._write_catalog(catalog)
785
874
 
@@ -817,18 +906,26 @@ class Corpus:
817
906
  :return: Ingestion result summary.
818
907
  :rtype: IngestResult
819
908
  :raises ValueError: If markdown is not Unicode Transformation Format 8.
909
+ :raises IngestCollisionError: If a source uniform resource identifier is already ingested.
820
910
  """
821
- item_id = str(uuid.uuid4())
822
- safe_filename = _sanitize_filename(filename) if filename else ""
911
+ existing_item = self._find_item_by_source_uri(source_uri)
912
+ if existing_item is not None:
913
+ raise IngestCollisionError(
914
+ source_uri=source_uri,
915
+ existing_item_id=existing_item.id,
916
+ existing_relpath=existing_item.relpath,
917
+ )
823
918
 
824
- if safe_filename:
825
- safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
919
+ item_id = str(uuid.uuid4())
920
+ storage_filename = _storage_filename_for_ingest(
921
+ filename=filename, media_type=media_type, source_uri=source_uri
922
+ )
826
923
 
827
924
  if media_type == "text/markdown":
828
- output_name = f"{item_id}--{safe_filename}" if safe_filename else f"{item_id}.md"
925
+ output_name = f"{item_id}--{storage_filename}" if storage_filename else f"{item_id}.md"
829
926
  else:
830
- if safe_filename:
831
- output_name = f"{item_id}--{safe_filename}"
927
+ if storage_filename:
928
+ output_name = f"{item_id}--{storage_filename}"
832
929
  else:
833
930
  extension = _preferred_extension_for_media_type(media_type) or ""
834
931
  output_name = f"{item_id}{extension}" if extension else f"{item_id}"
@@ -991,13 +1088,21 @@ class Corpus:
991
1088
  if media_type == "text/markdown":
992
1089
  raise ValueError("Stream ingestion is not supported for Markdown")
993
1090
 
1091
+ existing_item = self._find_item_by_source_uri(source_uri)
1092
+ if existing_item is not None:
1093
+ raise IngestCollisionError(
1094
+ source_uri=source_uri,
1095
+ existing_item_id=existing_item.id,
1096
+ existing_relpath=existing_item.relpath,
1097
+ )
1098
+
994
1099
  item_id = str(uuid.uuid4())
995
- safe_filename = _sanitize_filename(filename) if filename else ""
996
- if safe_filename:
997
- safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
1100
+ storage_filename = _storage_filename_for_ingest(
1101
+ filename=filename, media_type=media_type, source_uri=source_uri
1102
+ )
998
1103
 
999
- if safe_filename:
1000
- output_name = f"{item_id}--{safe_filename}"
1104
+ if storage_filename:
1105
+ output_name = f"{item_id}--{storage_filename}"
1001
1106
  else:
1002
1107
  extension = _preferred_extension_for_media_type(media_type) or ""
1003
1108
  output_name = f"{item_id}{extension}" if extension else f"{item_id}"
@@ -1085,7 +1190,7 @@ class Corpus:
1085
1190
  *,
1086
1191
  title: Optional[str] = None,
1087
1192
  tags: Sequence[str] = (),
1088
- source_uri: str = "text",
1193
+ source_uri: Optional[str] = None,
1089
1194
  ) -> IngestResult:
1090
1195
  """
1091
1196
  Ingest a text note as Markdown.
@@ -1096,11 +1201,15 @@ class Corpus:
1096
1201
  :type title: str or None
1097
1202
  :param tags: Tags to associate with the note.
1098
1203
  :type tags: Sequence[str]
1099
- :param source_uri: Source uniform resource identifier for provenance.
1100
- :type source_uri: str
1204
+ :param source_uri: Optional source uniform resource identifier for provenance.
1205
+ :type source_uri: str or None
1101
1206
  :return: Ingestion result summary.
1102
1207
  :rtype: IngestResult
1103
1208
  """
1209
+ if source_uri is None:
1210
+ digest_source = (title or "") + "\n" + text
1211
+ digest = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()
1212
+ source_uri = f"text:{digest}"
1104
1213
  data = text.encode("utf-8")
1105
1214
  return self.ingest_item(
1106
1215
  data,
@@ -1520,7 +1629,7 @@ class Corpus:
1520
1629
  generated_at=utc_now_iso(),
1521
1630
  corpus_uri=normalize_corpus_uri(self.root),
1522
1631
  raw_dir=DEFAULT_RAW_DIR,
1523
- latest_run_id=None,
1632
+ latest_snapshot_id=None,
1524
1633
  items=new_items,
1525
1634
  order=order,
1526
1635
  )
@@ -1572,7 +1681,7 @@ class Corpus:
1572
1681
  generated_at=utc_now_iso(),
1573
1682
  corpus_uri=normalize_corpus_uri(self.root),
1574
1683
  raw_dir=DEFAULT_RAW_DIR,
1575
- latest_run_id=None,
1684
+ latest_snapshot_id=None,
1576
1685
  items={},
1577
1686
  order=[],
1578
1687
  )
biblicus/errors.py CHANGED
@@ -5,11 +5,35 @@ Error types for Biblicus.
5
5
  from __future__ import annotations
6
6
 
7
7
 
8
- class ExtractionRunFatalError(RuntimeError):
8
+ class ExtractionSnapshotFatalError(RuntimeError):
9
9
  """
10
- Fatal extraction run error that should abort the entire run.
10
+ Fatal extraction snapshot error that should abort the entire snapshot.
11
11
 
12
12
  This exception is used for conditions that indicate a configuration or environment problem
13
13
  rather than a per-item extraction failure. For example, a selection extractor that depends
14
- on referenced extraction run manifests treats missing manifests as fatal.
14
+ on referenced extraction snapshot manifests treats missing manifests as fatal.
15
15
  """
16
+
17
+
18
+ class IngestCollisionError(RuntimeError):
19
+ """
20
+ Ingest collision for an already ingested source.
21
+
22
+ :param source_uri: Source uniform resource identifier that caused the collision.
23
+ :type source_uri: str
24
+ :param existing_item_id: Identifier of the existing catalog item.
25
+ :type existing_item_id: str
26
+ :param existing_relpath: Raw storage relpath of the existing item.
27
+ :type existing_relpath: str
28
+ """
29
+
30
+ def __init__(self, *, source_uri: str, existing_item_id: str, existing_relpath: str) -> None:
31
+ self.source_uri = source_uri
32
+ self.existing_item_id = existing_item_id
33
+ self.existing_relpath = existing_relpath
34
+ message = (
35
+ "Source already ingested"
36
+ f": source_uri={source_uri} existing_item_id={existing_item_id}"
37
+ f" existing_relpath={existing_relpath}"
38
+ )
39
+ super().__init__(message)