PyPI - biblicus - Versions diffs - 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

biblicus/__init__.py +25 -5
biblicus/analysis/__init__.py +1 -1
biblicus/analysis/base.py +10 -10
biblicus/analysis/markov.py +78 -68
biblicus/analysis/models.py +47 -47
biblicus/analysis/profiling.py +58 -48
biblicus/analysis/topic_modeling.py +56 -51
biblicus/cli.py +248 -191
biblicus/{recipes.py → configuration.py} +14 -14
biblicus/constants.py +2 -2
biblicus/context.py +27 -12
biblicus/context_engine/__init__.py +53 -0
biblicus/context_engine/assembler.py +1090 -0
biblicus/context_engine/compaction.py +110 -0
biblicus/context_engine/models.py +423 -0
biblicus/context_engine/retrieval.py +133 -0
biblicus/corpus.py +233 -124
biblicus/errors.py +27 -3
biblicus/evaluation.py +27 -25
biblicus/extraction.py +103 -98
biblicus/extraction_evaluation.py +26 -26
biblicus/extractors/deepgram_stt.py +7 -7
biblicus/extractors/docling_granite_text.py +11 -11
biblicus/extractors/docling_smol_text.py +11 -11
biblicus/extractors/markitdown_text.py +4 -4
biblicus/extractors/openai_stt.py +7 -7
biblicus/extractors/paddleocr_vl_text.py +20 -18
biblicus/extractors/pipeline.py +8 -8
biblicus/extractors/rapidocr_text.py +3 -3
biblicus/extractors/unstructured_text.py +3 -3
biblicus/hooks.py +4 -4
biblicus/knowledge_base.py +34 -32
biblicus/models.py +84 -81
biblicus/retrieval.py +49 -42
biblicus/retrievers/__init__.py +50 -0
biblicus/retrievers/base.py +65 -0
biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
biblicus/retrievers/hybrid.py +301 -0
biblicus/{backends → retrievers}/scan.py +84 -73
biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
biblicus/{backends → retrievers}/tf_vector.py +103 -100
biblicus/sources.py +46 -11
biblicus/text/link.py +6 -0
biblicus/text/prompts.py +18 -8
biblicus/text/tool_loop.py +63 -5
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
biblicus-1.1.0.dist-info/RECORD +91 -0
biblicus/backends/__init__.py +0 -50
biblicus/backends/base.py +0 -65
biblicus/backends/hybrid.py +0 -291
biblicus-0.16.0.dist-info/RECORD +0 -86
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0

biblicus/corpus.py CHANGED Viewed

@@ -11,6 +11,7 @@ import shutil
 import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence
+from urllib.parse import quote, unquote, urlparse
 import yaml
 from pydantic import ValidationError
@@ -19,11 +20,12 @@ from .constants import (
     ANALYSIS_RUNS_DIR_NAME,
     CORPUS_DIR_NAME,
     DEFAULT_RAW_DIR,
-    EXTRACTION_RUNS_DIR_NAME,
-    RUNS_DIR_NAME,
+    EXTRACTION_SNAPSHOTS_DIR_NAME,
     SCHEMA_VERSION,
     SIDECAR_SUFFIX,
+    SNAPSHOTS_DIR_NAME,
 )
+from .errors import IngestCollisionError
 from .frontmatter import parse_front_matter, render_front_matter
 from .hook_manager import HookManager
 from .hooks import HookPoint
@@ -32,10 +34,10 @@ from .models import (
     CatalogItem,
     CorpusCatalog,
     CorpusConfig,
-    ExtractionRunListEntry,
-    ExtractionRunReference,
+    ExtractionSnapshotListEntry,
+    ExtractionSnapshotReference,
     IngestResult,
-    RetrievalRun,
+    RetrievalSnapshot,
 )
 from .sources import load_source
 from .time import utc_now_iso
@@ -110,7 +112,10 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
     """
     media_type_overrides = {
         "image/jpeg": ".jpg",
+        "audio/mpeg": ".mp3",
         "audio/ogg": ".ogg",
+        "audio/wav": ".wav",
+        "audio/x-wav": ".wav",
     }
     if media_type in media_type_overrides:
         return media_type_overrides[media_type]
@@ -136,7 +141,16 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
         return raw_name + ".md"
     if Path(raw_name).suffix:
-        return raw_name
+        if "%2F" in raw_name or "%3A" in raw_name:
+            decoded = unquote(raw_name)
+            parsed = urlparse(decoded)
+            decoded_path = parsed.path if parsed.scheme else decoded
+            if not Path(decoded_path).suffix:
+                pass
+            else:
+                return raw_name
+        else:
+            return raw_name
     ext = _preferred_extension_for_media_type(media_type)
     if not ext:
@@ -144,6 +158,55 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
     return raw_name + ext
+def _encode_source_uri_for_filename(source_uri: str) -> str:
+    """
+    Percent-encode a source uniform resource identifier for filename use.
+    :param source_uri: Source uniform resource identifier to encode.
+    :type source_uri: str
+    :return: Percent-encoded uniform resource identifier safe for filenames.
+    :rtype: str
+    """
+    return quote(source_uri, safe="")
+def _storage_filename_for_ingest(
+    *, filename: Optional[str], media_type: str, source_uri: Optional[str]
+) -> str:
+    """
+    Derive a collision-safe filename for corpus storage.
+    If a source uniform resource identifier is provided, the full uniform resource identifier is
+    percent-encoded to namespace the stored file, preventing collisions between identical basenames
+    from different sources. When no uniform resource identifier is available, fall back to a
+    sanitized filename.
+    :param filename: Optional filename hint from the caller.
+    :type filename: str or None
+    :param media_type: Media type of the payload.
+    :type media_type: str
+    :param source_uri: Optional source uniform resource identifier for provenance.
+    :type source_uri: str or None
+    :return: Storage filename with an appropriate extension, or an empty string when no hint exists.
+    :rtype: str
+    """
+    base_name = ""
+    if source_uri:
+        base_name = _encode_source_uri_for_filename(source_uri)
+        if filename and not source_uri.startswith("file:"):
+            sanitized = _sanitize_filename(filename)
+            if sanitized:
+                base_name = f"{base_name}--{sanitized}"
+    if not base_name and filename:
+        base_name = _sanitize_filename(filename)
+    if not base_name:
+        return ""
+    if len(base_name) > 180:
+        digest = hashlib.sha256(base_name.encode("utf-8")).hexdigest()
+        base_name = f"hash-{digest}"
+    return _ensure_filename_extension(base_name, media_type=media_type)
 def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
     """
     Merge tags from explicit input and front matter values.
@@ -476,7 +539,7 @@ class Corpus:
             generated_at=utc_now_iso(),
             corpus_uri=normalize_corpus_uri(self.root),
             raw_dir=DEFAULT_RAW_DIR,
-            latest_run_id=None,
+            latest_snapshot_id=None,
             items={},
             order=[],
         )
@@ -520,70 +583,90 @@ class Corpus:
         temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
         temp_path.replace(self.catalog_path)
+    def _find_item_by_source_uri(self, source_uri: str) -> Optional[CatalogItem]:
+        """
+        Locate an existing catalog item by source uniform resource identifier.
+        :param source_uri: Source uniform resource identifier to search for.
+        :type source_uri: str
+        :return: Matching catalog item or None.
+        :rtype: CatalogItem or None
+        """
+        if not source_uri:
+            return None
+        self._init_catalog()
+        catalog = self._load_catalog()
+        for item in catalog.items.values():
+            if item.source_uri == source_uri:
+                return item
+        return None
     @property
-    def runs_dir(self) -> Path:
+    def snapshots_dir(self) -> Path:
         """
-        Location of retrieval run manifests.
+        Location of retrieval snapshot manifests.
-        :return: Path to the runs directory.
+        :return: Path to the snapshots directory.
         :rtype: Path
         """
-        return self.meta_dir / RUNS_DIR_NAME
+        return self.meta_dir / SNAPSHOTS_DIR_NAME
     @property
-    def extraction_runs_dir(self) -> Path:
+    def extraction_snapshots_dir(self) -> Path:
         """
-        Location of extraction run artifacts.
+        Location of extraction snapshot artifacts.
-        :return: Path to the extraction runs directory.
+        :return: Path to the extraction snapshots directory.
         :rtype: Path
         """
-        return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
+        return self.snapshots_dir / EXTRACTION_SNAPSHOTS_DIR_NAME
     @property
     def analysis_runs_dir(self) -> Path:
         """
-        Location of analysis run artifacts.
+        Location of analysis snapshot artifacts.
-        :return: Path to the analysis runs directory.
+        :return: Path to the analysis snapshots directory.
         :rtype: Path
         """
-        return self.runs_dir / ANALYSIS_RUNS_DIR_NAME
+        return self.snapshots_dir / ANALYSIS_RUNS_DIR_NAME
-    def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
+    def extraction_snapshot_dir(self, *, extractor_id: str, snapshot_id: str) -> Path:
         """
-        Resolve an extraction run directory.
+        Resolve an extraction snapshot directory.
         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param run_id: Extraction run identifier.
-        :type run_id: str
-        :return: Extraction run directory.
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
+        :return: Extraction snapshot directory.
         :rtype: Path
         """
-        return self.extraction_runs_dir / extractor_id / run_id
+        return self.extraction_snapshots_dir / extractor_id / snapshot_id
-    def analysis_run_dir(self, *, analysis_id: str, run_id: str) -> Path:
+    def analysis_run_dir(self, *, analysis_id: str, snapshot_id: str) -> Path:
         """
-        Resolve an analysis run directory.
+        Resolve an analysis snapshot directory.
         :param analysis_id: Analysis backend identifier.
         :type analysis_id: str
-        :param run_id: Analysis run identifier.
-        :type run_id: str
-        :return: Analysis run directory.
+        :param snapshot_id: Analysis snapshot identifier.
+        :type snapshot_id: str
+        :return: Analysis snapshot directory.
         :rtype: Path
         """
-        return self.analysis_runs_dir / analysis_id / run_id
+        return self.analysis_runs_dir / analysis_id / snapshot_id
-    def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
+    def read_extracted_text(
+        self, *, extractor_id: str, snapshot_id: str, item_id: str
+    ) -> Optional[str]:
         """
-        Read extracted text for an item from an extraction run, when present.
+        Read extracted text for an item from an extraction snapshot, when present.
         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param run_id: Extraction run identifier.
-        :type run_id: str
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
         :param item_id: Item identifier.
         :type item_id: str
         :return: Extracted text or None if the artifact does not exist.
@@ -591,7 +674,7 @@ class Corpus:
         :raises OSError: If the file exists but cannot be read.
         """
         path = (
-            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
+            self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
             / "text"
             / f"{item_id}.txt"
         )
@@ -599,72 +682,73 @@ class Corpus:
             return None
         return path.read_text(encoding="utf-8")
-    def load_extraction_run_manifest(self, *, extractor_id: str, run_id: str):
+    def load_extraction_snapshot_manifest(self, *, extractor_id: str, snapshot_id: str):
         """
-        Load an extraction run manifest from the corpus.
+        Load an extraction snapshot manifest from the corpus.
         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param run_id: Extraction run identifier.
-        :type run_id: str
-        :return: Parsed extraction run manifest.
-        :rtype: biblicus.extraction.ExtractionRunManifest
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
+        :return: Parsed extraction snapshot manifest.
+        :rtype: biblicus.extraction.ExtractionSnapshotManifest
         :raises FileNotFoundError: If the manifest file does not exist.
         :raises ValueError: If the manifest data is invalid.
         """
-        from .extraction import ExtractionRunManifest
+        from .extraction import ExtractionSnapshotManifest
         manifest_path = (
-            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id) / "manifest.json"
+            self.extraction_snapshot_dir(extractor_id=extractor_id, snapshot_id=snapshot_id)
+            / "manifest.json"
         )
         if not manifest_path.is_file():
-            raise FileNotFoundError(f"Missing extraction run manifest: {manifest_path}")
+            raise FileNotFoundError(f"Missing extraction snapshot manifest: {manifest_path}")
         data = json.loads(manifest_path.read_text(encoding="utf-8"))
-        return ExtractionRunManifest.model_validate(data)
+        return ExtractionSnapshotManifest.model_validate(data)
-    def list_extraction_runs(
+    def list_extraction_snapshots(
         self, *, extractor_id: Optional[str] = None
-    ) -> List[ExtractionRunListEntry]:
+    ) -> List[ExtractionSnapshotListEntry]:
         """
-        List extraction runs stored under the corpus.
+        List extraction snapshots stored under the corpus.
         :param extractor_id: Optional extractor identifier filter.
         :type extractor_id: str or None
-        :return: Summary list entries for each run.
-        :rtype: list[biblicus.models.ExtractionRunListEntry]
+        :return: Summary list entries for each snapshot.
+        :rtype: list[biblicus.models.ExtractionSnapshotListEntry]
         """
-        runs_root = self.extraction_runs_dir
-        if not runs_root.is_dir():
+        snapshots_root = self.extraction_snapshots_dir
+        if not snapshots_root.is_dir():
             return []
         extractor_dirs: List[Path]
         if extractor_id is None:
-            extractor_dirs = [path for path in sorted(runs_root.iterdir()) if path.is_dir()]
+            extractor_dirs = [path for path in sorted(snapshots_root.iterdir()) if path.is_dir()]
         else:
-            extractor_path = runs_root / extractor_id
+            extractor_path = snapshots_root / extractor_id
             extractor_dirs = [extractor_path] if extractor_path.is_dir() else []
-        entries: List[ExtractionRunListEntry] = []
+        entries: List[ExtractionSnapshotListEntry] = []
         for extractor_dir in extractor_dirs:
-            for run_dir in sorted(extractor_dir.iterdir()):
-                if not run_dir.is_dir():
+            for snapshot_dir in sorted(extractor_dir.iterdir()):
+                if not snapshot_dir.is_dir():
                     continue
-                manifest_path = run_dir / "manifest.json"
+                manifest_path = snapshot_dir / "manifest.json"
                 if not manifest_path.is_file():
                     continue
                 try:
-                    manifest = self.load_extraction_run_manifest(
+                    manifest = self.load_extraction_snapshot_manifest(
                         extractor_id=extractor_dir.name,
-                        run_id=run_dir.name,
+                        snapshot_id=snapshot_dir.name,
                     )
                 except (FileNotFoundError, ValueError):
                     continue
                 entries.append(
-                    ExtractionRunListEntry(
+                    ExtractionSnapshotListEntry(
                         extractor_id=extractor_dir.name,
-                        run_id=run_dir.name,
-                        recipe_id=manifest.recipe.recipe_id,
-                        recipe_name=manifest.recipe.name,
+                        snapshot_id=snapshot_dir.name,
+                        configuration_id=manifest.configuration.configuration_id,
+                        configuration_name=manifest.configuration.name,
                         catalog_generated_at=manifest.catalog_generated_at,
                         created_at=manifest.created_at,
                         stats=dict(manifest.stats),
@@ -672,95 +756,100 @@ class Corpus:
                 )
         entries.sort(
-            key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True
+            key=lambda entry: (entry.created_at, entry.extractor_id, entry.snapshot_id),
+            reverse=True,
         )
         return entries
-    def latest_extraction_run_reference(
+    def latest_extraction_snapshot_reference(
         self, *, extractor_id: Optional[str] = None
-    ) -> Optional[ExtractionRunReference]:
+    ) -> Optional[ExtractionSnapshotReference]:
         """
-        Return the most recent extraction run reference.
+        Return the most recent extraction snapshot reference.
         :param extractor_id: Optional extractor identifier filter.
         :type extractor_id: str or None
-        :return: Latest extraction run reference or None when no runs exist.
-        :rtype: biblicus.models.ExtractionRunReference or None
+        :return: Latest extraction snapshot reference or None when no snapshots exist.
+        :rtype: biblicus.models.ExtractionSnapshotReference or None
         """
-        entries = self.list_extraction_runs(extractor_id=extractor_id)
+        entries = self.list_extraction_snapshots(extractor_id=extractor_id)
         if not entries:
             return None
         latest = entries[0]
-        return ExtractionRunReference(extractor_id=latest.extractor_id, run_id=latest.run_id)
+        return ExtractionSnapshotReference(
+            extractor_id=latest.extractor_id, snapshot_id=latest.snapshot_id
+        )
-    def delete_extraction_run(self, *, extractor_id: str, run_id: str) -> None:
+    def delete_extraction_snapshot(self, *, extractor_id: str, snapshot_id: str) -> None:
         """
-        Delete an extraction run directory and its derived artifacts.
+        Delete an extraction snapshot directory and its derived artifacts.
         :param extractor_id: Extractor plugin identifier.
         :type extractor_id: str
-        :param run_id: Extraction run identifier.
-        :type run_id: str
+        :param snapshot_id: Extraction snapshot identifier.
+        :type snapshot_id: str
         :return: None.
         :rtype: None
-        :raises FileNotFoundError: If the extraction run directory does not exist.
+        :raises FileNotFoundError: If the extraction snapshot directory does not exist.
         """
-        run_dir = self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
-        if not run_dir.is_dir():
-            raise FileNotFoundError(f"Missing extraction run directory: {run_dir}")
-        shutil.rmtree(run_dir)
+        snapshot_dir = self.extraction_snapshot_dir(
+            extractor_id=extractor_id, snapshot_id=snapshot_id
+        )
+        if not snapshot_dir.is_dir():
+            raise FileNotFoundError(f"Missing extraction snapshot directory: {snapshot_dir}")
+        shutil.rmtree(snapshot_dir)
-    def _ensure_runs_dir(self) -> None:
+    def _ensure_snapshots_dir(self) -> None:
         """
-        Ensure the retrieval runs directory exists.
+        Ensure the retrieval snapshots directory exists.
         :return: None.
         :rtype: None
         """
-        self.runs_dir.mkdir(parents=True, exist_ok=True)
+        self.snapshots_dir.mkdir(parents=True, exist_ok=True)
-    def write_run(self, run: RetrievalRun) -> None:
+    def write_snapshot(self, snapshot: RetrievalSnapshot) -> None:
         """
-        Persist a retrieval run manifest and update the catalog pointer.
+        Persist a retrieval snapshot manifest and update the catalog pointer.
-        :param run: Run manifest to persist.
-        :type run: RetrievalRun
+        :param snapshot: Snapshot manifest to persist.
+        :type snapshot: RetrievalSnapshot
         :return: None.
         :rtype: None
         """
-        self._ensure_runs_dir()
-        path = self.runs_dir / f"{run.run_id}.json"
-        path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
+        self._ensure_snapshots_dir()
+        path = self.snapshots_dir / f"{snapshot.snapshot_id}.json"
+        path.write_text(snapshot.model_dump_json(indent=2) + "\n", encoding="utf-8")
         catalog = self._load_catalog()
-        catalog.latest_run_id = run.run_id
+        catalog.latest_snapshot_id = snapshot.snapshot_id
         catalog.generated_at = utc_now_iso()
         self._write_catalog(catalog)
-    def load_run(self, run_id: str) -> RetrievalRun:
+    def load_snapshot(self, snapshot_id: str) -> RetrievalSnapshot:
         """
-        Load a retrieval run manifest by identifier.
+        Load a retrieval snapshot manifest by identifier.
-        :param run_id: Run identifier.
-        :type run_id: str
-        :return: Parsed run manifest.
-        :rtype: RetrievalRun
-        :raises FileNotFoundError: If the run manifest does not exist.
+        :param snapshot_id: Snapshot identifier.
+        :type snapshot_id: str
+        :return: Parsed snapshot manifest.
+        :rtype: RetrievalSnapshot
+        :raises FileNotFoundError: If the snapshot manifest does not exist.
         """
-        path = self.runs_dir / f"{run_id}.json"
+        path = self.snapshots_dir / f"{snapshot_id}.json"
         if not path.is_file():
-            raise FileNotFoundError(f"Missing run manifest: {path}")
+            raise FileNotFoundError(f"Missing snapshot manifest: {path}")
         data = json.loads(path.read_text(encoding="utf-8"))
-        return RetrievalRun.model_validate(data)
+        return RetrievalSnapshot.model_validate(data)
     @property
-    def latest_run_id(self) -> Optional[str]:
+    def latest_snapshot_id(self) -> Optional[str]:
         """
-        Latest retrieval run identifier recorded in the catalog.
+        Latest retrieval snapshot identifier recorded in the catalog.
-        :return: Latest run identifier or None.
+        :return: Latest snapshot identifier or None.
         :rtype: str or None
         """
-        return self._load_catalog().latest_run_id
+        return self._load_catalog().latest_snapshot_id
     def _upsert_catalog_item(self, item: CatalogItem) -> None:
         """
@@ -779,7 +868,7 @@ class Corpus:
         ordered_ids.insert(0, item.id)
         catalog.order = ordered_ids
         catalog.generated_at = utc_now_iso()
-        catalog.latest_run_id = None
+        catalog.latest_snapshot_id = None
         self._write_catalog(catalog)
@@ -817,18 +906,26 @@ class Corpus:
         :return: Ingestion result summary.
         :rtype: IngestResult
         :raises ValueError: If markdown is not Unicode Transformation Format 8.
+        :raises IngestCollisionError: If a source uniform resource identifier is already ingested.
         """
-        item_id = str(uuid.uuid4())
-        safe_filename = _sanitize_filename(filename) if filename else ""
+        existing_item = self._find_item_by_source_uri(source_uri)
+        if existing_item is not None:
+            raise IngestCollisionError(
+                source_uri=source_uri,
+                existing_item_id=existing_item.id,
+                existing_relpath=existing_item.relpath,
+            )
-        if safe_filename:
-            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+        item_id = str(uuid.uuid4())
+        storage_filename = _storage_filename_for_ingest(
+            filename=filename, media_type=media_type, source_uri=source_uri
+        )
         if media_type == "text/markdown":
-            output_name = f"{item_id}--{safe_filename}" if safe_filename else f"{item_id}.md"
+            output_name = f"{item_id}--{storage_filename}" if storage_filename else f"{item_id}.md"
         else:
-            if safe_filename:
-                output_name = f"{item_id}--{safe_filename}"
+            if storage_filename:
+                output_name = f"{item_id}--{storage_filename}"
             else:
                 extension = _preferred_extension_for_media_type(media_type) or ""
                 output_name = f"{item_id}{extension}" if extension else f"{item_id}"
@@ -991,13 +1088,21 @@ class Corpus:
         if media_type == "text/markdown":
             raise ValueError("Stream ingestion is not supported for Markdown")
+        existing_item = self._find_item_by_source_uri(source_uri)
+        if existing_item is not None:
+            raise IngestCollisionError(
+                source_uri=source_uri,
+                existing_item_id=existing_item.id,
+                existing_relpath=existing_item.relpath,
+            )
         item_id = str(uuid.uuid4())
-        safe_filename = _sanitize_filename(filename) if filename else ""
-        if safe_filename:
-            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+        storage_filename = _storage_filename_for_ingest(
+            filename=filename, media_type=media_type, source_uri=source_uri
+        )
-        if safe_filename:
-            output_name = f"{item_id}--{safe_filename}"
+        if storage_filename:
+            output_name = f"{item_id}--{storage_filename}"
         else:
             extension = _preferred_extension_for_media_type(media_type) or ""
             output_name = f"{item_id}{extension}" if extension else f"{item_id}"
@@ -1085,7 +1190,7 @@ class Corpus:
         *,
         title: Optional[str] = None,
         tags: Sequence[str] = (),
-        source_uri: str = "text",
+        source_uri: Optional[str] = None,
     ) -> IngestResult:
         """
         Ingest a text note as Markdown.
@@ -1096,11 +1201,15 @@ class Corpus:
         :type title: str or None
         :param tags: Tags to associate with the note.
         :type tags: Sequence[str]
-        :param source_uri: Source uniform resource identifier for provenance.
-        :type source_uri: str
+        :param source_uri: Optional source uniform resource identifier for provenance.
+        :type source_uri: str or None
         :return: Ingestion result summary.
         :rtype: IngestResult
         """
+        if source_uri is None:
+            digest_source = (title or "") + "\n" + text
+            digest = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()
+            source_uri = f"text:{digest}"
         data = text.encode("utf-8")
         return self.ingest_item(
             data,
@@ -1520,7 +1629,7 @@ class Corpus:
             generated_at=utc_now_iso(),
             corpus_uri=normalize_corpus_uri(self.root),
             raw_dir=DEFAULT_RAW_DIR,
-            latest_run_id=None,
+            latest_snapshot_id=None,
             items=new_items,
             order=order,
         )
@@ -1572,7 +1681,7 @@ class Corpus:
                 generated_at=utc_now_iso(),
                 corpus_uri=normalize_corpus_uri(self.root),
                 raw_dir=DEFAULT_RAW_DIR,
-                latest_run_id=None,
+                latest_snapshot_id=None,
                 items={},
                 order=[],
             )

biblicus/errors.py CHANGED Viewed

@@ -5,11 +5,35 @@ Error types for Biblicus.
 from __future__ import annotations
-class ExtractionRunFatalError(RuntimeError):
+class ExtractionSnapshotFatalError(RuntimeError):
     """
-    Fatal extraction run error that should abort the entire run.
+    Fatal extraction snapshot error that should abort the entire snapshot.
     This exception is used for conditions that indicate a configuration or environment problem
     rather than a per-item extraction failure. For example, a selection extractor that depends
-    on referenced extraction run manifests treats missing manifests as fatal.
+    on referenced extraction snapshot manifests treats missing manifests as fatal.
     """
+class IngestCollisionError(RuntimeError):
+    """
+    Ingest collision for an already ingested source.
+    :param source_uri: Source uniform resource identifier that caused the collision.
+    :type source_uri: str
+    :param existing_item_id: Identifier of the existing catalog item.
+    :type existing_item_id: str
+    :param existing_relpath: Raw storage relpath of the existing item.
+    :type existing_relpath: str
+    """
+    def __init__(self, *, source_uri: str, existing_item_id: str, existing_relpath: str) -> None:
+        self.source_uri = source_uri
+        self.existing_item_id = existing_item_id
+        self.existing_relpath = existing_relpath
+        message = (
+            "Source already ingested"
+            f": source_uri={source_uri} existing_item_id={existing_item_id}"
+            f" existing_relpath={existing_relpath}"
+        )
+        super().__init__(message)

biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl