biblicus 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +96 -13
- biblicus/backends/sqlite_full_text_search.py +74 -14
- biblicus/cli.py +126 -19
- biblicus/constants.py +2 -0
- biblicus/corpus.py +455 -45
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +529 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/models.py +107 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +85 -5
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- biblicus-0.3.0.dist-info/METADATA +336 -0
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus-0.1.1.dist-info/METADATA +0 -174
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
@@ -13,13 +13,24 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence

 import yaml
-
-
+from pydantic import ValidationError
+
+from .constants import (
+    CORPUS_DIR_NAME,
+    DEFAULT_RAW_DIR,
+    EXTRACTION_RUNS_DIR_NAME,
+    RUNS_DIR_NAME,
+    SCHEMA_VERSION,
+    SIDECAR_SUFFIX,
+)
 from .frontmatter import parse_front_matter, render_front_matter
+from .hook_manager import HookManager
+from .hooks import HookPoint
+from .ignore import load_corpus_ignore_spec
 from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
 from .sources import load_source
 from .time import utc_now_iso
-from .uris import
+from .uris import corpus_ref_to_path, normalize_corpus_uri


 def _sha256_bytes(data: bytes) -> str:
@@ -31,10 +42,38 @@ def _sha256_bytes(data: bytes) -> str:
     :return: Secure Hash Algorithm 256 hex digest.
     :rtype: str
     """
-
     return hashlib.sha256(data).hexdigest()


+def _write_stream_and_hash(
+    stream, destination_path: Path, *, chunk_size: int = 1024 * 1024
+) -> Dict[str, object]:
+    """
+    Write a binary stream to disk while computing a digest.
+
+    :param stream: Binary stream to read from.
+    :type stream: object
+    :param destination_path: Destination path to write to.
+    :type destination_path: Path
+    :param chunk_size: Chunk size for reads.
+    :type chunk_size: int
+    :return: Mapping containing sha256 and bytes_written.
+    :rtype: dict[str, object]
+    :raises OSError: If the destination cannot be written.
+    """
+    hasher = hashlib.sha256()
+    bytes_written = 0
+    with destination_path.open("wb") as destination_handle:
+        while True:
+            chunk = stream.read(chunk_size)
+            if not chunk:
+                break
+            hasher.update(chunk)
+            destination_handle.write(chunk)
+            bytes_written += len(chunk)
+    return {"sha256": hasher.hexdigest(), "bytes_written": bytes_written}
+
+
 def _sanitize_filename(name: str) -> str:
     """
     Sanitize a filename into a portable, filesystem-friendly form.
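
A quick sanity sketch of the new `_write_stream_and_hash` helper (the payload and destination path are hypothetical; the function is a private helper in `biblicus.corpus`): the digest computed in chunks while writing must match hashing the whole payload in memory, which is what keeps streamed ingestion compatible with `_sha256_bytes`.

    import hashlib
    import io
    from pathlib import Path

    from biblicus.corpus import _write_stream_and_hash

    payload = b"example payload" * 100_000
    destination = Path("/tmp/biblicus-stream-demo.bin")

    # Write in 1 MiB chunks, hashing as we go.
    result = _write_stream_and_hash(io.BytesIO(payload), destination)

    # The chunked digest equals the digest of the whole payload at once.
    assert result["sha256"] == hashlib.sha256(payload).hexdigest()
    assert result["bytes_written"] == len(payload)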
@@ -44,7 +83,6 @@ def _sanitize_filename(name: str) -> str:
     :return: Sanitized filename.
     :rtype: str
     """
-
     allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
     sanitized_name = "".join(
         (character if character in allowed_characters else "_") for character in name
@@ -61,9 +99,9 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
     :return: Preferred extension or None.
     :rtype: str or None
     """
-
     media_type_overrides = {
         "image/jpeg": ".jpg",
+        "audio/ogg": ".ogg",
     }
     if media_type in media_type_overrides:
         return media_type_overrides[media_type]
@@ -81,7 +119,6 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
     :return: Filename with a compatible extension.
     :rtype: str
     """
-
     raw_name = filename.strip()

     if media_type == "text/markdown":
@@ -89,11 +126,12 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
             return raw_name
         return raw_name + ".md"

+    if Path(raw_name).suffix:
+        return raw_name
+
     ext = _preferred_extension_for_media_type(media_type)
     if not ext:
         return raw_name
-    if raw_name.lower().endswith(ext.lower()):
-        return raw_name
     return raw_name + ext


@@ -108,7 +146,6 @@ def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
     :return: Deduplicated tag list preserving order.
     :rtype: list[str]
     """
-
     merged_tags: List[str] = []

     for explicit_tag in explicit:
@@ -141,7 +178,6 @@ def _sidecar_path_for(content_path: Path) -> Path:
     :return: Sidecar path.
     :rtype: Path
     """
-
     return content_path.with_name(content_path.name + SIDECAR_SUFFIX)


@@ -155,7 +191,6 @@ def _load_sidecar(content_path: Path) -> Dict[str, Any]:
     :rtype: dict[str, Any]
     :raises ValueError: If the sidecar content is not a mapping.
     """
-
     path = _sidecar_path_for(content_path)
     if not path.is_file():
         return {}
@@ -186,7 +221,9 @@ def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
     path.write_text(text + "\n", encoding="utf-8")


-def _ensure_biblicus_block(
+def _ensure_biblicus_block(
+    metadata: Dict[str, Any], *, item_id: str, source_uri: str
+) -> Dict[str, Any]:
     """
     Ensure the biblicus metadata block exists and is populated.

@@ -284,11 +321,11 @@ class Corpus:
         :param root: Corpus root directory.
         :type root: Path
         """
-
         self.root = root
         self.meta_dir = self.root / CORPUS_DIR_NAME
         self.raw_dir = self.root / DEFAULT_RAW_DIR
         self.config = self._load_config()
+        self._hooks = self._load_hooks()

     @property
     def uri(self) -> str:
@@ -298,7 +335,6 @@ class Corpus:
         :return: Corpus uniform resource identifier.
         :rtype: str
         """
-
         return self.root.as_uri()

     def _load_config(self) -> Optional[CorpusConfig]:
@@ -309,12 +345,38 @@ class Corpus:
         :rtype: CorpusConfig or None
         :raises ValueError: If the config schema is invalid.
         """
-
         path = self.meta_dir / "config.json"
         if not path.is_file():
             return None
         data = json.loads(path.read_text(encoding="utf-8"))
-
+        try:
+            return CorpusConfig.model_validate(data)
+        except ValidationError as exc:
+            has_hook_error = any(
+                isinstance(error.get("loc"), tuple)
+                and error.get("loc")
+                and error.get("loc")[0] == "hooks"
+                for error in exc.errors()
+            )
+            if has_hook_error:
+                raise ValueError(f"Invalid hook specification: {exc}") from exc
+            raise ValueError(f"Invalid corpus config: {exc}") from exc
+
+    def _load_hooks(self) -> Optional[HookManager]:
+        """
+        Load the hook manager from config if hooks are configured.
+
+        :return: Hook manager or None.
+        :rtype: HookManager or None
+        :raises ValueError: If hook specifications are invalid.
+        """
+        if self.config is None or not self.config.hooks:
+            return None
+        return HookManager.from_config(
+            corpus_root=self.root,
+            corpus_uri=self.uri,
+            hook_specs=self.config.hooks,
+        )

     @classmethod
     def find(cls, start: Path) -> "Corpus":
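
The `hooks`-specific branch in `_load_config` leans on how pydantic v2 reports failures: `ValidationError.errors()` yields one dict per problem, and the first element of each `loc` tuple names the top-level field that failed. A minimal sketch of that behavior (the model is a hypothetical stand-in for `CorpusConfig`):

    from pydantic import BaseModel, ValidationError

    class ExampleConfig(BaseModel):
        name: str
        hooks: list = []

    try:
        ExampleConfig.model_validate({"name": "demo", "hooks": "not-a-list"})
    except ValidationError as exc:
        for error in exc.errors():
            print(error["loc"])  # ("hooks",) -- the first element names the failing field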
@@ -327,7 +389,6 @@ class Corpus:
         :rtype: Corpus
         :raises FileNotFoundError: If no corpus config is found.
         """
-
         start = start.resolve()
         for candidate in [start, *start.parents]:
             if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
@@ -346,7 +407,6 @@ class Corpus:
         :return: Opened corpus instance.
         :rtype: Corpus
         """
-
         return cls.find(corpus_ref_to_path(ref))

     @classmethod
@@ -362,7 +422,6 @@ class Corpus:
         :rtype: Corpus
         :raises FileExistsError: If the corpus already exists and force is False.
         """
-
         root = root.resolve()
         corpus = cls(root)

@@ -392,7 +451,6 @@ class Corpus:
         :return: Catalog file path.
         :rtype: Path
         """
-
         return self.meta_dir / "catalog.json"

     def _init_catalog(self) -> None:
@@ -402,7 +460,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         if self.catalog_path.exists():
             return
         catalog = CorpusCatalog(
@@ -425,7 +482,6 @@ class Corpus:
         :raises FileNotFoundError: If the catalog file does not exist.
         :raises ValueError: If the catalog schema is invalid.
         """
-
         if not self.catalog_path.is_file():
             raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
         catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
@@ -440,7 +496,6 @@ class Corpus:
         :raises FileNotFoundError: If the catalog file does not exist.
         :raises ValueError: If the catalog schema is invalid.
         """
-
         return self._load_catalog()

     def _write_catalog(self, catalog: CorpusCatalog) -> None:
@@ -452,7 +507,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         temp_path = self.catalog_path.with_suffix(".json.tmp")
         temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
         temp_path.replace(self.catalog_path)
@@ -465,9 +519,54 @@ class Corpus:
         :return: Path to the runs directory.
         :rtype: Path
         """
-
         return self.meta_dir / RUNS_DIR_NAME

+    @property
+    def extraction_runs_dir(self) -> Path:
+        """
+        Location of extraction run artifacts.
+
+        :return: Path to the extraction runs directory.
+        :rtype: Path
+        """
+        return self.runs_dir / EXTRACTION_RUNS_DIR_NAME
+
+    def extraction_run_dir(self, *, extractor_id: str, run_id: str) -> Path:
+        """
+        Resolve an extraction run directory.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :return: Extraction run directory.
+        :rtype: Path
+        """
+        return self.extraction_runs_dir / extractor_id / run_id
+
+    def read_extracted_text(self, *, extractor_id: str, run_id: str, item_id: str) -> Optional[str]:
+        """
+        Read extracted text for an item from an extraction run, when present.
+
+        :param extractor_id: Extractor plugin identifier.
+        :type extractor_id: str
+        :param run_id: Extraction run identifier.
+        :type run_id: str
+        :param item_id: Item identifier.
+        :type item_id: str
+        :return: Extracted text or None if the artifact does not exist.
+        :rtype: str or None
+        :raises OSError: If the file exists but cannot be read.
+        """
+        path = (
+            self.extraction_run_dir(extractor_id=extractor_id, run_id=run_id)
+            / "text"
+            / f"{item_id}.txt"
+        )
+        if not path.is_file():
+            return None
+        return path.read_text(encoding="utf-8")
+
     def _ensure_runs_dir(self) -> None:
         """
         Ensure the retrieval runs directory exists.
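
Read together, these accessors pin down the artifact layout: extracted text for an item lives at `<runs_dir>/<EXTRACTION_RUNS_DIR_NAME>/<extractor_id>/<run_id>/text/<item_id>.txt`. A hypothetical lookup (the extractor and run identifiers are illustrative):

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    for item in corpus.list_items(limit=1):
        text = corpus.read_extracted_text(
            extractor_id="pdf-text",  # illustrative extractor id
            run_id="run-0001",        # illustrative run id
            item_id=item.id,
        )
        print(text if text is not None else "no text artifact for this item")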
@@ -475,7 +574,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         self.runs_dir.mkdir(parents=True, exist_ok=True)

     def write_run(self, run: RetrievalRun) -> None:
@@ -487,7 +585,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         self._ensure_runs_dir()
         path = self.runs_dir / f"{run.run_id}.json"
         path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
@@ -506,7 +603,6 @@ class Corpus:
         :rtype: RetrievalRun
         :raises FileNotFoundError: If the run manifest does not exist.
         """
-
         path = self.runs_dir / f"{run_id}.json"
         if not path.is_file():
             raise FileNotFoundError(f"Missing run manifest: {path}")
@@ -521,7 +617,6 @@ class Corpus:
         :return: Latest run identifier or None.
         :rtype: str or None
         """
-
         return self._load_catalog().latest_run_id

     def _upsert_catalog_item(self, item: CatalogItem) -> None:
@@ -533,7 +628,6 @@ class Corpus:
         :return: None.
         :rtype: None
         """
-
         self._init_catalog()
         catalog = self._load_catalog()
         catalog.items[item.id] = item
@@ -581,7 +675,6 @@ class Corpus:
         :rtype: IngestResult
         :raises ValueError: If markdown is not Unicode Transformation Format 8.
         """
-
         item_id = str(uuid.uuid4())
         safe_filename = _sanitize_filename(filename) if filename else ""

@@ -608,13 +701,30 @@ class Corpus:
         if resolved_tags and "tags" not in metadata_input:
             metadata_input["tags"] = list(resolved_tags)

+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
         frontmatter: Dict[str, Any] = {}

         if media_type == "text/markdown":
             try:
                 markdown_text = data.decode("utf-8")
             except UnicodeDecodeError as decode_error:
-                raise ValueError(
+                raise ValueError(
+                    "Markdown must be Unicode Transformation Format 8"
+                ) from decode_error

             parsed_document = parse_front_matter(markdown_text)
             frontmatter = dict(parsed_document.metadata)
@@ -633,7 +743,9 @@ class Corpus:
             if isinstance(title_value, str) and title_value.strip():
                 resolved_title = title_value.strip()

-            frontmatter = _ensure_biblicus_block(
+            frontmatter = _ensure_biblicus_block(
+                frontmatter, item_id=item_id, source_uri=source_uri
+            )
             rendered_document = render_front_matter(frontmatter, parsed_document.body)
             data_to_write = rendered_document.encode("utf-8")
         else:
@@ -656,6 +768,34 @@ class Corpus:
             _write_sidecar(output_path, sidecar)
             frontmatter = sidecar

+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=resolved_title,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar_metadata = _load_sidecar(output_path)
+                sidecar_metadata["tags"] = resolved_tags
+                if media_type != "text/markdown":
+                    sidecar_metadata["media_type"] = media_type
+                sidecar_metadata["biblicus"] = {"id": item_id, "source": source_uri}
+                _write_sidecar(output_path, sidecar_metadata)
+                frontmatter = _merge_metadata(
+                    frontmatter if isinstance(frontmatter, dict) else {}, sidecar_metadata
+                )
+
         created_at = utc_now_iso()
         item_record = CatalogItem(
             id=item_id,
@@ -673,6 +813,129 @@ class Corpus:

         return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)

+    def ingest_item_stream(
+        self,
+        stream,
+        *,
+        filename: Optional[str] = None,
+        media_type: str = "application/octet-stream",
+        tags: Sequence[str] = (),
+        metadata: Optional[Dict[str, Any]] = None,
+        source_uri: str = "unknown",
+    ) -> IngestResult:
+        """
+        Ingest a binary item from a readable stream.
+
+        This method is intended for large non-markdown items. It writes bytes to disk incrementally
+        while computing a checksum.
+
+        :param stream: Readable binary stream.
+        :type stream: object
+        :param filename: Optional filename for the stored item.
+        :type filename: str or None
+        :param media_type: Internet Assigned Numbers Authority media type for the item.
+        :type media_type: str
+        :param tags: Tags to associate with the item.
+        :type tags: Sequence[str]
+        :param metadata: Optional metadata mapping.
+        :type metadata: dict[str, Any] or None
+        :param source_uri: Source uniform resource identifier for provenance.
+        :type source_uri: str
+        :return: Ingestion result summary.
+        :rtype: IngestResult
+        :raises ValueError: If the media_type is text/markdown.
+        """
+        if media_type == "text/markdown":
+            raise ValueError("Stream ingestion is not supported for Markdown")
+
+        item_id = str(uuid.uuid4())
+        safe_filename = _sanitize_filename(filename) if filename else ""
+        if safe_filename:
+            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+
+        if safe_filename:
+            output_name = f"{item_id}--{safe_filename}"
+        else:
+            extension = _preferred_extension_for_media_type(media_type) or ""
+            output_name = f"{item_id}{extension}" if extension else f"{item_id}"
+
+        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
+        output_path = self.root / relpath
+
+        resolved_tags = list(tags)
+        metadata_input: Dict[str, Any] = dict(metadata or {})
+        if resolved_tags and "tags" not in metadata_input:
+            metadata_input["tags"] = list(resolved_tags)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.before_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+            )
+            if mutation.add_tags:
+                for tag in mutation.add_tags:
+                    if tag not in resolved_tags:
+                        resolved_tags.append(tag)
+
+        write_result = _write_stream_and_hash(stream, output_path)
+        sha256_digest = str(write_result["sha256"])
+        bytes_written = int(write_result["bytes_written"])
+
+        sidecar: Dict[str, Any] = {}
+        sidecar["media_type"] = media_type
+        if resolved_tags:
+            sidecar["tags"] = resolved_tags
+        if metadata_input:
+            for metadata_key, metadata_value in metadata_input.items():
+                if metadata_key in {"tags", "biblicus"}:
+                    continue
+                sidecar[metadata_key] = metadata_value
+        sidecar["biblicus"] = {"id": item_id, "source": source_uri}
+        _write_sidecar(output_path, sidecar)
+
+        if self._hooks is not None:
+            mutation = self._hooks.run_ingest_hooks(
+                hook_point=HookPoint.after_ingest,
+                filename=filename,
+                media_type=media_type,
+                title=None,
+                tags=list(resolved_tags),
+                metadata=dict(metadata_input),
+                source_uri=source_uri,
+                item_id=item_id,
+                relpath=relpath,
+            )
+            if mutation.add_tags:
+                updated_tags = list(resolved_tags)
+                for tag in mutation.add_tags:
+                    if tag not in updated_tags:
+                        updated_tags.append(tag)
+                resolved_tags = updated_tags
+                sidecar["tags"] = resolved_tags
+                _write_sidecar(output_path, sidecar)
+
+        created_at = utc_now_iso()
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=relpath,
+            sha256=sha256_digest,
+            bytes=bytes_written,
+            media_type=media_type,
+            title=None,
+            tags=list(resolved_tags),
+            metadata=dict(sidecar or {}),
+            created_at=created_at,
+            source_uri=source_uri,
+        )
+        self._upsert_catalog_item(item_record)
+
+        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)
+
     def ingest_note(
         self,
         text: str,
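
A hypothetical use of the new streaming path (the file name and tags are illustrative): a large binary is written to the corpus in chunks rather than read into memory, and the returned `IngestResult` carries the digest computed during the write.

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    source = Path("recordings/interview.ogg")
    with source.open("rb") as handle:
        result = corpus.ingest_item_stream(
            handle,
            filename=source.name,
            media_type="audio/ogg",
            tags=["interview"],
            source_uri=source.as_uri(),
        )
    print(result.item_id, result.sha256)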
@@ -695,7 +958,6 @@ class Corpus:
         :return: Ingestion result summary.
         :rtype: IngestResult
         """
-
         data = text.encode("utf-8")
         return self.ingest_item(
             data,
@@ -726,6 +988,35 @@ class Corpus:
         :return: Ingestion result summary.
         :rtype: IngestResult
         """
+        candidate_path = Path(source) if isinstance(source, str) and "://" not in source else None
+        if isinstance(source, Path) or (candidate_path is not None and candidate_path.exists()):
+            path = source if isinstance(source, Path) else candidate_path
+            assert isinstance(path, Path)
+            path = path.resolve()
+            filename = path.name
+            media_type, _ = mimetypes.guess_type(filename)
+            media_type = media_type or "application/octet-stream"
+            if path.suffix.lower() in {".md", ".markdown"}:
+                media_type = "text/markdown"
+            if media_type == "text/markdown":
+                return self.ingest_item(
+                    path.read_bytes(),
+                    filename=filename,
+                    media_type=media_type,
+                    title=None,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )
+            with path.open("rb") as handle:
+                return self.ingest_item_stream(
+                    handle,
+                    filename=filename,
+                    media_type=media_type,
+                    tags=tags,
+                    metadata=None,
+                    source_uri=source_uri or path.as_uri(),
+                )

         payload = load_source(source, source_uri=source_uri)
         return self.ingest_item(
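
In short, `ingest_source` now front-runs `load_source` for anything that looks like a local file: strings without a scheme are probed as paths, markdown stays on the in-memory `ingest_item` path so front matter can be parsed, and other local files are streamed. A hypothetical dispatch summary (paths are illustrative):

    corpus.ingest_source("notes/today.md")            # existing local markdown -> ingest_item
    corpus.ingest_source(Path("scans/page-1.png"))    # existing local binary -> ingest_item_stream
    corpus.ingest_source("https://example.com/a.md")  # contains "://" -> load_source fallback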
@@ -738,6 +1029,128 @@ class Corpus:
             source_uri=payload.source_uri,
         )

+    def import_tree(self, source_root: Path, *, tags: Sequence[str] = ()) -> Dict[str, int]:
+        """
+        Import a folder tree into the corpus, preserving relative paths and provenance.
+
+        Imported content is stored under the raw directory in a dedicated import namespace so that
+        operators can inspect and back up imported content as a structured tree.
+
+        :param source_root: Root directory of the folder tree to import.
+        :type source_root: Path
+        :param tags: Tags to associate with imported items.
+        :type tags: Sequence[str]
+        :return: Import statistics.
+        :rtype: dict[str, int]
+        :raises FileNotFoundError: If the source_root does not exist.
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+        source_root = source_root.resolve()
+        if not source_root.is_dir():
+            raise FileNotFoundError(f"Import source root does not exist: {source_root}")
+
+        ignore_spec = load_corpus_ignore_spec(self.root)
+        import_id = str(uuid.uuid4())
+        stats = {"scanned": 0, "ignored": 0, "imported": 0}
+
+        for source_path in sorted(source_root.rglob("*")):
+            if not source_path.is_file():
+                continue
+            relative_source_path = source_path.relative_to(source_root).as_posix()
+            stats["scanned"] += 1
+            if ignore_spec.matches(relative_source_path):
+                stats["ignored"] += 1
+                continue
+            self._import_file(
+                source_path=source_path,
+                import_id=import_id,
+                relative_source_path=relative_source_path,
+                tags=tags,
+            )
+            stats["imported"] += 1
+
+        return stats
+
+    def _import_file(
+        self,
+        *,
+        source_path: Path,
+        import_id: str,
+        relative_source_path: str,
+        tags: Sequence[str],
+    ) -> None:
+        """
+        Import a single file into the corpus under an import namespace.
+
+        :param source_path: Source file path to import.
+        :type source_path: Path
+        :param import_id: Import identifier.
+        :type import_id: str
+        :param relative_source_path: Relative path within the imported tree.
+        :type relative_source_path: str
+        :param tags: Tags to apply.
+        :type tags: Sequence[str]
+        :return: None.
+        :rtype: None
+        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
+        """
+        item_id = str(uuid.uuid4())
+        destination_relpath = str(
+            Path(DEFAULT_RAW_DIR) / "imports" / import_id / relative_source_path
+        )
+        destination_path = (self.root / destination_relpath).resolve()
+        destination_path.parent.mkdir(parents=True, exist_ok=True)
+
+        raw_bytes = source_path.read_bytes()
+        sha256_digest = _sha256_bytes(raw_bytes)
+
+        media_type, _ = mimetypes.guess_type(source_path.name)
+        media_type = media_type or "application/octet-stream"
+        if source_path.suffix.lower() in {".md", ".markdown"}:
+            media_type = "text/markdown"
+
+        title: Optional[str] = None
+        frontmatter_metadata: Dict[str, Any] = {}
+        if media_type == "text/markdown":
+            try:
+                text = raw_bytes.decode("utf-8")
+            except UnicodeDecodeError as decode_error:
+                raise ValueError(
+                    f"Markdown file must be Unicode Transformation Format 8: {relative_source_path}"
+                ) from decode_error
+            parsed_document = parse_front_matter(text)
+            frontmatter_metadata = dict(parsed_document.metadata)
+            title_value = frontmatter_metadata.get("title")
+            if isinstance(title_value, str) and title_value.strip():
+                title = title_value.strip()
+
+        destination_path.write_bytes(raw_bytes)
+
+        sidecar: Dict[str, Any] = {}
+        if tags:
+            sidecar["tags"] = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
+        if media_type != "text/markdown":
+            sidecar["media_type"] = media_type
+        sidecar["biblicus"] = {"id": item_id, "source": source_path.as_uri()}
+        _write_sidecar(destination_path, sidecar)
+
+        merged_metadata = _merge_metadata(frontmatter_metadata, sidecar)
+        resolved_tags = _merge_tags([], merged_metadata.get("tags"))
+
+        item_record = CatalogItem(
+            id=item_id,
+            relpath=destination_relpath,
+            sha256=sha256_digest,
+            bytes=len(raw_bytes),
+            media_type=media_type,
+            title=title,
+            tags=list(resolved_tags),
+            metadata=dict(merged_metadata or {}),
+            created_at=utc_now_iso(),
+            source_uri=source_path.as_uri(),
+        )
+        self._upsert_catalog_item(item_record)
+
     def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
         """
         List items from the catalog.
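
A hypothetical end-to-end import (the source directory and tag are illustrative): each file keeps its relative path under an `imports/<import-id>/` namespace inside the raw directory, and ignore rules from `load_corpus_ignore_spec` are applied per file before anything is copied.

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    stats = corpus.import_tree(Path("~/old-notes").expanduser(), tags=["legacy"])
    print(stats)  # e.g. {"scanned": 42, "ignored": 3, "imported": 39}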
@@ -747,11 +1160,8 @@ class Corpus:
         :return: Catalog items ordered by recency.
         :rtype: list[CatalogItem]
         """
-
         catalog = self._load_catalog()
-        ordered_ids = (
-            catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
-        )
+        ordered_ids = catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
         collected_items: List[CatalogItem] = []
         for item_id in ordered_ids:
             item = catalog.items.get(item_id)
@@ -769,7 +1179,6 @@ class Corpus:
         :rtype: CatalogItem
         :raises KeyError: If the item identifier is unknown.
         """
-
         catalog = self._load_catalog()
         item = catalog.items.get(item_id)
         if item is None:
@@ -787,7 +1196,6 @@ class Corpus:
         :rtype: dict[str, int]
         :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
         """
-
         self._init_catalog()
         existing_catalog = self._load_catalog()
         stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}
@@ -862,7 +1270,9 @@ class Corpus:

             previous_item = existing_catalog.items.get(item_id)
             created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
-            source_uri = source_uri or (
+            source_uri = source_uri or (
+                previous_item.source_uri if previous_item is not None else None
+            )

             if previous_item is None:
                 stats["inserted"] += 1
@@ -909,7 +1319,6 @@ class Corpus:
         :return: Corpus name.
         :rtype: str
         """
-
         return self.root.name

     def purge(self, *, confirm: str) -> None:
@@ -922,10 +1331,11 @@ class Corpus:
         :rtype: None
         :raises ValueError: If the confirmation does not match.
         """
-
         expected = self.name
         if confirm != expected:
-            raise ValueError(
+            raise ValueError(
+                f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus"
+            )

         if self.raw_dir.exists():
             shutil.rmtree(self.raw_dir)
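
The purge contract visible in this last hunk: the caller must echo the corpus name (the root directory's name) back verbatim, otherwise `purge` raises `ValueError` before touching the raw tree. A minimal sketch:

    from pathlib import Path

    from biblicus.corpus import Corpus

    corpus = Corpus.find(Path("."))
    corpus.purge(confirm=corpus.name)  # any other string raises ValueError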