PyPI - biblicus - Versions diffs - 0.6.0__py3-none-any.whl - Mend

biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

biblicus/__init__.py +30 -0
biblicus/__main__.py +8 -0
biblicus/_vendor/dotyaml/__init__.py +14 -0
biblicus/_vendor/dotyaml/interpolation.py +63 -0
biblicus/_vendor/dotyaml/loader.py +181 -0
biblicus/_vendor/dotyaml/transformer.py +135 -0
biblicus/backends/__init__.py +42 -0
biblicus/backends/base.py +65 -0
biblicus/backends/scan.py +375 -0
biblicus/backends/sqlite_full_text_search.py +487 -0
biblicus/cli.py +804 -0
biblicus/constants.py +12 -0
biblicus/context.py +183 -0
biblicus/corpus.py +1531 -0
biblicus/crawl.py +186 -0
biblicus/errors.py +15 -0
biblicus/evaluation.py +257 -0
biblicus/evidence_processing.py +201 -0
biblicus/extraction.py +531 -0
biblicus/extractors/__init__.py +44 -0
biblicus/extractors/base.py +68 -0
biblicus/extractors/metadata_text.py +106 -0
biblicus/extractors/openai_stt.py +180 -0
biblicus/extractors/pass_through_text.py +84 -0
biblicus/extractors/pdf_text.py +100 -0
biblicus/extractors/pipeline.py +105 -0
biblicus/extractors/rapidocr_text.py +129 -0
biblicus/extractors/select_longest_text.py +105 -0
biblicus/extractors/select_text.py +100 -0
biblicus/extractors/unstructured_text.py +100 -0
biblicus/frontmatter.py +89 -0
biblicus/hook_logging.py +180 -0
biblicus/hook_manager.py +203 -0
biblicus/hooks.py +261 -0
biblicus/ignore.py +64 -0
biblicus/knowledge_base.py +191 -0
biblicus/models.py +445 -0
biblicus/retrieval.py +133 -0
biblicus/sources.py +212 -0
biblicus/time.py +17 -0
biblicus/uris.py +63 -0
biblicus/user_config.py +138 -0
biblicus-0.6.0.dist-info/METADATA +533 -0
biblicus-0.6.0.dist-info/RECORD +48 -0
biblicus-0.6.0.dist-info/WHEEL +5 -0
biblicus-0.6.0.dist-info/entry_points.txt +2 -0
biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
biblicus-0.6.0.dist-info/top_level.txt +1 -0

biblicus/retrieval.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""
+Shared retrieval helpers for Biblicus backends.
+"""
+from __future__ import annotations
+import hashlib
+import json
+from typing import Any, Dict, Iterable, List, Optional
+from .corpus import Corpus
+from .models import Evidence, QueryBudget, RecipeManifest, RetrievalRun
+from .time import utc_now_iso
+def create_recipe_manifest(
+    *,
+    backend_id: str,
+    name: str,
+    config: Dict[str, Any],
+    description: Optional[str] = None,
+) -> RecipeManifest:
+    """
+    Build a recipe manifest whose identifier is derived from the backend and its configuration.
+    The identifier is the SHA-256 hex digest of ``"<backend_id>:<canonical JSON of config>"``, so
+    the same backend identifier and configuration always produce the same ``recipe_id``. The name,
+    description, and creation timestamp do not take part in the identifier.
+    :param backend_id: Backend identifier for the recipe.
+    :type backend_id: str
+    :param name: Human-readable recipe name.
+    :type name: str
+    :param config: Backend-specific configuration values; must be serializable by ``json.dumps``.
+    :type config: dict[str, Any]
+    :param description: Optional recipe description.
+    :type description: str or None
+    :return: Recipe manifest with a configuration-derived identifier.
+    :rtype: RecipeManifest
+    """
+    # Sorted keys and compact separators give a canonical serialization, so the key order of the
+    # input mapping does not influence the hash.
+    canonical_config = json.dumps(config, separators=(",", ":"), sort_keys=True)
+    digest = hashlib.sha256()
+    digest.update(f"{backend_id}:{canonical_config}".encode("utf-8"))
+    return RecipeManifest(
+        recipe_id=digest.hexdigest(),
+        backend_id=backend_id,
+        name=name,
+        created_at=utc_now_iso(),
+        config=config,
+        description=description,
+    )
+def create_run_manifest(
+    corpus: Corpus,
+    *,
+    recipe: RecipeManifest,
+    stats: Dict[str, Any],
+    artifact_paths: Optional[List[str]] = None,
+) -> RetrievalRun:
+    """
+    Build the manifest for one retrieval run against the corpus catalog as it exists right now.
+    The run identifier is the SHA-256 hex digest of ``"<recipe_id>:<created_at>"``, so it changes
+    with every invocation even when the recipe stays the same.
+    :param corpus: Corpus used to generate the run; its catalog is loaded to record provenance.
+    :type corpus: Corpus
+    :param recipe: Recipe manifest for the run.
+    :type recipe: RecipeManifest
+    :param stats: Backend-specific run statistics.
+    :type stats: dict[str, Any]
+    :param artifact_paths: Optional relative paths to materialized artifacts.
+    :type artifact_paths: list[str] or None
+    :return: Run manifest.
+    :rtype: RetrievalRun
+    """
+    snapshot = corpus.load_catalog()
+    timestamp = utc_now_iso()
+    run_seed = "{}:{}".format(recipe.recipe_id, timestamp)
+    # Copy the caller's list so later mutation on their side cannot alter the manifest.
+    recorded_artifacts = list(artifact_paths) if artifact_paths else []
+    return RetrievalRun(
+        run_id=hashlib.sha256(run_seed.encode("utf-8")).hexdigest(),
+        recipe=recipe,
+        corpus_uri=snapshot.corpus_uri,
+        catalog_generated_at=snapshot.generated_at,
+        created_at=timestamp,
+        artifact_paths=recorded_artifacts,
+        stats=stats,
+    )
+def hash_text(text: str) -> str:
+    """
+    Compute a provenance fingerprint for a piece of text.
+    The text is encoded as UTF-8 before hashing.
+    :param text: Text to hash.
+    :type text: str
+    :return: Secure Hash Algorithm 256 hex digest of the encoded text.
+    :rtype: str
+    """
+    encoded = text.encode("utf-8")
+    digest = hashlib.sha256(encoded)
+    return digest.hexdigest()
+def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evidence]:
+    """
+    Trim a ranked evidence sequence so that it fits within a query budget.
+    Candidates are visited in the order given. Iteration stops once ``max_total_items`` have been
+    kept. A candidate that would exceed the per-source cap or the character cap is skipped, and
+    later candidates are still considered. Kept items are returned as copies with ``rank``
+    renumbered from 1.
+    :param evidence: Ranked evidence iterable (highest score first).
+    :type evidence: Iterable[Evidence]
+    :param budget: Budget constraints to enforce.
+    :type budget: QueryBudget
+    :return: Evidence list respecting the budget.
+    :rtype: list[Evidence]
+    """
+    kept: List[Evidence] = []
+    kept_per_source: Dict[str, int] = {}
+    characters_used = 0
+    for item in evidence:
+        if len(kept) >= budget.max_total_items:
+            break
+        # Items without a source URI are grouped by their own identifier.
+        key = item.source_uri or item.item_id
+        already_kept = kept_per_source.get(key, 0)
+        source_cap = budget.max_items_per_source
+        if source_cap is not None and already_kept >= source_cap:
+            continue
+        size = len(item.text or "")
+        character_cap = budget.max_total_characters
+        if character_cap is not None and characters_used + size > character_cap:
+            continue
+        kept.append(item)
+        kept_per_source[key] = already_kept + 1
+        characters_used += size
+    reranked: List[Evidence] = []
+    for position, item in enumerate(kept, start=1):
+        reranked.append(item.model_copy(update={"rank": position}))
+    return reranked

biblicus/sources.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""
+Source loading helpers for Biblicus ingestion.
+"""
+from __future__ import annotations
+import mimetypes
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+from urllib.parse import unquote, urlparse
+from urllib.request import Request, urlopen
+def _looks_like_uri(value: str) -> bool:
+    """
+    Check whether a string starts with a syntactically valid uniform resource identifier scheme.
+    The text before the first ``://`` must follow the RFC 3986 scheme grammar: an ASCII letter
+    followed by any mix of ASCII letters, digits, ``+``, ``-``, and ``.``. This accepts schemes
+    such as ``git+ssh`` and rejects prefixes that are not schemes, for example ones starting with
+    an underscore or containing non-ASCII letters.
+    :param value: Candidate string.
+    :type value: str
+    :return: True if the string has a valid uniform resource identifier scheme prefix.
+    :rtype: bool
+    """
+    scheme, separator, _ = value.partition("://")
+    if not separator or not scheme:
+        return False
+    first = scheme[0]
+    if not (first.isascii() and first.isalpha()):
+        return False
+    return all(
+        character.isascii() and (character.isalnum() or character in "+-.")
+        for character in scheme
+    )
+def _filename_from_url_path(path: str) -> str:
+    """
+    Derive a filename from a uniform resource locator path.
+    The path is percent-decoded and its final component is used. When there is no usable final
+    component (empty path, ``/``, or a path ending in a ``.``/``..`` directory reference), the
+    fallback name ``download`` is returned.
+    :param path: Uniform resource locator path component.
+    :type path: str
+    :return: Filename or a fallback name.
+    :rtype: str
+    """
+    filename = Path(unquote(path)).name
+    # "." and ".." name directories, not files; handing them back as a filename would let a URL
+    # such as ``http://host/a/..`` produce a parent-directory reference.
+    if filename in {"", ".", ".."}:
+        return "download"
+    return filename
+def _media_type_from_filename(name: str) -> str:
+    """
+    Look up the media type that ``mimetypes`` associates with a filename.
+    :param name: Filename to inspect.
+    :type name: str
+    :return: Guessed media type, or ``application/octet-stream`` when no guess is available.
+    :rtype: str
+    """
+    guessed = mimetypes.guess_type(name)[0]
+    if guessed:
+        return guessed
+    return "application/octet-stream"
+def _is_mpeg_audio_frame_header(header: bytes) -> bool:
+    """
+    Check whether the leading bytes form a plausible MPEG audio frame header.
+    Beyond the 11-bit frame sync, the header must not use the reserved version, reserved layer,
+    invalid bitrate index, or reserved sampling-rate index. Without these checks any payload
+    starting with ``0xFF 0xEx`` (for example a UTF-16 byte order mark) would look like audio.
+    :param header: Leading bytes of the payload.
+    :type header: bytes
+    :return: True if the first three bytes are a valid MPEG audio frame header prefix.
+    :rtype: bool
+    """
+    if len(header) < 3 or header[0] != 0xFF or (header[1] & 0xE0) != 0xE0:
+        return False
+    version_bits = (header[1] >> 3) & 0x03
+    layer_bits = (header[1] >> 1) & 0x03
+    bitrate_index = (header[2] >> 4) & 0x0F
+    sample_rate_index = (header[2] >> 2) & 0x03
+    return (
+        version_bits != 0x01
+        and layer_bits != 0x00
+        and bitrate_index != 0x0F
+        and sample_rate_index != 0x03
+    )
+def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
+    """
+    Sniff a media type from leading bytes for a small set of common formats.
+    Only the first 32 bytes are inspected. Recognized formats are PDF, PNG, JPEG, WAVE, MPEG audio
+    (ID3 tag or a valid frame header), Ogg, and HTML (doctype or ``<html`` after optional leading
+    whitespace). JPEG is tested before MPEG audio because both begin with ``0xFF``.
+    :param data: Raw bytes to inspect.
+    :type data: bytes
+    :return: Detected media type or None.
+    :rtype: str or None
+    """
+    prefix = data[:32]
+    if prefix.startswith(b"%PDF-"):
+        return "application/pdf"
+    if prefix.startswith(b"\x89PNG\r\n\x1a\n"):
+        return "image/png"
+    if prefix[:3] == b"\xff\xd8\xff":
+        return "image/jpeg"
+    if prefix.startswith(b"RIFF") and prefix[8:12] == b"WAVE":
+        return "audio/x-wav"
+    if prefix.startswith(b"ID3") or _is_mpeg_audio_frame_header(prefix):
+        return "audio/mpeg"
+    if prefix.startswith(b"OggS"):
+        return "audio/ogg"
+    if prefix.lstrip().lower().startswith((b"<!doctype html", b"<html")):
+        return "text/html"
+    return None
+def _normalize_media_type(*, filename: str, media_type: str) -> str:
+    """
+    Correct media types that upstream sources commonly report in a less useful form.
+    At present one rule applies: an Ogg container type (``application/ogg`` or
+    ``application/x-ogg``) combined with an ``.ogg``, ``.oga``, or ``.ogx`` filename suffix is
+    reported as ``audio/ogg``, so that the media type agrees with the extension users recognize.
+    Every other combination is returned unchanged.
+    :param filename: Filename associated with the payload.
+    :type filename: str
+    :param media_type: Media type reported or guessed for the payload.
+    :type media_type: str
+    :return: Normalized media type.
+    :rtype: str
+    """
+    ogg_container_types = ("application/ogg", "application/x-ogg")
+    ogg_suffixes = (".ogg", ".oga", ".ogx")
+    extension = Path(filename).suffix.lower()
+    if media_type not in ogg_container_types:
+        return media_type
+    if extension not in ogg_suffixes:
+        return media_type
+    return "audio/ogg"
+def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
+    """
+    Append an extension matching the media type when the filename has none.
+    A filename that already has a suffix is returned untouched. ``audio/ogg`` always maps to
+    ``.ogg``; other media types use whatever ``mimetypes.guess_extension`` suggests. When no
+    extension is known the filename is returned unchanged.
+    :param filename: Filename candidate.
+    :type filename: str
+    :param media_type: Media type to target.
+    :type media_type: str
+    :return: Filename with extension.
+    :rtype: str
+    """
+    if Path(filename).suffix:
+        return filename
+    extension = ".ogg" if media_type == "audio/ogg" else (mimetypes.guess_extension(media_type) or "")
+    if not extension:
+        return filename
+    return filename + extension
+@dataclass(frozen=True)
+class SourcePayload:
+    """
+    Immutable record of bytes loaded from a source together with the metadata used for ingestion.
+    :ivar data: Raw bytes from the source.
+    :vartype data: bytes
+    :ivar filename: Suggested filename for the payload.
+    :vartype filename: str
+    :ivar media_type: Internet Assigned Numbers Authority media type for the payload.
+    :vartype media_type: str
+    :ivar source_uri: Source uniform resource identifier used to load the payload.
+    :vartype source_uri: str
+    """
+    # Payload content, exactly as read.
+    data: bytes
+    # Name to store the payload under.
+    filename: str
+    # Media type string describing ``data``.
+    media_type: str
+    # Where the payload came from.
+    source_uri: str
+def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> SourcePayload:
+    """
+    Load bytes from a source reference.
+    Three kinds of reference are handled: a ``Path`` (read from disk), a ``file://`` uniform
+    resource identifier on the local host (converted to a path and read from disk), and an
+    ``http``/``https`` uniform resource locator (downloaded with a 30 second timeout). Any other
+    string is treated as a filesystem path. Filenames ending in ``.md`` or ``.markdown`` are
+    always reported as ``text/markdown``.
+    :param source: File path or uniform resource locator to load.
+    :type source: str or Path
+    :param source_uri: Optional override for the source uniform resource identifier.
+    :type source_uri: str or None
+    :return: Source payload with bytes and metadata.
+    :rtype: SourcePayload
+    :raises ValueError: If a file:// uniform resource identifier has a non-local host.
+    :raises NotImplementedError: If the uniform resource identifier scheme is unsupported.
+    """
+    if isinstance(source, Path):
+        path = source.resolve()
+        media_type = _media_type_from_filename(path.name)
+        if path.suffix.lower() in {".md", ".markdown"}:
+            media_type = "text/markdown"
+        return SourcePayload(
+            data=path.read_bytes(),
+            filename=path.name,
+            media_type=media_type,
+            source_uri=source_uri or path.as_uri(),
+        )
+    if _looks_like_uri(source):
+        parsed = urlparse(source)
+        if parsed.scheme == "file":
+            if parsed.netloc not in ("", "localhost"):
+                raise ValueError(
+                    f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
+                )
+            path = Path(unquote(parsed.path)).resolve()
+            # Keep the caller's original reference as the recorded source unless overridden.
+            return load_source(path, source_uri=source_uri or source)
+        if parsed.scheme in {"http", "https"}:
+            request = Request(source, headers={"User-Agent": "biblicus/0"})
+            with urlopen(request, timeout=30) as response:
+                response_bytes = response.read()
+                content_type_header = response.headers.get("Content-Type", "")
+            # Drop parameters such as "; charset=utf-8". Media type names are case-insensitive,
+            # so lowercase them; otherwise "Application/Octet-Stream" would bypass the sniffing
+            # below and mixed-case types would leak into the payload.
+            content_type = content_type_header.split(";", 1)[0].strip().lower()
+            filename = _filename_from_url_path(parsed.path)
+            media_type = content_type or _media_type_from_filename(filename)
+            if media_type == "application/octet-stream":
+                # A generic type carries no information, so inspect the leading bytes instead.
+                sniffed = _sniff_media_type_from_bytes(response_bytes)
+                if sniffed:
+                    media_type = sniffed
+                    filename = _ensure_extension_for_media_type(filename, media_type)
+            media_type = _normalize_media_type(filename=filename, media_type=media_type)
+            if Path(filename).suffix.lower() in {".md", ".markdown"}:
+                media_type = "text/markdown"
+            return SourcePayload(
+                data=response_bytes,
+                filename=filename,
+                media_type=media_type,
+                source_uri=source_uri or source,
+            )
+        raise NotImplementedError(
+            f"Unsupported source uniform resource identifier scheme: {parsed.scheme}://"
+        )
+    path = Path(source).resolve()
+    return load_source(path, source_uri=source_uri)

biblicus/time.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""
+Time utilities for Biblicus.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+def utc_now_iso() -> str:
+    """
+    Format the current Coordinated Universal Time as an International Organization for Standardization 8601 string.
+    The value is timezone-aware and always includes six fractional-second digits.
+    :return: Current Coordinated Universal Time timestamp in International Organization for Standardization 8601 format.
+    :rtype: str
+    """
+    current_moment = datetime.now(tz=timezone.utc)
+    return current_moment.isoformat(timespec="microseconds")

biblicus/uris.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""
+Uniform resource identifier and path helpers for Biblicus corpora.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Union
+from urllib.parse import unquote, urlparse
+def _looks_like_uri(value: str) -> bool:
+    """
+    Check whether a string starts with a syntactically valid uniform resource identifier scheme.
+    The text before the first ``://`` must follow the RFC 3986 scheme grammar: an ASCII letter
+    followed by any mix of ASCII letters, digits, ``+``, ``-``, and ``.``. This accepts schemes
+    such as ``git+ssh`` and rejects prefixes that are not schemes, for example ones starting with
+    an underscore or containing non-ASCII letters.
+    :param value: Candidate string.
+    :type value: str
+    :return: True if the string has a valid uniform resource identifier scheme prefix.
+    :rtype: bool
+    """
+    scheme, separator, _ = value.partition("://")
+    if not separator or not scheme:
+        return False
+    first = scheme[0]
+    if not (first.isascii() and first.isalpha()):
+        return False
+    return all(
+        character.isascii() and (character.isalnum() or character in "+-.")
+        for character in scheme
+    )
+def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
+    """
+    Resolve a corpus reference to an absolute filesystem path.
+    ``Path`` objects and plain strings are resolved directly. Strings that look like a uniform
+    resource identifier must use the ``file`` scheme with an empty or ``localhost`` host; their
+    path component is percent-decoded before being resolved.
+    :param ref: Filesystem path or file:// uniform resource identifier.
+    :type ref: str or Path
+    :return: Resolved filesystem path.
+    :rtype: Path
+    :raises NotImplementedError: If a non-file uniform resource identifier scheme is used.
+    :raises ValueError: If a file:// uniform resource identifier has a non-local host.
+    """
+    if isinstance(ref, Path):
+        return ref.resolve()
+    if not _looks_like_uri(ref):
+        return Path(ref).resolve()
+    components = urlparse(ref)
+    scheme = components.scheme
+    if scheme != "file":
+        raise NotImplementedError(
+            "Only file:// corpus uniform resource identifiers are supported in version zero "
+            f"(got {scheme}://)"
+        )
+    host = components.netloc
+    if host not in ("", "localhost"):
+        raise ValueError(f"Unsupported file uniform resource identifier host: {host!r}")
+    return Path(unquote(components.path)).resolve()
+def normalize_corpus_uri(ref: Union[str, Path]) -> str:
+    """
+    Produce the canonical file:// uniform resource identifier for a corpus reference.
+    The reference is first resolved with ``corpus_ref_to_path`` and the resulting absolute path is
+    rendered with ``Path.as_uri``.
+    :param ref: Filesystem path or file:// uniform resource identifier.
+    :type ref: str or Path
+    :return: Canonical file:// uniform resource identifier.
+    :rtype: str
+    """
+    resolved_path = corpus_ref_to_path(ref)
+    return resolved_path.as_uri()

biblicus/user_config.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""
+User configuration file loading for Biblicus.
+User configuration is intended for small, local settings such as credentials for optional
+integrations. It is separate from corpus configuration.
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Any, Dict, Optional
+from pydantic import BaseModel, ConfigDict, Field
+from ._vendor.dotyaml import ConfigLoader
+class OpenAiUserConfig(BaseModel):
+    """
+    Settings for OpenAI integrations.
+    Unknown keys are rejected, and the API key must be a non-empty string.
+    :ivar api_key: OpenAI API key used for authenticated requests.
+    :vartype api_key: str
+    """
+    # Reject unrecognized keys instead of silently ignoring them.
+    model_config = ConfigDict(extra="forbid")
+    # Required field: the ellipsis marks it as having no default.
+    api_key: str = Field(..., min_length=1)
+class BiblicusUserConfig(BaseModel):
+    """
+    Top-level user configuration model for Biblicus.
+    Unknown top-level keys are rejected. Every section is optional.
+    :ivar openai: Optional OpenAI configuration.
+    :vartype openai: OpenAiUserConfig or None
+    """
+    # Reject unrecognized keys instead of silently ignoring them.
+    model_config = ConfigDict(extra="forbid")
+    # Absent unless the configuration contains an ``openai`` section.
+    openai: Optional[OpenAiUserConfig] = Field(default=None)
+def default_user_config_paths(
+    *, cwd: Optional[Path] = None, home: Optional[Path] = None
+) -> list[Path]:
+    """
+    List the locations searched for user configuration files, lowest precedence first.
+    The returned order is:
+    1. Home configuration: ``~/.biblicus/config.yml``
+    2. Local configuration: ``./.biblicus/config.yml``
+    Local configuration overrides home configuration when both exist.
+    :param cwd: Optional working directory to use instead of the process current directory.
+    :type cwd: Path or None
+    :param home: Optional home directory to use instead of the current user's home directory.
+    :type home: Path or None
+    :return: Ordered list of configuration file paths.
+    :rtype: list[Path]
+    """
+    # Only the home directory is passed through expanduser; the working directory is used as-is.
+    home_directory = (home or Path.home()).expanduser()
+    working_directory = cwd or Path.cwd()
+    relative_config = Path(".biblicus") / "config.yml"
+    return [home_directory / relative_config, working_directory / relative_config]
+def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Combine two mappings, letting ``override`` win on conflicts.
+    When both sides hold a dictionary under the same key, the two are merged recursively. In every
+    other case the value from ``override`` replaces the value from ``base``. Neither input is
+    modified, but nested values that need no merging are shared with the inputs, not copied.
+    :param base: Mapping providing the initial values.
+    :type base: dict[str, Any]
+    :param override: Mapping whose values take precedence.
+    :type override: dict[str, Any]
+    :return: New merged mapping.
+    :rtype: dict[str, Any]
+    """
+    combined: Dict[str, Any] = dict(base)
+    for name, incoming in override.items():
+        existing = combined.get(name)
+        if isinstance(existing, dict) and isinstance(incoming, dict):
+            combined[name] = _deep_merge(existing, incoming)
+            continue
+        combined[name] = incoming
+    return combined
+def _load_dotyaml_data(path: Path) -> Dict[str, Any]:
+    """
+    Read one configuration file through the vendored dotyaml loader.
+    The loader is created with an empty prefix and with dotenv pre-loading disabled. Any result
+    that is not a dictionary is replaced by an empty mapping.
+    :param path: Configuration file path.
+    :type path: Path
+    :return: Parsed YAML data mapping.
+    :rtype: dict[str, Any]
+    """
+    parsed = ConfigLoader(prefix="", load_dotenv_first=False).load_from_yaml(path)
+    if isinstance(parsed, dict):
+        return parsed
+    return {}
+def load_user_config(*, paths: Optional[list[Path]] = None) -> BiblicusUserConfig:
+    """
+    Load user configuration from known locations.
+    This function merges multiple configuration files in order. Later files override earlier files.
+    Paths that are not existing files are skipped.
+    :param paths: Optional explicit search paths. When omitted (``None``), the default paths are
+        used. An explicit empty list means "read no files" and yields an empty configuration.
+    :type paths: list[Path] or None
+    :return: Parsed user configuration. When no files exist, the configuration is empty.
+    :rtype: BiblicusUserConfig
+    :raises ValueError: If the merged data fails model validation. Errors raised by the
+        configuration loader for an unreadable file also propagate (exact type not visible here).
+    """
+    # Test for None explicitly: ``paths or defaults`` would treat an empty list as "omitted" and
+    # silently read the user's real configuration files.
+    search_paths = default_user_config_paths() if paths is None else paths
+    merged_data: Dict[str, Any] = {}
+    for path in search_paths:
+        if not path.is_file():
+            continue
+        merged_data = _deep_merge(merged_data, _load_dotyaml_data(path))
+    return BiblicusUserConfig.model_validate(merged_data)
+def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
+    """
+    Find the OpenAI API key to use, preferring the environment over user configuration.
+    A non-empty ``OPENAI_API_KEY`` environment variable wins. Otherwise the key is taken from the
+    ``openai`` section of the supplied configuration, or of the configuration loaded from the
+    default locations when none is supplied.
+    :param config: Optional pre-loaded user configuration.
+    :type config: BiblicusUserConfig or None
+    :return: API key string, or None when no key is available.
+    :rtype: str or None
+    """
+    from_environment = os.environ.get("OPENAI_API_KEY")
+    if from_environment:
+        return from_environment
+    # Configuration files are only read when the environment did not supply a key.
+    user_config = config or load_user_config()
+    openai_section = user_config.openai
+    return None if openai_section is None else openai_section.api_key