PyPI - biblicus - Versions diffs - 0.6.0__py3-none-any.whl - Mend

biblicus 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

biblicus/__init__.py +30 -0
biblicus/__main__.py +8 -0
biblicus/_vendor/dotyaml/__init__.py +14 -0
biblicus/_vendor/dotyaml/interpolation.py +63 -0
biblicus/_vendor/dotyaml/loader.py +181 -0
biblicus/_vendor/dotyaml/transformer.py +135 -0
biblicus/backends/__init__.py +42 -0
biblicus/backends/base.py +65 -0
biblicus/backends/scan.py +375 -0
biblicus/backends/sqlite_full_text_search.py +487 -0
biblicus/cli.py +804 -0
biblicus/constants.py +12 -0
biblicus/context.py +183 -0
biblicus/corpus.py +1531 -0
biblicus/crawl.py +186 -0
biblicus/errors.py +15 -0
biblicus/evaluation.py +257 -0
biblicus/evidence_processing.py +201 -0
biblicus/extraction.py +531 -0
biblicus/extractors/__init__.py +44 -0
biblicus/extractors/base.py +68 -0
biblicus/extractors/metadata_text.py +106 -0
biblicus/extractors/openai_stt.py +180 -0
biblicus/extractors/pass_through_text.py +84 -0
biblicus/extractors/pdf_text.py +100 -0
biblicus/extractors/pipeline.py +105 -0
biblicus/extractors/rapidocr_text.py +129 -0
biblicus/extractors/select_longest_text.py +105 -0
biblicus/extractors/select_text.py +100 -0
biblicus/extractors/unstructured_text.py +100 -0
biblicus/frontmatter.py +89 -0
biblicus/hook_logging.py +180 -0
biblicus/hook_manager.py +203 -0
biblicus/hooks.py +261 -0
biblicus/ignore.py +64 -0
biblicus/knowledge_base.py +191 -0
biblicus/models.py +445 -0
biblicus/retrieval.py +133 -0
biblicus/sources.py +212 -0
biblicus/time.py +17 -0
biblicus/uris.py +63 -0
biblicus/user_config.py +138 -0
biblicus-0.6.0.dist-info/METADATA +533 -0
biblicus-0.6.0.dist-info/RECORD +48 -0
biblicus-0.6.0.dist-info/WHEEL +5 -0
biblicus-0.6.0.dist-info/entry_points.txt +2 -0
biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
biblicus-0.6.0.dist-info/top_level.txt +1 -0

biblicus/knowledge_base.py ADDED Viewed

@@ -0,0 +1,191 @@
+"""
+High-level knowledge base workflow for turnkey usage.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import List, Optional, Sequence
+from pydantic import BaseModel, ConfigDict, Field
+from .backends import get_backend
+from .context import (
+    ContextPack,
+    ContextPackPolicy,
+    TokenBudget,
+    build_context_pack,
+    fit_context_pack_to_token_budget,
+)
+from .corpus import Corpus
+from .models import QueryBudget, RetrievalResult, RetrievalRun
+class KnowledgeBaseDefaults(BaseModel):
+    """
+    Default configuration for a knowledge base workflow.
+    Every field declares a default, so ``KnowledgeBaseDefaults()`` with no
+    arguments yields a complete configuration.
+    :ivar backend_id: Backend identifier to use for retrieval (default ``"scan"``).
+    :vartype backend_id: str
+    :ivar recipe_name: Human-readable retrieval recipe name (default ``"Knowledge base"``).
+    :vartype recipe_name: str
+    :ivar query_budget: Default query budget to apply to retrieval.
+    :vartype query_budget: QueryBudget
+    :ivar tags: Tags to apply when importing the folder (default: empty list).
+    :vartype tags: list[str]
+    """
+    # Unknown keyword arguments are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    # min_length=1 rules out empty identifiers and names.
+    backend_id: str = Field(default="scan", min_length=1)
+    recipe_name: str = Field(default="Knowledge base", min_length=1)
+    # Built through a factory so each instance receives its own QueryBudget
+    # object: at most 5 items, 2000 characters in total, and no per-source
+    # value configured.
+    query_budget: QueryBudget = Field(
+        default_factory=lambda: QueryBudget(
+            max_total_items=5,
+            max_total_characters=2000,
+            max_items_per_source=None,
+        )
+    )
+    # default_factory avoids sharing one mutable list between instances.
+    tags: List[str] = Field(default_factory=list)
+@dataclass
+class KnowledgeBase:
+    """
+    High-level knowledge base wrapper for turnkey workflows.
+    :ivar corpus: Corpus instance that stores the ingested items.
+    :vartype corpus: Corpus
+    :ivar backend_id: Backend identifier used for retrieval.
+    :vartype backend_id: str
+    :ivar run: Retrieval run manifest associated with the knowledge base.
+    :vartype run: RetrievalRun
+    :ivar defaults: Default configuration used for this knowledge base.
+    :vartype defaults: KnowledgeBaseDefaults
+    """
+    corpus: Corpus
+    backend_id: str
+    run: RetrievalRun
+    defaults: KnowledgeBaseDefaults
+    _temp_dir: Optional[TemporaryDirectory]
+    @classmethod
+    def from_folder(
+        cls,
+        folder: str | Path,
+        *,
+        backend_id: Optional[str] = None,
+        recipe_name: Optional[str] = None,
+        query_budget: Optional[QueryBudget] = None,
+        tags: Optional[Sequence[str]] = None,
+        corpus_root: Optional[str | Path] = None,
+    ) -> "KnowledgeBase":
+        """
+        Build a knowledge base from a folder of files.
+        :param folder: Folder containing source files.
+        :type folder: str or Path
+        :param backend_id: Optional backend identifier override.
+        :type backend_id: str or None
+        :param recipe_name: Optional recipe name override.
+        :type recipe_name: str or None
+        :param query_budget: Optional query budget override.
+        :type query_budget: QueryBudget or None
+        :param tags: Optional tags to apply during import.
+        :type tags: Sequence[str] or None
+        :param corpus_root: Optional corpus root override.
+        :type corpus_root: str or Path or None
+        :return: Knowledge base instance.
+        :rtype: KnowledgeBase
+        :raises FileNotFoundError: If the folder does not exist.
+        :raises NotADirectoryError: If the folder is not a directory.
+        """
+        source_root = Path(folder).resolve()
+        if not source_root.exists():
+            raise FileNotFoundError(f"Knowledge base folder does not exist: {source_root}")
+        if not source_root.is_dir():
+            raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")
+        defaults = KnowledgeBaseDefaults()
+        resolved_backend_id = backend_id or defaults.backend_id
+        resolved_recipe_name = recipe_name or defaults.recipe_name
+        resolved_query_budget = query_budget or defaults.query_budget
+        resolved_tags = list(tags) if tags is not None else defaults.tags
+        temp_dir: Optional[TemporaryDirectory] = None
+        if corpus_root is None:
+            temp_dir = TemporaryDirectory(prefix="biblicus-knowledge-base-")
+            corpus_root_path = Path(temp_dir.name) / "corpus"
+        else:
+            corpus_root_path = Path(corpus_root).resolve()
+        corpus = Corpus.init(corpus_root_path)
+        corpus.import_tree(source_root, tags=resolved_tags)
+        backend = get_backend(resolved_backend_id)
+        run = backend.build_run(corpus, recipe_name=resolved_recipe_name, config={})
+        return cls(
+            corpus=corpus,
+            backend_id=resolved_backend_id,
+            run=run,
+            defaults=KnowledgeBaseDefaults(
+                backend_id=resolved_backend_id,
+                recipe_name=resolved_recipe_name,
+                query_budget=resolved_query_budget,
+                tags=resolved_tags,
+            ),
+            _temp_dir=temp_dir,
+        )
+    def query(self, query_text: str, *, budget: Optional[QueryBudget] = None) -> RetrievalResult:
+        """
+        Query the knowledge base for evidence.
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Optional budget override.
+        :type budget: QueryBudget or None
+        :return: Retrieval result containing evidence.
+        :rtype: RetrievalResult
+        """
+        backend = get_backend(self.backend_id)
+        resolved_budget = budget or self.defaults.query_budget
+        return backend.query(
+            self.corpus,
+            run=self.run,
+            query_text=query_text,
+            budget=resolved_budget,
+        )
+    def context_pack(
+        self,
+        result: RetrievalResult,
+        *,
+        join_with: str = "\n\n",
+        max_tokens: Optional[int] = None,
+    ) -> ContextPack:
+        """
+        Build a context pack from a retrieval result.
+        :param result: Retrieval result to convert into context.
+        :type result: RetrievalResult
+        :param join_with: Join string for evidence blocks.
+        :type join_with: str
+        :param max_tokens: Optional token budget for the context pack.
+        :type max_tokens: int or None
+        :return: Context pack text and metadata.
+        :rtype: ContextPack
+        """
+        policy = ContextPackPolicy(join_with=join_with)
+        context_pack = build_context_pack(result, policy=policy)
+        if max_tokens is None:
+            return context_pack
+        return fit_context_pack_to_token_budget(
+            context_pack,
+            policy=policy,
+            token_budget=TokenBudget(max_tokens=max_tokens),
+        )

biblicus/models.py ADDED Viewed

@@ -0,0 +1,445 @@
+"""
+Pydantic models for Biblicus domain concepts.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from .constants import SCHEMA_VERSION
+from .hooks import HookSpec
+class CorpusConfig(BaseModel):
+    """
+    Canonical on-disk config for a local Biblicus corpus.
+    :ivar schema_version: Version of the corpus config schema.
+    :vartype schema_version: int
+    :ivar created_at: International Organization for Standardization 8601 timestamp for corpus creation.
+    :vartype created_at: str
+    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
+    :vartype corpus_uri: str
+    :ivar raw_dir: Relative path to the raw items folder.
+    :vartype raw_dir: str
+    :ivar notes: Optional free-form notes for operators.
+    :vartype notes: dict[str, Any] or None
+    :ivar hooks: Optional hook specifications for corpus lifecycle events.
+    :vartype hooks: list[HookSpec] or None
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: int = Field(ge=1)
+    created_at: str
+    corpus_uri: str
+    raw_dir: str = "raw"
+    notes: Optional[Dict[str, Any]] = None
+    hooks: Optional[List[HookSpec]] = None
+    @model_validator(mode="after")
+    def _enforce_schema_version(self) -> "CorpusConfig":
+        if self.schema_version != SCHEMA_VERSION:
+            raise ValueError(f"Unsupported corpus config schema version: {self.schema_version}")
+        return self
+class IngestResult(BaseModel):
+    """
+    Minimal summary for an ingestion event.
+    All three fields are required; none declares a default.
+    :ivar item_id: Universally unique identifier assigned to the ingested item.
+    :vartype item_id: str
+    :ivar relpath: Relative path to the raw item file.
+    :vartype relpath: str
+    :ivar sha256: Secure Hash Algorithm 256 digest of the stored bytes.
+    :vartype sha256: str
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    item_id: str
+    relpath: str
+    # Plain str: this model places no length or format constraint on the digest.
+    sha256: str
+class CatalogItem(BaseModel):
+    """
+    Catalog entry derived from a raw corpus item.
+    :ivar id: Universally unique identifier of the item.
+    :vartype id: str
+    :ivar relpath: Relative path to the raw item file.
+    :vartype relpath: str
+    :ivar sha256: Secure Hash Algorithm 256 digest of the stored bytes.
+    :vartype sha256: str
+    :ivar bytes: Size of the raw item in bytes.
+    :vartype bytes: int
+    :ivar media_type: Internet Assigned Numbers Authority media type for the item.
+    :vartype media_type: str
+    :ivar title: Optional human title extracted from metadata.
+    :vartype title: str or None
+    :ivar tags: Tags extracted or supplied for the item.
+    :vartype tags: list[str]
+    :ivar metadata: Merged front matter or sidecar metadata.
+    :vartype metadata: dict[str, Any]
+    :ivar created_at: International Organization for Standardization 8601 timestamp when the item was first indexed.
+    :vartype created_at: str
+    :ivar source_uri: Optional source uniform resource identifier used at ingestion time.
+    :vartype source_uri: str or None
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    # `id` and `bytes` reuse builtin names; they are field names of the model.
+    id: str
+    relpath: str
+    sha256: str
+    # A size, not a payload; ge=0 permits zero-length items.
+    bytes: int = Field(ge=0)
+    media_type: str
+    title: Optional[str] = None
+    # Factories give every instance its own list / dict.
+    tags: List[str] = Field(default_factory=list)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+    created_at: str
+    source_uri: Optional[str] = None
+class CorpusCatalog(BaseModel):
+    """
+    Snapshot of the derived corpus catalog.
+    :ivar schema_version: Version of the catalog schema.
+    :vartype schema_version: int
+    :ivar generated_at: International Organization for Standardization 8601 timestamp of catalog generation.
+    :vartype generated_at: str
+    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
+    :vartype corpus_uri: str
+    :ivar raw_dir: Relative path to the raw items folder.
+    :vartype raw_dir: str
+    :ivar latest_run_id: Latest retrieval run identifier, if any.
+    :vartype latest_run_id: str or None
+    :ivar items: Mapping of item IDs to catalog entries.
+    :vartype items: dict[str, CatalogItem]
+    :ivar order: Display order of item IDs (most recent first).
+    :vartype order: list[str]
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: int = Field(ge=1)
+    generated_at: str
+    corpus_uri: str
+    raw_dir: str = "raw"
+    latest_run_id: Optional[str] = None
+    items: Dict[str, CatalogItem] = Field(default_factory=dict)
+    order: List[str] = Field(default_factory=list)
+    @model_validator(mode="after")
+    def _enforce_schema_version(self) -> "CorpusCatalog":
+        if self.schema_version != SCHEMA_VERSION:
+            raise ValueError(f"Unsupported catalog schema version: {self.schema_version}")
+        return self
+class ExtractionRunReference(BaseModel):
+    """
+    Reference to an extraction run.
+    :ivar extractor_id: Extractor plugin identifier.
+    :vartype extractor_id: str
+    :ivar run_id: Extraction run identifier.
+    :vartype run_id: str
+    """
+    model_config = ConfigDict(extra="forbid")
+    extractor_id: str = Field(min_length=1)
+    run_id: str = Field(min_length=1)
+    def as_string(self) -> str:
+        """
+        Serialize the reference as a single string.
+        :return: Reference in the form extractor_id:run_id.
+        :rtype: str
+        """
+        return f"{self.extractor_id}:{self.run_id}"
+def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
+    """
+    Parse an extraction run reference in the form extractor_id:run_id.
+    :param value: Raw reference string.
+    :type value: str
+    :return: Parsed extraction run reference.
+    :rtype: ExtractionRunReference
+    :raises ValueError: If the reference is not well formed.
+    """
+    if ":" not in value:
+        raise ValueError("Extraction run reference must be extractor_id:run_id")
+    extractor_id, run_id = value.split(":", 1)
+    extractor_id = extractor_id.strip()
+    run_id = run_id.strip()
+    if not extractor_id or not run_id:
+        raise ValueError(
+            "Extraction run reference must be extractor_id:run_id with non-empty parts"
+        )
+    return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
+class ExtractionRunListEntry(BaseModel):
+    """
+    Summary entry for an extraction run stored in a corpus.
+    Every string field is required and must be non-empty.
+    :ivar extractor_id: Extractor plugin identifier.
+    :vartype extractor_id: str
+    :ivar run_id: Extraction run identifier.
+    :vartype run_id: str
+    :ivar recipe_id: Deterministic recipe identifier.
+    :vartype recipe_id: str
+    :ivar recipe_name: Human-readable recipe name.
+    :vartype recipe_name: str
+    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :vartype catalog_generated_at: str
+    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :vartype created_at: str
+    :ivar stats: Run statistics.
+    :vartype stats: dict[str, object]
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    extractor_id: str = Field(min_length=1)
+    run_id: str = Field(min_length=1)
+    recipe_id: str = Field(min_length=1)
+    recipe_name: str = Field(min_length=1)
+    # Timestamps are only checked for non-emptiness here, not for format.
+    catalog_generated_at: str = Field(min_length=1)
+    created_at: str = Field(min_length=1)
+    # Values are typed as object; no shape is imposed on the statistics.
+    stats: Dict[str, object] = Field(default_factory=dict)
+class QueryBudget(BaseModel):
+    """
+    Evidence selection budget for retrieval.
+    This model only validates the limits; nothing in this class applies them.
+    :ivar max_total_items: Maximum number of evidence items to return.
+    :vartype max_total_items: int
+    :ivar max_total_characters: Optional maximum total characters across evidence text.
+    :vartype max_total_characters: int or None
+    :ivar max_items_per_source: Optional cap per source uniform resource identifier.
+    :vartype max_items_per_source: int or None
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    # Required, and at least 1.
+    max_total_items: int = Field(ge=1)
+    # Optional limits: when given they must be >= 1.
+    # NOTE(review): None presumably means "no limit" -- confirm against the backends.
+    max_total_characters: Optional[int] = Field(default=None, ge=1)
+    max_items_per_source: Optional[int] = Field(default=None, ge=1)
+class Evidence(BaseModel):
+    """
+    Structured retrieval evidence returned from a backend.
+    :ivar item_id: Item identifier that produced the evidence.
+    :vartype item_id: str
+    :ivar source_uri: Source uniform resource identifier from ingestion metadata.
+    :vartype source_uri: str or None
+    :ivar media_type: Media type for the evidence item.
+    :vartype media_type: str
+    :ivar score: Retrieval score (higher is better).
+    :vartype score: float
+    :ivar rank: Rank within the final evidence list (1-based).
+    :vartype rank: int
+    :ivar text: Optional text payload for the evidence.
+    :vartype text: str or None
+    :ivar content_ref: Optional reference for non-text content.
+    :vartype content_ref: str or None
+    :ivar span_start: Optional start offset in the source text.
+    :vartype span_start: int or None
+    :ivar span_end: Optional end offset in the source text.
+    :vartype span_end: int or None
+    :ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
+    :vartype stage: str
+    :ivar recipe_id: Recipe identifier used to create the run.
+    :vartype recipe_id: str
+    :ivar run_id: Retrieval run identifier.
+    :vartype run_id: str
+    :ivar hash: Optional content hash for provenance.
+    :vartype hash: str or None
+    """
+    model_config = ConfigDict(extra="forbid")
+    item_id: str
+    source_uri: Optional[str] = None
+    media_type: str
+    score: float
+    rank: int = Field(ge=1)
+    text: Optional[str] = None
+    content_ref: Optional[str] = None
+    span_start: Optional[int] = None
+    span_end: Optional[int] = None
+    stage: str
+    recipe_id: str
+    run_id: str
+    hash: Optional[str] = None
+    @model_validator(mode="after")
+    def _require_text_or_reference(self) -> "Evidence":
+        has_text = isinstance(self.text, str) and self.text.strip()
+        has_ref = isinstance(self.content_ref, str) and self.content_ref.strip()
+        if not has_text and not has_ref:
+            raise ValueError("Evidence must include either text or content_ref")
+        return self
+class RecipeManifest(BaseModel):
+    """
+    Reproducible configuration for a retrieval backend.
+    :ivar recipe_id: Deterministic recipe identifier.
+    :vartype recipe_id: str
+    :ivar backend_id: Backend identifier for the recipe.
+    :vartype backend_id: str
+    :ivar name: Human-readable name for the recipe.
+    :vartype name: str
+    :ivar created_at: International Organization for Standardization 8601 timestamp for recipe creation.
+    :vartype created_at: str
+    :ivar config: Backend-specific configuration values.
+    :vartype config: dict[str, Any]
+    :ivar description: Optional human description.
+    :vartype description: str or None
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    # The identifier is stored as given; this model does not compute it.
+    recipe_id: str
+    backend_id: str
+    name: str
+    created_at: str
+    # Free-form mapping; each instance gets its own dict via the factory.
+    config: Dict[str, Any] = Field(default_factory=dict)
+    description: Optional[str] = None
+class RetrievalRun(BaseModel):
+    """
+    Immutable record of a retrieval materialization or on-demand run.
+    :ivar run_id: Unique run identifier.
+    :vartype run_id: str
+    :ivar recipe: Recipe manifest for this run.
+    :vartype recipe: RecipeManifest
+    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
+    :vartype corpus_uri: str
+    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :vartype catalog_generated_at: str
+    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :vartype created_at: str
+    :ivar artifact_paths: Relative paths to materialized artifacts.
+    :vartype artifact_paths: list[str]
+    :ivar stats: Backend-specific run statistics.
+    :vartype stats: dict[str, Any]
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    # NOTE(review): the docstring calls the record immutable, but this config
+    # does not set frozen, so immutability is a convention rather than enforced.
+    model_config = ConfigDict(extra="forbid")
+    run_id: str
+    recipe: RecipeManifest
+    corpus_uri: str
+    catalog_generated_at: str
+    created_at: str
+    # Factories give every instance its own list / dict.
+    artifact_paths: List[str] = Field(default_factory=list)
+    stats: Dict[str, Any] = Field(default_factory=dict)
+class RetrievalResult(BaseModel):
+    """
+    Retrieval result bundle returned from a backend query.
+    :ivar query_text: Query text issued against the backend.
+    :vartype query_text: str
+    :ivar budget: Evidence selection budget applied to results.
+    :vartype budget: QueryBudget
+    :ivar run_id: Retrieval run identifier.
+    :vartype run_id: str
+    :ivar recipe_id: Recipe identifier used for this query.
+    :vartype recipe_id: str
+    :ivar backend_id: Backend identifier used for this query.
+    :vartype backend_id: str
+    :ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
+    :vartype generated_at: str
+    :ivar evidence: Evidence objects selected under the budget.
+    :vartype evidence: list[Evidence]
+    :ivar stats: Backend-specific query statistics.
+    :vartype stats: dict[str, Any]
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    query_text: str
+    budget: QueryBudget
+    run_id: str
+    recipe_id: str
+    backend_id: str
+    generated_at: str
+    # Defaults to an empty list, so a result with no evidence is valid.
+    # This model does not check the evidence against the budget.
+    evidence: List[Evidence] = Field(default_factory=list)
+    stats: Dict[str, Any] = Field(default_factory=dict)
+class ExtractedText(BaseModel):
+    """
+    Text payload produced by an extractor plugin.
+    :ivar text: Extracted text content.
+    :vartype text: str
+    :ivar producer_extractor_id: Extractor identifier that produced this text.
+    :vartype producer_extractor_id: str
+    :ivar source_step_index: Optional pipeline step index where this text originated.
+    :vartype source_step_index: int or None
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    # No length constraint: an empty string passes validation.
+    text: str
+    producer_extractor_id: str = Field(min_length=1)
+    # When provided, must be >= 1.
+    source_step_index: Optional[int] = Field(default=None, ge=1)
+class ExtractionStepOutput(BaseModel):
+    """
+    In-memory representation of a pipeline step output for a single item.
+    :ivar step_index: One-based pipeline step index.
+    :vartype step_index: int
+    :ivar extractor_id: Extractor identifier for the step.
+    :vartype extractor_id: str
+    :ivar status: Step status, extracted, skipped, or errored.
+    :vartype status: str
+    :ivar text: Extracted text content, when produced.
+    :vartype text: str or None
+    :ivar text_characters: Character count of the extracted text.
+    :vartype text_characters: int
+    :ivar producer_extractor_id: Extractor identifier that produced the text content.
+    :vartype producer_extractor_id: str or None
+    :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
+    :vartype source_step_index: int or None
+    :ivar error_type: Optional error type name for errored steps.
+    :vartype error_type: str or None
+    :ivar error_message: Optional error message for errored steps.
+    :vartype error_message: str or None
+    """
+    # Unknown keys are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+    step_index: int = Field(ge=1)
+    extractor_id: str
+    # Plain str: the values named in the docstring are not enforced by this model.
+    status: str
+    text: Optional[str] = None
+    # Stored independently of `text`; nothing in this class derives it from
+    # len(text) -- presumably the producer fills it in (TODO confirm).
+    text_characters: int = Field(default=0, ge=0)
+    producer_extractor_id: Optional[str] = None
+    # When provided, must be >= 1.
+    source_step_index: Optional[int] = Field(default=None, ge=1)
+    # Any combination of status and error fields passes validation here.
+    error_type: Optional[str] = None
+    error_message: Optional[str] = None