PyPI - biblicus - Versions diffs - 0.1.1__py3-none-any.whl - Mend

biblicus 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

biblicus/__init__.py +28 -0
biblicus/__main__.py +8 -0
biblicus/backends/__init__.py +44 -0
biblicus/backends/base.py +65 -0
biblicus/backends/scan.py +292 -0
biblicus/backends/sqlite_full_text_search.py +427 -0
biblicus/cli.py +468 -0
biblicus/constants.py +10 -0
biblicus/corpus.py +952 -0
biblicus/evaluation.py +261 -0
biblicus/frontmatter.py +92 -0
biblicus/models.py +307 -0
biblicus/retrieval.py +137 -0
biblicus/sources.py +132 -0
biblicus/time.py +18 -0
biblicus/uris.py +64 -0
biblicus-0.1.1.dist-info/METADATA +174 -0
biblicus-0.1.1.dist-info/RECORD +22 -0
biblicus-0.1.1.dist-info/WHEEL +5 -0
biblicus-0.1.1.dist-info/entry_points.txt +2 -0
biblicus-0.1.1.dist-info/licenses/LICENSE +21 -0
biblicus-0.1.1.dist-info/top_level.txt +1 -0

biblicus/evaluation.py ADDED Viewed

@@ -0,0 +1,261 @@
+"""
+Evaluation utilities for Biblicus retrieval runs.
+"""
+from __future__ import annotations
+import json
+import time
+from pathlib import Path
+from typing import Dict, List, Optional
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from .constants import DATASET_SCHEMA_VERSION
+from .backends import get_backend
+from .corpus import Corpus
+from .models import QueryBudget, RetrievalResult, RetrievalRun
+from .time import utc_now_iso
+class EvaluationQuery(BaseModel):
+    """
+    Query record for retrieval evaluation.
+    :ivar query_id: Unique identifier for the query.
+    :vartype query_id: str
+    :ivar query_text: Natural language query to execute.
+    :vartype query_text: str
+    :ivar expected_item_id: Optional expected item identifier.
+    :vartype expected_item_id: str or None
+    :ivar expected_source_uri: Optional expected source uniform resource identifier.
+    :vartype expected_source_uri: str or None
+    :ivar kind: Query kind (gold or synthetic).
+    :vartype kind: str
+    """
+    model_config = ConfigDict(extra="forbid")
+    query_id: str
+    query_text: str
+    expected_item_id: Optional[str] = None
+    expected_source_uri: Optional[str] = None
+    kind: str = Field(default="gold")
+    @model_validator(mode="after")
+    def _require_expectation(self) -> "EvaluationQuery":
+        if not self.expected_item_id and not self.expected_source_uri:
+            raise ValueError("Evaluation queries must include expected_item_id or expected_source_uri")
+        return self
+class EvaluationDataset(BaseModel):
+    """
+    Dataset for retrieval evaluation.
+    :ivar schema_version: Dataset schema version.
+    :vartype schema_version: int
+    :ivar name: Dataset name.
+    :vartype name: str
+    :ivar description: Optional description.
+    :vartype description: str or None
+    :ivar queries: List of evaluation queries.
+    :vartype queries: list[EvaluationQuery]
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: int = Field(ge=1)
+    name: str
+    description: Optional[str] = None
+    queries: List[EvaluationQuery] = Field(default_factory=list)
+    @model_validator(mode="after")
+    def _enforce_schema_version(self) -> "EvaluationDataset":
+        if self.schema_version != DATASET_SCHEMA_VERSION:
+            raise ValueError(f"Unsupported dataset schema version: {self.schema_version}")
+        return self
+class EvaluationResult(BaseModel):
+    """
+    Result bundle for a retrieval evaluation.
+    :ivar dataset: Dataset metadata.
+    :vartype dataset: dict[str, object]
+    :ivar backend_id: Backend identifier.
+    :vartype backend_id: str
+    :ivar run_id: Retrieval run identifier.
+    :vartype run_id: str
+    :ivar evaluated_at: International Organization for Standardization 8601 evaluation timestamp.
+    :vartype evaluated_at: str
+    :ivar metrics: Quality metrics for retrieval.
+    :vartype metrics: dict[str, float]
+    :ivar system: System metrics for retrieval.
+    :vartype system: dict[str, float]
+    """
+    # Unknown fields are rejected during validation.
+    model_config = ConfigDict(extra="forbid")
+    # evaluate_run fills this with the keys "name", "description" and "queries" (a count).
+    dataset: Dict[str, object]
+    backend_id: str
+    run_id: str
+    # Stored as a plain string; the model itself does not parse or validate the timestamp.
+    evaluated_at: str
+    # evaluate_run fills this with "hit_rate", "precision_at_max_total_items"
+    # and "mean_reciprocal_rank".
+    metrics: Dict[str, float]
+    # evaluate_run fills this with "average_latency_milliseconds",
+    # "percentile_95_latency_milliseconds" and "index_bytes".
+    system: Dict[str, float]
+def load_dataset(path: Path) -> EvaluationDataset:
+    """
+    Load an evaluation dataset from JavaScript Object Notation.
+    :param path: Path to the dataset JavaScript Object Notation file.
+    :type path: Path
+    :return: Parsed evaluation dataset.
+    :rtype: EvaluationDataset
+    """
+    data = json.loads(path.read_text(encoding="utf-8"))
+    return EvaluationDataset.model_validate(data)
+def evaluate_run(
+    *,
+    corpus: Corpus,
+    run: RetrievalRun,
+    dataset: EvaluationDataset,
+    budget: QueryBudget,
+) -> EvaluationResult:
+    """
+    Evaluate a retrieval run against a dataset.
+    :param corpus: Corpus associated with the run.
+    :type corpus: Corpus
+    :param run: Retrieval run manifest.
+    :type run: RetrievalRun
+    :param dataset: Evaluation dataset.
+    :type dataset: EvaluationDataset
+    :param budget: Evidence selection budget.
+    :type budget: QueryBudget
+    :return: Evaluation result bundle.
+    :rtype: EvaluationResult
+    """
+    backend = get_backend(run.recipe.backend_id)
+    latency_seconds: List[float] = []
+    hit_count = 0
+    reciprocal_ranks: List[float] = []
+    for query in dataset.queries:
+        timer_start = time.perf_counter()
+        result = backend.query(corpus, run=run, query_text=query.query_text, budget=budget)
+        elapsed_seconds = time.perf_counter() - timer_start
+        latency_seconds.append(elapsed_seconds)
+        expected_rank = _expected_rank(result, query)
+        if expected_rank is not None:
+            hit_count += 1
+            reciprocal_ranks.append(1.0 / expected_rank)
+        else:
+            reciprocal_ranks.append(0.0)
+    total_queries = max(len(dataset.queries), 1)
+    max_total_items = float(budget.max_total_items)
+    hit_rate = hit_count / total_queries
+    precision_at_max_total_items = hit_count / (total_queries * max_total_items)
+    mean_reciprocal_rank = sum(reciprocal_ranks) / total_queries
+    metrics = {
+        "hit_rate": hit_rate,
+        "precision_at_max_total_items": precision_at_max_total_items,
+        "mean_reciprocal_rank": mean_reciprocal_rank,
+    }
+    system = {
+        "average_latency_milliseconds": _average_latency_milliseconds(latency_seconds),
+        "percentile_95_latency_milliseconds": _percentile_95_latency_milliseconds(latency_seconds),
+        "index_bytes": float(_run_artifact_bytes(corpus, run)),
+    }
+    dataset_meta = {
+        "name": dataset.name,
+        "description": dataset.description,
+        "queries": len(dataset.queries),
+    }
+    return EvaluationResult(
+        dataset=dataset_meta,
+        backend_id=run.recipe.backend_id,
+        run_id=run.run_id,
+        evaluated_at=utc_now_iso(),
+        metrics=metrics,
+        system=system,
+    )
+def _expected_rank(result: RetrievalResult, query: EvaluationQuery) -> Optional[int]:
+    """
+    Locate the first evidence rank that matches the expected item or source.
+    :param result: Retrieval result for a query.
+    :type result: RetrievalResult
+    :param query: Evaluation query definition.
+    :type query: EvaluationQuery
+    :return: Rank of the first matching evidence item, or None.
+    :rtype: int or None
+    """
+    for evidence in result.evidence:
+        if query.expected_item_id and evidence.item_id == query.expected_item_id:
+            return evidence.rank
+        if query.expected_source_uri and evidence.source_uri == query.expected_source_uri:
+            return evidence.rank
+    return None
+def _average_latency_milliseconds(latencies: List[float]) -> float:
+    """
+    Compute average latency in milliseconds.
+    :param latencies: Latency samples in seconds.
+    :type latencies: list[float]
+    :return: Average latency in milliseconds.
+    :rtype: float
+    """
+    if not latencies:
+        return 0.0
+    return sum(latencies) / len(latencies) * 1000.0
+def _percentile_95_latency_milliseconds(latencies: List[float]) -> float:
+    """
+    Compute the percentile 95 latency in milliseconds.
+    :param latencies: Latency samples in seconds.
+    :type latencies: list[float]
+    :return: Percentile 95 latency in milliseconds.
+    :rtype: float
+    """
+    if not latencies:
+        return 0.0
+    sorted_latencies = sorted(latencies)
+    percentile_index = int(round(0.95 * (len(sorted_latencies) - 1)))
+    return sorted_latencies[percentile_index] * 1000.0
+def _run_artifact_bytes(corpus: Corpus, run: RetrievalRun) -> int:
+    """
+    Sum artifact sizes for a retrieval run.
+    :param corpus: Corpus that owns the artifacts.
+    :type corpus: Corpus
+    :param run: Retrieval run manifest.
+    :type run: RetrievalRun
+    :return: Total artifact bytes.
+    :rtype: int
+    """
+    total_bytes = 0
+    for artifact_relpath in run.artifact_paths:
+        artifact_path = corpus.root / artifact_relpath
+        if artifact_path.exists():
+            total_bytes += artifact_path.stat().st_size
+    return total_bytes

biblicus/frontmatter.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""
+Markdown front matter helpers.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+import yaml
+@dataclass(frozen=True)
+class FrontMatterDocument:
+    """
+    Parsed front matter and markdown body.
+    The dataclass is frozen, so the two attributes cannot be reassigned after
+    construction; the ``metadata`` dictionary itself remains a regular mutable dict.
+    :ivar metadata: Front matter metadata mapping.
+    :vartype metadata: dict[str, Any]
+    :ivar body: Markdown body text.
+    :vartype body: str
+    """
+    metadata: Dict[str, Any]
+    body: str
+def parse_front_matter(text: str) -> FrontMatterDocument:
+    """
+    Parse Yet Another Markup Language front matter from a Markdown document.
+    :param text: Markdown content with optional front matter.
+    :type text: str
+    :return: Parsed front matter and body.
+    :rtype: FrontMatterDocument
+    :raises ValueError: If front matter is present but not a mapping.
+    """
+    if not text.startswith("---\n"):
+        return FrontMatterDocument(metadata={}, body=text)
+    front_matter_end = text.find("\n---\n", 4)
+    if front_matter_end == -1:
+        return FrontMatterDocument(metadata={}, body=text)
+    raw_yaml = text[4:front_matter_end]
+    body = text[front_matter_end + len("\n---\n") :]
+    metadata = yaml.safe_load(raw_yaml) or {}
+    if not isinstance(metadata, dict):
+        raise ValueError("Yet Another Markup Language front matter must be a mapping object")
+    return FrontMatterDocument(metadata=dict(metadata), body=body)
+def render_front_matter(metadata: Dict[str, Any], body: str) -> str:
+    """
+    Render Yet Another Markup Language front matter with a Markdown body.
+    :param metadata: Front matter metadata mapping.
+    :type metadata: dict[str, Any]
+    :param body: Markdown body text.
+    :type body: str
+    :return: Markdown with Yet Another Markup Language front matter.
+    :rtype: str
+    """
+    if not metadata:
+        return body
+    yaml_text = yaml.safe_dump(
+        metadata,
+        sort_keys=False,
+        allow_unicode=True,
+        default_flow_style=False,
+    ).strip()
+    return f"---\n{yaml_text}\n---\n{body}"
+def split_markdown_front_matter(path_text: str) -> Tuple[Dict[str, Any], str]:
+    """
+    Split Markdown into front matter metadata and body.
+    :param path_text: Markdown content.
+    :type path_text: str
+    :return: Metadata mapping and body text.
+    :rtype: tuple[dict[str, Any], str]
+    """
+    parsed_document = parse_front_matter(path_text)
+    return parsed_document.metadata, parsed_document.body

biblicus/models.py ADDED Viewed

@@ -0,0 +1,307 @@
+"""
+Pydantic models for Biblicus domain concepts.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from .constants import SCHEMA_VERSION
+class CorpusConfig(BaseModel):
+    """
+    Canonical on-disk config for a local Biblicus corpus.
+    :ivar schema_version: Version of the corpus config schema.
+    :vartype schema_version: int
+    :ivar created_at: International Organization for Standardization 8601 timestamp for corpus creation.
+    :vartype created_at: str
+    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
+    :vartype corpus_uri: str
+    :ivar raw_dir: Relative path to the raw items folder.
+    :vartype raw_dir: str
+    :ivar notes: Optional free-form notes for operators.
+    :vartype notes: dict[str, Any] or None
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: int = Field(ge=1)
+    created_at: str
+    corpus_uri: str
+    raw_dir: str = "raw"
+    notes: Optional[Dict[str, Any]] = None
+    @model_validator(mode="after")
+    def _enforce_schema_version(self) -> "CorpusConfig":
+        if self.schema_version != SCHEMA_VERSION:
+            raise ValueError(f"Unsupported corpus config schema version: {self.schema_version}")
+        return self
+class IngestResult(BaseModel):
+    """
+    Minimal summary for an ingestion event.
+    All three fields are required strings; none is format-checked by this model.
+    :ivar item_id: Universally unique identifier assigned to the ingested item.
+    :vartype item_id: str
+    :ivar relpath: Relative path to the raw item file.
+    :vartype relpath: str
+    :ivar sha256: Secure Hash Algorithm 256 digest of the stored bytes.
+    :vartype sha256: str
+    """
+    # Unknown fields are rejected during validation.
+    model_config = ConfigDict(extra="forbid")
+    item_id: str
+    relpath: str
+    sha256: str
+class CatalogItem(BaseModel):
+    """
+    Catalog entry derived from a raw corpus item.
+    :ivar id: Universally unique identifier of the item.
+    :vartype id: str
+    :ivar relpath: Relative path to the raw item file.
+    :vartype relpath: str
+    :ivar sha256: Secure Hash Algorithm 256 digest of the stored bytes.
+    :vartype sha256: str
+    :ivar bytes: Size of the raw item in bytes.
+    :vartype bytes: int
+    :ivar media_type: Internet Assigned Numbers Authority media type for the item.
+    :vartype media_type: str
+    :ivar title: Optional human title extracted from metadata.
+    :vartype title: str or None
+    :ivar tags: Tags extracted or supplied for the item.
+    :vartype tags: list[str]
+    :ivar metadata: Merged front matter or sidecar metadata.
+    :vartype metadata: dict[str, Any]
+    :ivar created_at: International Organization for Standardization 8601 timestamp when the item was first indexed.
+    :vartype created_at: str
+    :ivar source_uri: Optional source uniform resource identifier used at ingestion time.
+    :vartype source_uri: str or None
+    """
+    # Unknown fields are rejected during validation.
+    model_config = ConfigDict(extra="forbid")
+    # NOTE(review): `id` and `bytes` reuse builtin names; they are presumably kept
+    # because they are the serialized field names - confirm before renaming.
+    id: str
+    relpath: str
+    sha256: str
+    # Negative sizes are rejected.
+    bytes: int = Field(ge=0)
+    media_type: str
+    title: Optional[str] = None
+    # default_factory gives every instance its own empty list / dict.
+    tags: List[str] = Field(default_factory=list)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+    created_at: str
+    source_uri: Optional[str] = None
+class CorpusCatalog(BaseModel):
+    """
+    Snapshot of the derived corpus catalog.
+    :ivar schema_version: Version of the catalog schema.
+    :vartype schema_version: int
+    :ivar generated_at: International Organization for Standardization 8601 timestamp of catalog generation.
+    :vartype generated_at: str
+    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
+    :vartype corpus_uri: str
+    :ivar raw_dir: Relative path to the raw items folder.
+    :vartype raw_dir: str
+    :ivar latest_run_id: Latest retrieval run identifier, if any.
+    :vartype latest_run_id: str or None
+    :ivar items: Mapping of item IDs to catalog entries.
+    :vartype items: dict[str, CatalogItem]
+    :ivar order: Display order of item IDs (most recent first).
+    :vartype order: list[str]
+    """
+    model_config = ConfigDict(extra="forbid")
+    schema_version: int = Field(ge=1)
+    generated_at: str
+    corpus_uri: str
+    raw_dir: str = "raw"
+    latest_run_id: Optional[str] = None
+    items: Dict[str, CatalogItem] = Field(default_factory=dict)
+    order: List[str] = Field(default_factory=list)
+    @model_validator(mode="after")
+    def _enforce_schema_version(self) -> "CorpusCatalog":
+        if self.schema_version != SCHEMA_VERSION:
+            raise ValueError(f"Unsupported catalog schema version: {self.schema_version}")
+        return self
+class QueryBudget(BaseModel):
+    """
+    Evidence selection budget for retrieval.
+    Every limit that is set must be at least 1.
+    :ivar max_total_items: Maximum number of evidence items to return.
+    :vartype max_total_items: int
+    :ivar max_total_characters: Optional maximum total characters across evidence text.
+    :vartype max_total_characters: int or None
+    :ivar max_items_per_source: Optional cap per source uniform resource identifier.
+    :vartype max_items_per_source: int or None
+    """
+    # Unknown fields are rejected during validation.
+    model_config = ConfigDict(extra="forbid")
+    # Required; evaluate_run also uses it as the precision denominator factor.
+    max_total_items: int = Field(ge=1)
+    # None presumably means "no limit" - enforcement lives in the backends, not here.
+    max_total_characters: Optional[int] = Field(default=None, ge=1)
+    max_items_per_source: Optional[int] = Field(default=None, ge=1)
+class Evidence(BaseModel):
+    """
+    Structured retrieval evidence returned from a backend.
+    :ivar item_id: Item identifier that produced the evidence.
+    :vartype item_id: str
+    :ivar source_uri: Source uniform resource identifier from ingestion metadata.
+    :vartype source_uri: str or None
+    :ivar media_type: Media type for the evidence item.
+    :vartype media_type: str
+    :ivar score: Retrieval score (higher is better).
+    :vartype score: float
+    :ivar rank: Rank within the final evidence list (1-based).
+    :vartype rank: int
+    :ivar text: Optional text payload for the evidence.
+    :vartype text: str or None
+    :ivar content_ref: Optional reference for non-text content.
+    :vartype content_ref: str or None
+    :ivar span_start: Optional start offset in the source text.
+    :vartype span_start: int or None
+    :ivar span_end: Optional end offset in the source text.
+    :vartype span_end: int or None
+    :ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
+    :vartype stage: str
+    :ivar recipe_id: Recipe identifier used to create the run.
+    :vartype recipe_id: str
+    :ivar run_id: Retrieval run identifier.
+    :vartype run_id: str
+    :ivar hash: Optional content hash for provenance.
+    :vartype hash: str or None
+    """
+    model_config = ConfigDict(extra="forbid")
+    item_id: str
+    source_uri: Optional[str] = None
+    media_type: str
+    score: float
+    rank: int = Field(ge=1)
+    text: Optional[str] = None
+    content_ref: Optional[str] = None
+    span_start: Optional[int] = None
+    span_end: Optional[int] = None
+    stage: str
+    recipe_id: str
+    run_id: str
+    hash: Optional[str] = None
+    @model_validator(mode="after")
+    def _require_text_or_reference(self) -> "Evidence":
+        has_text = isinstance(self.text, str) and self.text.strip()
+        has_ref = isinstance(self.content_ref, str) and self.content_ref.strip()
+        if not has_text and not has_ref:
+            raise ValueError("Evidence must include either text or content_ref")
+        return self
+class RecipeManifest(BaseModel):
+    """
+    Reproducible configuration for a retrieval backend.
+    :ivar recipe_id: Deterministic recipe identifier.
+    :vartype recipe_id: str
+    :ivar backend_id: Backend identifier for the recipe.
+    :vartype backend_id: str
+    :ivar name: Human-readable name for the recipe.
+    :vartype name: str
+    :ivar created_at: International Organization for Standardization 8601 timestamp for recipe creation.
+    :vartype created_at: str
+    :ivar config: Backend-specific configuration values.
+    :vartype config: dict[str, Any]
+    :ivar description: Optional human description.
+    :vartype description: str or None
+    """
+    # Unknown fields are rejected during validation.
+    model_config = ConfigDict(extra="forbid")
+    # Stored as given; how the identifier is derived is not visible in this model.
+    recipe_id: str
+    backend_id: str
+    name: str
+    created_at: str
+    # Free-form mapping; this model does not validate its contents. Defaults to a fresh empty dict.
+    config: Dict[str, Any] = Field(default_factory=dict)
+    description: Optional[str] = None
+class RetrievalRun(BaseModel):
+    """
+    Immutable record of a retrieval materialization or on-demand run.
+    :ivar run_id: Unique run identifier.
+    :vartype run_id: str
+    :ivar recipe: Recipe manifest for this run.
+    :vartype recipe: RecipeManifest
+    :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
+    :vartype corpus_uri: str
+    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :vartype catalog_generated_at: str
+    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :vartype created_at: str
+    :ivar artifact_paths: Relative paths to materialized artifacts.
+    :vartype artifact_paths: list[str]
+    :ivar stats: Backend-specific run statistics.
+    :vartype stats: dict[str, Any]
+    """
+    # NOTE(review): the docstring calls this record immutable, but the config only
+    # forbids extra fields and does not set frozen=True, so attribute assignment is
+    # still possible - confirm whether immutability is meant as convention only.
+    model_config = ConfigDict(extra="forbid")
+    run_id: str
+    recipe: RecipeManifest
+    corpus_uri: str
+    catalog_generated_at: str
+    created_at: str
+    # Each instance gets its own empty list / dict when these are omitted.
+    artifact_paths: List[str] = Field(default_factory=list)
+    stats: Dict[str, Any] = Field(default_factory=dict)
+class RetrievalResult(BaseModel):
+    """
+    Retrieval result bundle returned from a backend query.
+    :ivar query_text: Query text issued against the backend.
+    :vartype query_text: str
+    :ivar budget: Evidence selection budget applied to results.
+    :vartype budget: QueryBudget
+    :ivar run_id: Retrieval run identifier.
+    :vartype run_id: str
+    :ivar recipe_id: Recipe identifier used for this query.
+    :vartype recipe_id: str
+    :ivar backend_id: Backend identifier used for this query.
+    :vartype backend_id: str
+    :ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
+    :vartype generated_at: str
+    :ivar evidence: Evidence objects selected under the budget.
+    :vartype evidence: list[Evidence]
+    :ivar stats: Backend-specific query statistics.
+    :vartype stats: dict[str, Any]
+    """
+    # Unknown fields are rejected during validation.
+    model_config = ConfigDict(extra="forbid")
+    query_text: str
+    budget: QueryBudget
+    run_id: str
+    recipe_id: str
+    backend_id: str
+    generated_at: str
+    # Empty by default; this model does not itself check the list against the budget.
+    evidence: List[Evidence] = Field(default_factory=list)
+    stats: Dict[str, Any] = Field(default_factory=dict)