PyPI - biblicus - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

biblicus/__init__.py +5 -5
biblicus/analysis/__init__.py +1 -1
biblicus/analysis/base.py +10 -10
biblicus/analysis/markov.py +78 -68
biblicus/analysis/models.py +47 -47
biblicus/analysis/profiling.py +58 -48
biblicus/analysis/topic_modeling.py +56 -51
biblicus/cli.py +224 -177
biblicus/{recipes.py → configuration.py} +14 -14
biblicus/constants.py +2 -2
biblicus/context_engine/assembler.py +49 -19
biblicus/context_engine/retrieval.py +46 -42
biblicus/corpus.py +116 -108
biblicus/errors.py +3 -3
biblicus/evaluation.py +27 -25
biblicus/extraction.py +103 -98
biblicus/extraction_evaluation.py +26 -26
biblicus/extractors/deepgram_stt.py +7 -7
biblicus/extractors/docling_granite_text.py +11 -11
biblicus/extractors/docling_smol_text.py +11 -11
biblicus/extractors/markitdown_text.py +4 -4
biblicus/extractors/openai_stt.py +7 -7
biblicus/extractors/paddleocr_vl_text.py +20 -18
biblicus/extractors/pipeline.py +8 -8
biblicus/extractors/rapidocr_text.py +3 -3
biblicus/extractors/unstructured_text.py +3 -3
biblicus/hooks.py +4 -4
biblicus/knowledge_base.py +33 -31
biblicus/models.py +78 -78
biblicus/retrieval.py +47 -40
biblicus/retrievers/__init__.py +50 -0
biblicus/retrievers/base.py +65 -0
biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
biblicus/retrievers/hybrid.py +301 -0
biblicus/{backends → retrievers}/scan.py +83 -73
biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
biblicus/{backends → retrievers}/tf_vector.py +87 -77
biblicus/text/prompts.py +16 -8
biblicus/text/tool_loop.py +63 -5
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
biblicus-1.1.0.dist-info/RECORD +91 -0
biblicus/backends/__init__.py +0 -50
biblicus/backends/base.py +0 -65
biblicus/backends/hybrid.py +0 -292
biblicus-1.0.0.dist-info/RECORD +0 -91
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0

biblicus/models.py CHANGED Viewed

@@ -117,8 +117,8 @@ class CorpusCatalog(BaseModel):
     :vartype corpus_uri: str
     :ivar raw_dir: Relative path to the raw items folder.
     :vartype raw_dir: str
-    :ivar latest_run_id: Latest retrieval run identifier, if any.
-    :vartype latest_run_id: str or None
+    :ivar latest_snapshot_id: Latest retrieval snapshot identifier, if any.
+    :vartype latest_snapshot_id: str or None
     :ivar items: Mapping of item IDs to catalog entries.
     :vartype items: dict[str, CatalogItem]
     :ivar order: Display order of item IDs (most recent first).
@@ -131,7 +131,7 @@ class CorpusCatalog(BaseModel):
     generated_at: str
     corpus_uri: str
     raw_dir: str = "raw"
-    latest_run_id: Optional[str] = None
+    latest_snapshot_id: Optional[str] = None
     items: Dict[str, CatalogItem] = Field(default_factory=dict)
     order: List[str] = Field(default_factory=list)
@@ -142,79 +142,79 @@ class CorpusCatalog(BaseModel):
         return self
-class ExtractionRunReference(BaseModel):
+class ExtractionSnapshotReference(BaseModel):
     """
-    Reference to an extraction run.
+    Reference to an extraction snapshot.
     :ivar extractor_id: Extractor plugin identifier.
     :vartype extractor_id: str
-    :ivar run_id: Extraction run identifier.
-    :vartype run_id: str
+    :ivar snapshot_id: Extraction snapshot identifier.
+    :vartype snapshot_id: str
     """
     model_config = ConfigDict(extra="forbid")
     extractor_id: str = Field(min_length=1)
-    run_id: str = Field(min_length=1)
+    snapshot_id: str = Field(min_length=1)
     def as_string(self) -> str:
         """
         Serialize the reference as a single string.
-        :return: Reference in the form extractor_id:run_id.
+        :return: Reference in the form extractor_id:snapshot_id.
         :rtype: str
         """
-        return f"{self.extractor_id}:{self.run_id}"
+        return f"{self.extractor_id}:{self.snapshot_id}"
-def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
+def parse_extraction_snapshot_reference(value: str) -> ExtractionSnapshotReference:
     """
-    Parse an extraction run reference in the form extractor_id:run_id.
+    Parse an extraction snapshot reference in the form extractor_id:snapshot_id.
     :param value: Raw reference string.
     :type value: str
-    :return: Parsed extraction run reference.
-    :rtype: ExtractionRunReference
+    :return: Parsed extraction snapshot reference.
+    :rtype: ExtractionSnapshotReference
     :raises ValueError: If the reference is not well formed.
     """
     if ":" not in value:
-        raise ValueError("Extraction run reference must be extractor_id:run_id")
-    extractor_id, run_id = value.split(":", 1)
+        raise ValueError("Extraction snapshot reference must be extractor_id:snapshot_id")
+    extractor_id, snapshot_id = value.split(":", 1)
     extractor_id = extractor_id.strip()
-    run_id = run_id.strip()
-    if not extractor_id or not run_id:
+    snapshot_id = snapshot_id.strip()
+    if not extractor_id or not snapshot_id:
         raise ValueError(
-            "Extraction run reference must be extractor_id:run_id with non-empty parts"
+            "Extraction snapshot reference must be extractor_id:snapshot_id with non-empty parts"
         )
-    return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
+    return ExtractionSnapshotReference(extractor_id=extractor_id, snapshot_id=snapshot_id)
-class ExtractionRunListEntry(BaseModel):
+class ExtractionSnapshotListEntry(BaseModel):
     """
-    Summary entry for an extraction run stored in a corpus.
+    Summary entry for an extraction snapshot stored in a corpus.
     :ivar extractor_id: Extractor plugin identifier.
     :vartype extractor_id: str
-    :ivar run_id: Extraction run identifier.
-    :vartype run_id: str
-    :ivar recipe_id: Deterministic recipe identifier.
-    :vartype recipe_id: str
-    :ivar recipe_name: Human-readable recipe name.
-    :vartype recipe_name: str
-    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :ivar snapshot_id: Extraction snapshot identifier.
+    :vartype snapshot_id: str
+    :ivar configuration_id: Deterministic configuration identifier.
+    :vartype configuration_id: str
+    :ivar configuration_name: Human-readable configuration name.
+    :vartype configuration_name: str
+    :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
     :vartype catalog_generated_at: str
-    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
     :vartype created_at: str
-    :ivar stats: Run statistics.
+    :ivar stats: Snapshot statistics.
     :vartype stats: dict[str, object]
     """
     model_config = ConfigDict(extra="forbid")
     extractor_id: str = Field(min_length=1)
-    run_id: str = Field(min_length=1)
-    recipe_id: str = Field(min_length=1)
-    recipe_name: str = Field(min_length=1)
+    snapshot_id: str = Field(min_length=1)
+    configuration_id: str = Field(min_length=1)
+    configuration_name: str = Field(min_length=1)
     catalog_generated_at: str = Field(min_length=1)
     created_at: str = Field(min_length=1)
     stats: Dict[str, object] = Field(default_factory=dict)
@@ -250,7 +250,7 @@ class QueryBudget(BaseModel):
 class Evidence(BaseModel):
     """
-    Structured retrieval evidence returned from a backend.
+    Structured retrieval evidence returned from a retriever.
     :ivar item_id: Item identifier that produced the evidence.
     :vartype item_id: str
@@ -274,10 +274,10 @@ class Evidence(BaseModel):
     :vartype stage: str
     :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
     :vartype stage_scores: dict[str, float] or None
-    :ivar recipe_id: Recipe identifier used to create the run.
-    :vartype recipe_id: str
-    :ivar run_id: Retrieval run identifier.
-    :vartype run_id: str
+    :ivar configuration_id: Configuration identifier used to create the snapshot.
+    :vartype configuration_id: str
+    :ivar snapshot_id: Retrieval snapshot identifier.
+    :vartype snapshot_id: str
     :ivar metadata: Optional metadata payload from the catalog item.
     :vartype metadata: dict[str, Any]
     :ivar hash: Optional content hash for provenance.
@@ -297,8 +297,8 @@ class Evidence(BaseModel):
     span_end: Optional[int] = None
     stage: str
     stage_scores: Optional[Dict[str, float]] = None
-    recipe_id: str
-    run_id: str
+    configuration_id: str
+    snapshot_id: str
     metadata: Dict[str, Any] = Field(default_factory=dict)
     hash: Optional[str] = None
@@ -311,79 +311,79 @@ class Evidence(BaseModel):
         return self
-class RecipeManifest(BaseModel):
+class ConfigurationManifest(BaseModel):
     """
-    Reproducible configuration for a retrieval backend.
+    Reproducible configuration for a retriever.
-    :ivar recipe_id: Deterministic recipe identifier.
-    :vartype recipe_id: str
-    :ivar backend_id: Backend identifier for the recipe.
-    :vartype backend_id: str
-    :ivar name: Human-readable name for the recipe.
+    :ivar configuration_id: Deterministic configuration identifier.
+    :vartype configuration_id: str
+    :ivar retriever_id: Retriever identifier for the configuration.
+    :vartype retriever_id: str
+    :ivar name: Human-readable name for the configuration.
     :vartype name: str
-    :ivar created_at: International Organization for Standardization 8601 timestamp for recipe creation.
+    :ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
     :vartype created_at: str
-    :ivar config: Backend-specific configuration values.
-    :vartype config: dict[str, Any]
+    :ivar configuration: Retriever-specific configuration values.
+    :vartype configuration: dict[str, Any]
     :ivar description: Optional human description.
     :vartype description: str or None
     """
     model_config = ConfigDict(extra="forbid")
-    recipe_id: str
-    backend_id: str
+    configuration_id: str
+    retriever_id: str
     name: str
     created_at: str
-    config: Dict[str, Any] = Field(default_factory=dict)
+    configuration: Dict[str, Any] = Field(default_factory=dict)
     description: Optional[str] = None
-class RetrievalRun(BaseModel):
+class RetrievalSnapshot(BaseModel):
     """
-    Immutable record of a retrieval materialization or on-demand run.
+    Immutable record of a retrieval snapshot.
-    :ivar run_id: Unique run identifier.
-    :vartype run_id: str
-    :ivar recipe: Recipe manifest for this run.
-    :vartype recipe: RecipeManifest
+    :ivar snapshot_id: Unique snapshot identifier.
+    :vartype snapshot_id: str
+    :ivar configuration: Configuration manifest for this snapshot.
+    :vartype configuration: ConfigurationManifest
     :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
     :vartype corpus_uri: str
-    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
     :vartype catalog_generated_at: str
-    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
     :vartype created_at: str
-    :ivar artifact_paths: Relative paths to materialized artifacts.
-    :vartype artifact_paths: list[str]
-    :ivar stats: Backend-specific run statistics.
+    :ivar snapshot_artifacts: Relative paths to materialized artifacts.
+    :vartype snapshot_artifacts: list[str]
+    :ivar stats: Retriever-specific snapshot statistics.
     :vartype stats: dict[str, Any]
     """
     model_config = ConfigDict(extra="forbid")
-    run_id: str
-    recipe: RecipeManifest
+    snapshot_id: str
+    configuration: ConfigurationManifest
     corpus_uri: str
     catalog_generated_at: str
     created_at: str
-    artifact_paths: List[str] = Field(default_factory=list)
+    snapshot_artifacts: List[str] = Field(default_factory=list)
     stats: Dict[str, Any] = Field(default_factory=dict)
 class RetrievalResult(BaseModel):
     """
-    Retrieval result bundle returned from a backend query.
+    Retrieval result bundle returned from a retriever query.
     :ivar query_text: Query text issued against the backend.
     :vartype query_text: str
     :ivar budget: Evidence selection budget applied to results.
     :vartype budget: QueryBudget
-    :ivar run_id: Retrieval run identifier.
-    :vartype run_id: str
-    :ivar recipe_id: Recipe identifier used for this query.
-    :vartype recipe_id: str
-    :ivar backend_id: Backend identifier used for this query.
-    :vartype backend_id: str
+    :ivar snapshot_id: Retrieval snapshot identifier.
+    :vartype snapshot_id: str
+    :ivar configuration_id: Configuration identifier used for this query.
+    :vartype configuration_id: str
+    :ivar retriever_id: Retriever identifier used for this query.
+    :vartype retriever_id: str
     :ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
     :vartype generated_at: str
     :ivar evidence: Evidence objects selected under the budget.
@@ -396,9 +396,9 @@ class RetrievalResult(BaseModel):
     query_text: str
     budget: QueryBudget
-    run_id: str
-    recipe_id: str
-    backend_id: str
+    snapshot_id: str
+    configuration_id: str
+    retriever_id: str
     generated_at: str
     evidence: List[Evidence] = Field(default_factory=list)
     stats: Dict[str, Any] = Field(default_factory=dict)

biblicus/retrieval.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Shared retrieval helpers for Biblicus backends.
+Shared retrieval helpers for Biblicus retrievers.
 """
 from __future__ import annotations
@@ -9,75 +9,82 @@ import json
 from typing import Any, Dict, Iterable, List, Optional
 from .corpus import Corpus
-from .models import Evidence, QueryBudget, RecipeManifest, RetrievalRun
+from .models import (
+    ConfigurationManifest,
+    Evidence,
+    QueryBudget,
+    RetrievalSnapshot,
+)
 from .time import utc_now_iso
-def create_recipe_manifest(
+def create_configuration_manifest(
     *,
-    backend_id: str,
+    retriever_id: str,
     name: str,
-    config: Dict[str, Any],
+    configuration: Dict[str, Any],
     description: Optional[str] = None,
-) -> RecipeManifest:
+) -> ConfigurationManifest:
     """
-    Create a deterministic recipe manifest from a backend configuration.
+    Create a deterministic configuration manifest from a retriever configuration.
-    :param backend_id: Backend identifier for the recipe.
-    :type backend_id: str
-    :param name: Human-readable recipe name.
+    :param retriever_id: Retriever identifier for the configuration.
+    :type retriever_id: str
+    :param name: Human-readable configuration name.
     :type name: str
-    :param config: Backend-specific configuration values.
-    :type config: dict[str, Any]
-    :param description: Optional recipe description.
+    :param configuration: Retriever-specific configuration values.
+    :type configuration: dict[str, Any]
+    :param description: Optional configuration description.
     :type description: str or None
-    :return: Deterministic recipe manifest.
-    :rtype: RecipeManifest
+    :return: Deterministic configuration manifest.
+    :rtype: ConfigurationManifest
     """
-    config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
-    recipe_seed = f"{backend_id}:{config_json}"
-    recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
-    return RecipeManifest(
-        recipe_id=recipe_id,
-        backend_id=backend_id,
+    config_json = json.dumps(configuration, sort_keys=True, separators=(",", ":"))
+    configuration_seed = f"{retriever_id}:{config_json}"
+    configuration_id = hashlib.sha256(configuration_seed.encode("utf-8")).hexdigest()
+    return ConfigurationManifest(
+        configuration_id=configuration_id,
+        retriever_id=retriever_id,
         name=name,
         created_at=utc_now_iso(),
-        config=config,
+        configuration=configuration,
         description=description,
     )
-def create_run_manifest(
+def create_snapshot_manifest(
     corpus: Corpus,
     *,
-    recipe: RecipeManifest,
+    configuration: ConfigurationManifest,
     stats: Dict[str, Any],
-    artifact_paths: Optional[List[str]] = None,
-) -> RetrievalRun:
+    snapshot_artifacts: Optional[List[str]] = None,
+) -> RetrievalSnapshot:
     """
-    Create a retrieval run manifest tied to the current catalog snapshot.
+    Create a retrieval snapshot manifest tied to the current catalog snapshot.
-    :param corpus: Corpus used to generate the run.
+    :param corpus: Corpus used to generate the snapshot.
     :type corpus: Corpus
-    :param recipe: Recipe manifest for the run.
-    :type recipe: RecipeManifest
-    :param stats: Backend-specific run statistics.
+    :param configuration: Configuration manifest for the snapshot.
+    :type configuration: ConfigurationManifest
+    :param stats: Retriever-specific snapshot statistics.
     :type stats: dict[str, Any]
-    :param artifact_paths: Optional relative paths to materialized artifacts.
-    :type artifact_paths: list[str] or None
-    :return: Run manifest.
-    :rtype: RetrievalRun
+    :param snapshot_artifacts: Optional relative paths to materialized artifacts.
+    :type snapshot_artifacts: list[str] or None
+    :return: Snapshot manifest.
+    :rtype: RetrievalSnapshot
     """
     catalog = corpus.load_catalog()
     created_at = utc_now_iso()
-    run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
-    return RetrievalRun(
-        run_id=run_id,
-        recipe=recipe,
+    snapshot_id = hashlib.sha256(
+        f"{configuration.configuration_id}:{created_at}".encode("utf-8")
+    ).hexdigest()
+    return RetrievalSnapshot(
+        snapshot_id=snapshot_id,
+        configuration=configuration,
         corpus_uri=catalog.corpus_uri,
         catalog_generated_at=catalog.generated_at,
         created_at=created_at,
-        artifact_paths=list(artifact_paths or []),
+        snapshot_artifacts=list(snapshot_artifacts or []),
         stats=stats,
     )

biblicus/retrievers/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""
+Retriever registry for Biblicus retrieval engines.
+"""
+from __future__ import annotations
+from typing import Dict, Type
+from .base import Retriever
+from .embedding_index_file import EmbeddingIndexFileRetriever
+from .embedding_index_inmemory import EmbeddingIndexInMemoryRetriever
+from .hybrid import HybridRetriever
+from .scan import ScanRetriever
+from .sqlite_full_text_search import SqliteFullTextSearchRetriever
+from .tf_vector import TfVectorRetriever
+def available_retrievers() -> Dict[str, Type[Retriever]]:
+    """
+    Return the registered retrievers.
+    :return: Mapping of retriever identifiers to retriever classes.
+    :rtype: dict[str, Type[Retriever]]
+    """
+    return {
+        EmbeddingIndexFileRetriever.retriever_id: EmbeddingIndexFileRetriever,
+        EmbeddingIndexInMemoryRetriever.retriever_id: EmbeddingIndexInMemoryRetriever,
+        HybridRetriever.retriever_id: HybridRetriever,
+        ScanRetriever.retriever_id: ScanRetriever,
+        SqliteFullTextSearchRetriever.retriever_id: SqliteFullTextSearchRetriever,
+        TfVectorRetriever.retriever_id: TfVectorRetriever,
+    }
+def get_retriever(retriever_id: str) -> Retriever:
+    """
+    Instantiate a retriever by identifier.
+    :param retriever_id: Retriever identifier.
+    :type retriever_id: str
+    :return: Retriever instance.
+    :rtype: Retriever
+    :raises KeyError: If the retriever identifier is unknown.
+    """
+    registry = available_retrievers()
+    retriever_class = registry.get(retriever_id)
+    if retriever_class is None:
+        known = ", ".join(sorted(registry))
+        raise KeyError(f"Unknown retriever '{retriever_id}'. Known retrievers: {known}")
+    return retriever_class()

biblicus/retrievers/base.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""
+Retriever interface for Biblicus retrieval engines.
+"""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Dict
+from ..corpus import Corpus
+from ..models import QueryBudget, RetrievalResult, RetrievalSnapshot
+class Retriever(ABC):
+    """
+    Abstract interface for retrievers.
+    :ivar retriever_id: Identifier string for the retriever.
+    :vartype retriever_id: str
+    """
+    retriever_id: str
+    @abstractmethod
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
+        """
+        Build or register a retrieval snapshot for the retriever.
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param configuration_name: Human name for the configuration.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: RetrievalSnapshot
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        snapshot: RetrievalSnapshot,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Run a retrieval query against a retriever.
+        :param corpus: Corpus associated with the snapshot.
+        :type corpus: Corpus
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: RetrievalSnapshot
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: RetrievalResult
+        """
+        raise NotImplementedError

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl