PyPI - biblicus - Versions diffs - 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

biblicus/__init__.py +25 -5
biblicus/analysis/__init__.py +1 -1
biblicus/analysis/base.py +10 -10
biblicus/analysis/markov.py +78 -68
biblicus/analysis/models.py +47 -47
biblicus/analysis/profiling.py +58 -48
biblicus/analysis/topic_modeling.py +56 -51
biblicus/cli.py +248 -191
biblicus/{recipes.py → configuration.py} +14 -14
biblicus/constants.py +2 -2
biblicus/context.py +27 -12
biblicus/context_engine/__init__.py +53 -0
biblicus/context_engine/assembler.py +1090 -0
biblicus/context_engine/compaction.py +110 -0
biblicus/context_engine/models.py +423 -0
biblicus/context_engine/retrieval.py +133 -0
biblicus/corpus.py +233 -124
biblicus/errors.py +27 -3
biblicus/evaluation.py +27 -25
biblicus/extraction.py +103 -98
biblicus/extraction_evaluation.py +26 -26
biblicus/extractors/deepgram_stt.py +7 -7
biblicus/extractors/docling_granite_text.py +11 -11
biblicus/extractors/docling_smol_text.py +11 -11
biblicus/extractors/markitdown_text.py +4 -4
biblicus/extractors/openai_stt.py +7 -7
biblicus/extractors/paddleocr_vl_text.py +20 -18
biblicus/extractors/pipeline.py +8 -8
biblicus/extractors/rapidocr_text.py +3 -3
biblicus/extractors/unstructured_text.py +3 -3
biblicus/hooks.py +4 -4
biblicus/knowledge_base.py +34 -32
biblicus/models.py +84 -81
biblicus/retrieval.py +49 -42
biblicus/retrievers/__init__.py +50 -0
biblicus/retrievers/base.py +65 -0
biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
biblicus/retrievers/hybrid.py +301 -0
biblicus/{backends → retrievers}/scan.py +84 -73
biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
biblicus/{backends → retrievers}/tf_vector.py +103 -100
biblicus/sources.py +46 -11
biblicus/text/link.py +6 -0
biblicus/text/prompts.py +18 -8
biblicus/text/tool_loop.py +63 -5
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
biblicus-1.1.0.dist-info/RECORD +91 -0
biblicus/backends/__init__.py +0 -50
biblicus/backends/base.py +0 -65
biblicus/backends/hybrid.py +0 -291
biblicus-0.16.0.dist-info/RECORD +0 -86
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0

biblicus/knowledge_base.py CHANGED Viewed

@@ -11,7 +11,6 @@ from typing import List, Optional, Sequence
 from pydantic import BaseModel, ConfigDict, Field
-from .backends import get_backend
 from .context import (
     ContextPack,
     ContextPackPolicy,
@@ -20,17 +19,18 @@ from .context import (
     fit_context_pack_to_token_budget,
 )
 from .corpus import Corpus
-from .models import QueryBudget, RetrievalResult, RetrievalRun
+from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
+from .retrievers import get_retriever
 class KnowledgeBaseDefaults(BaseModel):
     """
     Default configuration for a knowledge base workflow.
-    :ivar backend_id: Backend identifier to use for retrieval.
-    :vartype backend_id: str
-    :ivar recipe_name: Human-readable retrieval recipe name.
-    :vartype recipe_name: str
+    :ivar retriever_id: Retriever identifier to use for retrieval.
+    :vartype retriever_id: str
+    :ivar configuration_name: Human-readable retrieval configuration name.
+    :vartype configuration_name: str
     :ivar query_budget: Default query budget to apply to retrieval.
     :vartype query_budget: QueryBudget
     :ivar tags: Tags to apply when importing the folder.
@@ -39,12 +39,12 @@ class KnowledgeBaseDefaults(BaseModel):
     model_config = ConfigDict(extra="forbid")
-    backend_id: str = Field(default="scan", min_length=1)
-    recipe_name: str = Field(default="Knowledge base", min_length=1)
+    retriever_id: str = Field(default="scan", min_length=1)
+    configuration_name: str = Field(default="Knowledge base", min_length=1)
     query_budget: QueryBudget = Field(
         default_factory=lambda: QueryBudget(
             max_total_items=5,
-            max_total_characters=2000,
+            maximum_total_characters=2000,
             max_items_per_source=None,
         )
     )
@@ -58,17 +58,17 @@ class KnowledgeBase:
     :ivar corpus: Corpus instance that stores the ingested items.
     :vartype corpus: Corpus
-    :ivar backend_id: Backend identifier used for retrieval.
-    :vartype backend_id: str
-    :ivar run: Retrieval run manifest associated with the knowledge base.
-    :vartype run: RetrievalRun
+    :ivar retriever_id: Retriever identifier used for retrieval.
+    :vartype retriever_id: str
+    :ivar snapshot: Retrieval snapshot manifest associated with the knowledge base.
+    :vartype snapshot: RetrievalSnapshot
     :ivar defaults: Default configuration used for this knowledge base.
     :vartype defaults: KnowledgeBaseDefaults
     """
     corpus: Corpus
-    backend_id: str
-    run: RetrievalRun
+    retriever_id: str
+    snapshot: RetrievalSnapshot
     defaults: KnowledgeBaseDefaults
     _temp_dir: Optional[TemporaryDirectory]
@@ -77,8 +77,8 @@ class KnowledgeBase:
         cls,
         folder: str | Path,
         *,
-        backend_id: Optional[str] = None,
-        recipe_name: Optional[str] = None,
+        retriever_id: Optional[str] = None,
+        configuration_name: Optional[str] = None,
         query_budget: Optional[QueryBudget] = None,
         tags: Optional[Sequence[str]] = None,
         corpus_root: Optional[str | Path] = None,
@@ -88,10 +88,10 @@ class KnowledgeBase:
         :param folder: Folder containing source files.
         :type folder: str or Path
-        :param backend_id: Optional backend identifier override.
-        :type backend_id: str or None
-        :param recipe_name: Optional recipe name override.
-        :type recipe_name: str or None
+        :param retriever_id: Optional retriever identifier override.
+        :type retriever_id: str or None
+        :param configuration_name: Optional configuration name override.
+        :type configuration_name: str or None
         :param query_budget: Optional query budget override.
         :type query_budget: QueryBudget or None
         :param tags: Optional tags to apply during import.
@@ -110,8 +110,8 @@ class KnowledgeBase:
             raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")
         defaults = KnowledgeBaseDefaults()
-        resolved_backend_id = backend_id or defaults.backend_id
-        resolved_recipe_name = recipe_name or defaults.recipe_name
+        resolved_retriever_id = retriever_id or defaults.retriever_id
+        resolved_configuration_name = configuration_name or defaults.configuration_name
         resolved_query_budget = query_budget or defaults.query_budget
         resolved_tags = list(tags) if tags is not None else defaults.tags
@@ -125,16 +125,18 @@ class KnowledgeBase:
         corpus = Corpus.init(corpus_root_path)
         corpus.import_tree(source_root, tags=resolved_tags)
-        backend = get_backend(resolved_backend_id)
-        run = backend.build_run(corpus, recipe_name=resolved_recipe_name, config={})
+        retriever = get_retriever(resolved_retriever_id)
+        snapshot = retriever.build_snapshot(
+            corpus, configuration_name=resolved_configuration_name, configuration={}
+        )
         return cls(
             corpus=corpus,
-            backend_id=resolved_backend_id,
-            run=run,
+            retriever_id=resolved_retriever_id,
+            snapshot=snapshot,
             defaults=KnowledgeBaseDefaults(
-                backend_id=resolved_backend_id,
-                recipe_name=resolved_recipe_name,
+                retriever_id=resolved_retriever_id,
+                configuration_name=resolved_configuration_name,
                 query_budget=resolved_query_budget,
                 tags=resolved_tags,
             ),
@@ -152,11 +154,11 @@ class KnowledgeBase:
         :return: Retrieval result containing evidence.
         :rtype: RetrievalResult
         """
-        backend = get_backend(self.backend_id)
+        retriever = get_retriever(self.retriever_id)
         resolved_budget = budget or self.defaults.query_budget
-        return backend.query(
+        return retriever.query(
             self.corpus,
-            run=self.run,
+            snapshot=self.snapshot,
             query_text=query_text,
             budget=resolved_budget,
         )

biblicus/models.py CHANGED Viewed

@@ -117,8 +117,8 @@ class CorpusCatalog(BaseModel):
     :vartype corpus_uri: str
     :ivar raw_dir: Relative path to the raw items folder.
     :vartype raw_dir: str
-    :ivar latest_run_id: Latest retrieval run identifier, if any.
-    :vartype latest_run_id: str or None
+    :ivar latest_snapshot_id: Latest retrieval snapshot identifier, if any.
+    :vartype latest_snapshot_id: str or None
     :ivar items: Mapping of item IDs to catalog entries.
     :vartype items: dict[str, CatalogItem]
     :ivar order: Display order of item IDs (most recent first).
@@ -131,7 +131,7 @@ class CorpusCatalog(BaseModel):
     generated_at: str
     corpus_uri: str
     raw_dir: str = "raw"
-    latest_run_id: Optional[str] = None
+    latest_snapshot_id: Optional[str] = None
     items: Dict[str, CatalogItem] = Field(default_factory=dict)
     order: List[str] = Field(default_factory=list)
@@ -142,79 +142,79 @@ class CorpusCatalog(BaseModel):
         return self
-class ExtractionRunReference(BaseModel):
+class ExtractionSnapshotReference(BaseModel):
     """
-    Reference to an extraction run.
+    Reference to an extraction snapshot.
     :ivar extractor_id: Extractor plugin identifier.
     :vartype extractor_id: str
-    :ivar run_id: Extraction run identifier.
-    :vartype run_id: str
+    :ivar snapshot_id: Extraction snapshot identifier.
+    :vartype snapshot_id: str
     """
     model_config = ConfigDict(extra="forbid")
     extractor_id: str = Field(min_length=1)
-    run_id: str = Field(min_length=1)
+    snapshot_id: str = Field(min_length=1)
     def as_string(self) -> str:
         """
         Serialize the reference as a single string.
-        :return: Reference in the form extractor_id:run_id.
+        :return: Reference in the form extractor_id:snapshot_id.
         :rtype: str
         """
-        return f"{self.extractor_id}:{self.run_id}"
+        return f"{self.extractor_id}:{self.snapshot_id}"
-def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
+def parse_extraction_snapshot_reference(value: str) -> ExtractionSnapshotReference:
     """
-    Parse an extraction run reference in the form extractor_id:run_id.
+    Parse an extraction snapshot reference in the form extractor_id:snapshot_id.
     :param value: Raw reference string.
     :type value: str
-    :return: Parsed extraction run reference.
-    :rtype: ExtractionRunReference
+    :return: Parsed extraction snapshot reference.
+    :rtype: ExtractionSnapshotReference
     :raises ValueError: If the reference is not well formed.
     """
     if ":" not in value:
-        raise ValueError("Extraction run reference must be extractor_id:run_id")
-    extractor_id, run_id = value.split(":", 1)
+        raise ValueError("Extraction snapshot reference must be extractor_id:snapshot_id")
+    extractor_id, snapshot_id = value.split(":", 1)
     extractor_id = extractor_id.strip()
-    run_id = run_id.strip()
-    if not extractor_id or not run_id:
+    snapshot_id = snapshot_id.strip()
+    if not extractor_id or not snapshot_id:
         raise ValueError(
-            "Extraction run reference must be extractor_id:run_id with non-empty parts"
+            "Extraction snapshot reference must be extractor_id:snapshot_id with non-empty parts"
         )
-    return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
+    return ExtractionSnapshotReference(extractor_id=extractor_id, snapshot_id=snapshot_id)
-class ExtractionRunListEntry(BaseModel):
+class ExtractionSnapshotListEntry(BaseModel):
     """
-    Summary entry for an extraction run stored in a corpus.
+    Summary entry for an extraction snapshot stored in a corpus.
     :ivar extractor_id: Extractor plugin identifier.
     :vartype extractor_id: str
-    :ivar run_id: Extraction run identifier.
-    :vartype run_id: str
-    :ivar recipe_id: Deterministic recipe identifier.
-    :vartype recipe_id: str
-    :ivar recipe_name: Human-readable recipe name.
-    :vartype recipe_name: str
-    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :ivar snapshot_id: Extraction snapshot identifier.
+    :vartype snapshot_id: str
+    :ivar configuration_id: Deterministic configuration identifier.
+    :vartype configuration_id: str
+    :ivar configuration_name: Human-readable configuration name.
+    :vartype configuration_name: str
+    :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
     :vartype catalog_generated_at: str
-    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
     :vartype created_at: str
-    :ivar stats: Run statistics.
+    :ivar stats: Snapshot statistics.
     :vartype stats: dict[str, object]
     """
     model_config = ConfigDict(extra="forbid")
     extractor_id: str = Field(min_length=1)
-    run_id: str = Field(min_length=1)
-    recipe_id: str = Field(min_length=1)
-    recipe_name: str = Field(min_length=1)
+    snapshot_id: str = Field(min_length=1)
+    configuration_id: str = Field(min_length=1)
+    configuration_name: str = Field(min_length=1)
     catalog_generated_at: str = Field(min_length=1)
     created_at: str = Field(min_length=1)
     stats: Dict[str, object] = Field(default_factory=dict)
@@ -234,8 +234,8 @@ class QueryBudget(BaseModel):
         This enables simple pagination by re-running the same query with a
         higher offset.
     :vartype offset: int
-    :ivar max_total_characters: Optional maximum total characters across evidence text.
-    :vartype max_total_characters: int or None
+    :ivar maximum_total_characters: Optional maximum total characters across evidence text.
+    :vartype maximum_total_characters: int or None
     :ivar max_items_per_source: Optional cap per source uniform resource identifier.
     :vartype max_items_per_source: int or None
     """
@@ -244,13 +244,13 @@ class QueryBudget(BaseModel):
     max_total_items: int = Field(ge=1)
     offset: int = Field(default=0, ge=0)
-    max_total_characters: Optional[int] = Field(default=None, ge=1)
+    maximum_total_characters: Optional[int] = Field(default=None, ge=1)
     max_items_per_source: Optional[int] = Field(default=None, ge=1)
 class Evidence(BaseModel):
     """
-    Structured retrieval evidence returned from a backend.
+    Structured retrieval evidence returned from a retriever.
     :ivar item_id: Item identifier that produced the evidence.
     :vartype item_id: str
@@ -274,10 +274,12 @@ class Evidence(BaseModel):
     :vartype stage: str
     :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
     :vartype stage_scores: dict[str, float] or None
-    :ivar recipe_id: Recipe identifier used to create the run.
-    :vartype recipe_id: str
-    :ivar run_id: Retrieval run identifier.
-    :vartype run_id: str
+    :ivar configuration_id: Configuration identifier used to create the snapshot.
+    :vartype configuration_id: str
+    :ivar snapshot_id: Retrieval snapshot identifier.
+    :vartype snapshot_id: str
+    :ivar metadata: Optional metadata payload from the catalog item.
+    :vartype metadata: dict[str, Any]
     :ivar hash: Optional content hash for provenance.
     :vartype hash: str or None
     """
@@ -295,8 +297,9 @@ class Evidence(BaseModel):
     span_end: Optional[int] = None
     stage: str
     stage_scores: Optional[Dict[str, float]] = None
-    recipe_id: str
-    run_id: str
+    configuration_id: str
+    snapshot_id: str
+    metadata: Dict[str, Any] = Field(default_factory=dict)
     hash: Optional[str] = None
     @model_validator(mode="after")
@@ -308,79 +311,79 @@ class Evidence(BaseModel):
         return self
-class RecipeManifest(BaseModel):
+class ConfigurationManifest(BaseModel):
     """
-    Reproducible configuration for a retrieval backend.
+    Reproducible configuration for a retriever.
-    :ivar recipe_id: Deterministic recipe identifier.
-    :vartype recipe_id: str
-    :ivar backend_id: Backend identifier for the recipe.
-    :vartype backend_id: str
-    :ivar name: Human-readable name for the recipe.
+    :ivar configuration_id: Deterministic configuration identifier.
+    :vartype configuration_id: str
+    :ivar retriever_id: Retriever identifier for the configuration.
+    :vartype retriever_id: str
+    :ivar name: Human-readable name for the configuration.
     :vartype name: str
-    :ivar created_at: International Organization for Standardization 8601 timestamp for recipe creation.
+    :ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
     :vartype created_at: str
-    :ivar config: Backend-specific configuration values.
-    :vartype config: dict[str, Any]
+    :ivar configuration: Retriever-specific configuration values.
+    :vartype configuration: dict[str, Any]
     :ivar description: Optional human description.
     :vartype description: str or None
     """
     model_config = ConfigDict(extra="forbid")
-    recipe_id: str
-    backend_id: str
+    configuration_id: str
+    retriever_id: str
     name: str
     created_at: str
-    config: Dict[str, Any] = Field(default_factory=dict)
+    configuration: Dict[str, Any] = Field(default_factory=dict)
     description: Optional[str] = None
-class RetrievalRun(BaseModel):
+class RetrievalSnapshot(BaseModel):
     """
-    Immutable record of a retrieval materialization or on-demand run.
+    Immutable record of a retrieval snapshot.
-    :ivar run_id: Unique run identifier.
-    :vartype run_id: str
-    :ivar recipe: Recipe manifest for this run.
-    :vartype recipe: RecipeManifest
+    :ivar snapshot_id: Unique snapshot identifier.
+    :vartype snapshot_id: str
+    :ivar configuration: Configuration manifest for this snapshot.
+    :vartype configuration: ConfigurationManifest
     :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
     :vartype corpus_uri: str
-    :ivar catalog_generated_at: Catalog timestamp used for the run.
+    :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
     :vartype catalog_generated_at: str
-    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
+    :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
     :vartype created_at: str
-    :ivar artifact_paths: Relative paths to materialized artifacts.
-    :vartype artifact_paths: list[str]
-    :ivar stats: Backend-specific run statistics.
+    :ivar snapshot_artifacts: Relative paths to materialized artifacts.
+    :vartype snapshot_artifacts: list[str]
+    :ivar stats: Retriever-specific snapshot statistics.
     :vartype stats: dict[str, Any]
     """
     model_config = ConfigDict(extra="forbid")
-    run_id: str
-    recipe: RecipeManifest
+    snapshot_id: str
+    configuration: ConfigurationManifest
     corpus_uri: str
     catalog_generated_at: str
     created_at: str
-    artifact_paths: List[str] = Field(default_factory=list)
+    snapshot_artifacts: List[str] = Field(default_factory=list)
     stats: Dict[str, Any] = Field(default_factory=dict)
 class RetrievalResult(BaseModel):
     """
-    Retrieval result bundle returned from a backend query.
+    Retrieval result bundle returned from a retriever query.
     :ivar query_text: Query text issued against the backend.
     :vartype query_text: str
     :ivar budget: Evidence selection budget applied to results.
     :vartype budget: QueryBudget
-    :ivar run_id: Retrieval run identifier.
-    :vartype run_id: str
-    :ivar recipe_id: Recipe identifier used for this query.
-    :vartype recipe_id: str
-    :ivar backend_id: Backend identifier used for this query.
-    :vartype backend_id: str
+    :ivar snapshot_id: Retrieval snapshot identifier.
+    :vartype snapshot_id: str
+    :ivar configuration_id: Configuration identifier used for this query.
+    :vartype configuration_id: str
+    :ivar retriever_id: Retriever identifier used for this query.
+    :vartype retriever_id: str
     :ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
     :vartype generated_at: str
     :ivar evidence: Evidence objects selected under the budget.
@@ -393,9 +396,9 @@ class RetrievalResult(BaseModel):
     query_text: str
     budget: QueryBudget
-    run_id: str
-    recipe_id: str
-    backend_id: str
+    snapshot_id: str
+    configuration_id: str
+    retriever_id: str
     generated_at: str
     evidence: List[Evidence] = Field(default_factory=list)
     stats: Dict[str, Any] = Field(default_factory=dict)

biblicus/retrieval.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Shared retrieval helpers for Biblicus backends.
+Shared retrieval helpers for Biblicus retrievers.
 """
 from __future__ import annotations
@@ -9,75 +9,82 @@ import json
 from typing import Any, Dict, Iterable, List, Optional
 from .corpus import Corpus
-from .models import Evidence, QueryBudget, RecipeManifest, RetrievalRun
+from .models import (
+    ConfigurationManifest,
+    Evidence,
+    QueryBudget,
+    RetrievalSnapshot,
+)
 from .time import utc_now_iso
-def create_recipe_manifest(
+def create_configuration_manifest(
     *,
-    backend_id: str,
+    retriever_id: str,
     name: str,
-    config: Dict[str, Any],
+    configuration: Dict[str, Any],
     description: Optional[str] = None,
-) -> RecipeManifest:
+) -> ConfigurationManifest:
     """
-    Create a deterministic recipe manifest from a backend configuration.
+    Create a deterministic configuration manifest from a retriever configuration.
-    :param backend_id: Backend identifier for the recipe.
-    :type backend_id: str
-    :param name: Human-readable recipe name.
+    :param retriever_id: Retriever identifier for the configuration.
+    :type retriever_id: str
+    :param name: Human-readable configuration name.
     :type name: str
-    :param config: Backend-specific configuration values.
-    :type config: dict[str, Any]
-    :param description: Optional recipe description.
+    :param configuration: Retriever-specific configuration values.
+    :type configuration: dict[str, Any]
+    :param description: Optional configuration description.
     :type description: str or None
-    :return: Deterministic recipe manifest.
-    :rtype: RecipeManifest
+    :return: Deterministic configuration manifest.
+    :rtype: ConfigurationManifest
     """
-    config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
-    recipe_seed = f"{backend_id}:{config_json}"
-    recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
-    return RecipeManifest(
-        recipe_id=recipe_id,
-        backend_id=backend_id,
+    config_json = json.dumps(configuration, sort_keys=True, separators=(",", ":"))
+    configuration_seed = f"{retriever_id}:{config_json}"
+    configuration_id = hashlib.sha256(configuration_seed.encode("utf-8")).hexdigest()
+    return ConfigurationManifest(
+        configuration_id=configuration_id,
+        retriever_id=retriever_id,
         name=name,
         created_at=utc_now_iso(),
-        config=config,
+        configuration=configuration,
         description=description,
     )
-def create_run_manifest(
+def create_snapshot_manifest(
     corpus: Corpus,
     *,
-    recipe: RecipeManifest,
+    configuration: ConfigurationManifest,
     stats: Dict[str, Any],
-    artifact_paths: Optional[List[str]] = None,
-) -> RetrievalRun:
+    snapshot_artifacts: Optional[List[str]] = None,
+) -> RetrievalSnapshot:
     """
-    Create a retrieval run manifest tied to the current catalog snapshot.
+    Create a retrieval snapshot manifest tied to the current catalog snapshot.
-    :param corpus: Corpus used to generate the run.
+    :param corpus: Corpus used to generate the snapshot.
     :type corpus: Corpus
-    :param recipe: Recipe manifest for the run.
-    :type recipe: RecipeManifest
-    :param stats: Backend-specific run statistics.
+    :param configuration: Configuration manifest for the snapshot.
+    :type configuration: ConfigurationManifest
+    :param stats: Retriever-specific snapshot statistics.
     :type stats: dict[str, Any]
-    :param artifact_paths: Optional relative paths to materialized artifacts.
-    :type artifact_paths: list[str] or None
-    :return: Run manifest.
-    :rtype: RetrievalRun
+    :param snapshot_artifacts: Optional relative paths to materialized artifacts.
+    :type snapshot_artifacts: list[str] or None
+    :return: Snapshot manifest.
+    :rtype: RetrievalSnapshot
     """
     catalog = corpus.load_catalog()
     created_at = utc_now_iso()
-    run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
-    return RetrievalRun(
-        run_id=run_id,
-        recipe=recipe,
+    snapshot_id = hashlib.sha256(
+        f"{configuration.configuration_id}:{created_at}".encode("utf-8")
+    ).hexdigest()
+    return RetrievalSnapshot(
+        snapshot_id=snapshot_id,
+        configuration=configuration,
         corpus_uri=catalog.corpus_uri,
         catalog_generated_at=catalog.generated_at,
         created_at=created_at,
-        artifact_paths=list(artifact_paths or []),
+        snapshot_artifacts=list(snapshot_artifacts or []),
         stats=stats,
     )
@@ -124,8 +131,8 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
                 continue
         text_character_count = len(candidate_evidence.text or "")
-        if budget.max_total_characters is not None:
-            if total_characters + text_character_count > budget.max_total_characters:
+        if budget.maximum_total_characters is not None:
+            if total_characters + text_character_count > budget.maximum_total_characters:
                 continue
         selected_evidence.append(candidate_evidence)

biblicus/retrievers/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""
+Retriever registry for Biblicus retrieval engines.
+"""
+from __future__ import annotations
+from typing import Dict, Type
+from .base import Retriever
+from .embedding_index_file import EmbeddingIndexFileRetriever
+from .embedding_index_inmemory import EmbeddingIndexInMemoryRetriever
+from .hybrid import HybridRetriever
+from .scan import ScanRetriever
+from .sqlite_full_text_search import SqliteFullTextSearchRetriever
+from .tf_vector import TfVectorRetriever
+def available_retrievers() -> Dict[str, Type[Retriever]]:
+    """
+    Return the registered retrievers.
+    :return: Mapping of retriever identifiers to retriever classes.
+    :rtype: dict[str, Type[Retriever]]
+    """
+    return {
+        EmbeddingIndexFileRetriever.retriever_id: EmbeddingIndexFileRetriever,
+        EmbeddingIndexInMemoryRetriever.retriever_id: EmbeddingIndexInMemoryRetriever,
+        HybridRetriever.retriever_id: HybridRetriever,
+        ScanRetriever.retriever_id: ScanRetriever,
+        SqliteFullTextSearchRetriever.retriever_id: SqliteFullTextSearchRetriever,
+        TfVectorRetriever.retriever_id: TfVectorRetriever,
+    }
+def get_retriever(retriever_id: str) -> Retriever:
+    """
+    Instantiate a retriever by identifier.
+    :param retriever_id: Retriever identifier.
+    :type retriever_id: str
+    :return: Retriever instance.
+    :rtype: Retriever
+    :raises KeyError: If the retriever identifier is unknown.
+    """
+    registry = available_retrievers()
+    retriever_class = registry.get(retriever_id)
+    if retriever_class is None:
+        known = ", ".join(sorted(registry))
+        raise KeyError(f"Unknown retriever '{retriever_id}'. Known retrievers: {known}")
+    return retriever_class()

biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl