PyPI - biblicus - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

biblicus/__init__.py +5 -5
biblicus/analysis/__init__.py +1 -1
biblicus/analysis/base.py +10 -10
biblicus/analysis/markov.py +78 -68
biblicus/analysis/models.py +47 -47
biblicus/analysis/profiling.py +58 -48
biblicus/analysis/topic_modeling.py +56 -51
biblicus/cli.py +224 -177
biblicus/{recipes.py → configuration.py} +14 -14
biblicus/constants.py +2 -2
biblicus/context_engine/assembler.py +49 -19
biblicus/context_engine/retrieval.py +46 -42
biblicus/corpus.py +116 -108
biblicus/errors.py +3 -3
biblicus/evaluation.py +27 -25
biblicus/extraction.py +103 -98
biblicus/extraction_evaluation.py +26 -26
biblicus/extractors/deepgram_stt.py +7 -7
biblicus/extractors/docling_granite_text.py +11 -11
biblicus/extractors/docling_smol_text.py +11 -11
biblicus/extractors/markitdown_text.py +4 -4
biblicus/extractors/openai_stt.py +7 -7
biblicus/extractors/paddleocr_vl_text.py +20 -18
biblicus/extractors/pipeline.py +8 -8
biblicus/extractors/rapidocr_text.py +3 -3
biblicus/extractors/unstructured_text.py +3 -3
biblicus/hooks.py +4 -4
biblicus/knowledge_base.py +33 -31
biblicus/models.py +78 -78
biblicus/retrieval.py +47 -40
biblicus/retrievers/__init__.py +50 -0
biblicus/retrievers/base.py +65 -0
biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
biblicus/retrievers/hybrid.py +301 -0
biblicus/{backends → retrievers}/scan.py +83 -73
biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
biblicus/{backends → retrievers}/tf_vector.py +87 -77
biblicus/text/prompts.py +16 -8
biblicus/text/tool_loop.py +63 -5
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
biblicus-1.1.0.dist-info/RECORD +91 -0
biblicus/backends/__init__.py +0 -50
biblicus/backends/base.py +0 -65
biblicus/backends/hybrid.py +0 -292
biblicus-1.0.0.dist-info/RECORD +0 -91
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0

biblicus/{backends → retrievers}/embedding_index_inmemory.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-Embedding-index retrieval backend that loads the full embedding matrix into memory at query time.
+Embedding-index retriever that loads the full embedding matrix into memory at query time.
 """
 from __future__ import annotations
@@ -10,15 +10,26 @@ import numpy as np
 from pydantic import ConfigDict, Field
 from ..corpus import Corpus
-from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
-from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..models import (
+    Evidence,
+    ExtractionSnapshotReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalSnapshot,
+)
+from ..retrieval import (
+    apply_budget,
+    create_configuration_manifest,
+    create_snapshot_manifest,
+    hash_text,
+)
 from ..time import utc_now_iso
 from .embedding_index_common import (
     ChunkRecord,
-    EmbeddingIndexRecipeConfig,
+    EmbeddingIndexConfiguration,
     _build_snippet,
     _extract_span_text,
-    artifact_paths_for_run,
+    artifact_paths_for_snapshot,
     chunks_to_records,
     collect_chunks,
     cosine_similarity_scores,
@@ -30,7 +41,7 @@ from .embedding_index_common import (
 )
-class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
+class EmbeddingIndexInMemoryConfiguration(EmbeddingIndexConfiguration):
     """
     Configuration for embedding-index-inmemory retrieval.
@@ -43,52 +54,59 @@ class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
     maximum_cache_total_items: int = Field(default=25000, ge=1)
-class EmbeddingIndexInMemoryBackend:
+class EmbeddingIndexInMemoryRetriever:
     """
-    Embedding retrieval backend using an in-memory similarity scan.
+    Embedding retrieval retriever using an in-memory similarity scan.
     """
-    backend_id = "embedding-index-inmemory"
+    retriever_id = "embedding-index-inmemory"
-    def build_run(
-        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
-    ) -> RetrievalRun:
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
         """
-        Build an embedding index run by chunking text payloads and materializing embeddings.
+        Build an embedding index snapshot by chunking text payloads and materializing embeddings.
         :param corpus: Corpus to build against.
         :type corpus: Corpus
-        :param recipe_name: Human-readable recipe name.
-        :type recipe_name: str
-        :param config: Backend-specific configuration values.
-        :type config: dict[str, object]
-        :return: Run manifest describing the build.
-        :rtype: biblicus.models.RetrievalRun
+        :param configuration_name: Human-readable configuration name.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: biblicus.models.RetrievalSnapshot
         """
-        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(config)
-        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
-        if len(chunks) > recipe_config.maximum_cache_total_items:
+        parsed_config = EmbeddingIndexInMemoryConfiguration.model_validate(configuration)
+        chunks, text_items = collect_chunks(corpus, configuration=parsed_config)
+        if len(chunks) > parsed_config.maximum_cache_total_items:
             raise ValueError(
                 "embedding-index-inmemory exceeded maximum_cache_total_items. "
                 "Use embedding-index-file or increase maximum_cache_total_items."
             )
-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         chunk_texts = [chunk.text for chunk in chunks]
         embeddings = provider.embed_texts(chunk_texts)
         embeddings = embeddings.astype(np.float32)
-        recipe = create_recipe_manifest(
-            backend_id=self.backend_id,
-            name=recipe_name,
-            config=recipe_config.model_dump(),
+        configuration_manifest = create_configuration_manifest(
+            retriever_id=self.retriever_id,
+            name=configuration_name,
+            configuration=parsed_config.model_dump(),
+        )
+        snapshot = create_snapshot_manifest(
+            corpus,
+            configuration=configuration_manifest,
+            stats={},
+            snapshot_artifacts=[],
         )
-        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
-        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+        corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
         write_embeddings(embeddings_path, embeddings)
         write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
@@ -100,30 +118,33 @@ class EmbeddingIndexInMemoryBackend:
             "dimensions": (
                 int(embeddings.shape[1])
                 if embeddings.size
-                else recipe_config.embedding_provider.dimensions
+                else parsed_config.embedding_provider.dimensions
             ),
         }
-        run = run.model_copy(
-            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
+        snapshot = snapshot.model_copy(
+            update={
+                "snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
+                "stats": stats,
+            }
         )
-        corpus.write_run(run)
-        return run
+        corpus.write_snapshot(snapshot)
+        return snapshot
     def query(
         self,
         corpus: Corpus,
         *,
-        run: RetrievalRun,
+        snapshot: RetrievalSnapshot,
         query_text: str,
         budget: QueryBudget,
     ) -> RetrievalResult:
         """
-        Query an embedding index run and return ranked evidence.
+        Query an embedding index snapshot and return ranked evidence.
-        :param corpus: Corpus associated with the run.
+        :param corpus: Corpus associated with the snapshot.
         :type corpus: Corpus
-        :param run: Run manifest to use for querying.
-        :type run: biblicus.models.RetrievalRun
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: biblicus.models.RetrievalSnapshot
         :param query_text: Query text to embed.
         :type query_text: str
         :param budget: Evidence selection budget.
@@ -131,14 +152,18 @@ class EmbeddingIndexInMemoryBackend:
         :return: Retrieval results containing evidence.
         :rtype: biblicus.models.RetrievalResult
         """
-        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(run.recipe.config)
-        extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+        parsed_config = EmbeddingIndexInMemoryConfiguration.model_validate(
+            snapshot.configuration.configuration
+        )
+        extraction_reference = resolve_extraction_reference(corpus, parsed_config)
-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
         if not embeddings_path.is_file() or not chunks_path.is_file():
-            raise FileNotFoundError("Embedding index artifacts are missing for this run")
+            raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")
         embeddings = read_embeddings(embeddings_path, mmap=False).astype(np.float32)
         chunk_records = read_chunks_jsonl(chunks_path)
@@ -148,7 +173,7 @@ class EmbeddingIndexInMemoryBackend:
                 "embeddings row count does not match chunk record count"
             )
-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         query_embedding = provider.embed_texts([query_text]).astype(np.float32)
         if query_embedding.shape[0] != 1:
             raise ValueError("Embedding provider returned an invalid query embedding shape")
@@ -160,8 +185,8 @@ class EmbeddingIndexInMemoryBackend:
         )
         evidence_items = _build_evidence(
             corpus,
-            run=run,
-            recipe_config=recipe_config,
+            snapshot=snapshot,
+            configuration=parsed_config,
             candidates=candidates,
             scores=scores,
             chunk_records=chunk_records,
@@ -169,7 +194,11 @@ class EmbeddingIndexInMemoryBackend:
         )
         ranked = [
             item.model_copy(
-                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
+                update={
+                    "rank": index,
+                    "configuration_id": snapshot.configuration.configuration_id,
+                    "snapshot_id": snapshot.snapshot_id,
+                }
             )
             for index, item in enumerate(evidence_items, start=1)
         ]
@@ -177,9 +206,9 @@ class EmbeddingIndexInMemoryBackend:
         return RetrievalResult(
             query_text=query_text,
             budget=budget,
-            run_id=run.run_id,
-            recipe_id=run.recipe.recipe_id,
-            backend_id=self.backend_id,
+            snapshot_id=snapshot.snapshot_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            retriever_id=snapshot.configuration.retriever_id,
             generated_at=utc_now_iso(),
             evidence=evidence,
             stats={"candidates": len(evidence_items), "returned": len(evidence)},
@@ -202,12 +231,12 @@ def _top_indices(scores: np.ndarray, *, limit: int) -> List[int]:
 def _build_evidence(
     corpus: Corpus,
     *,
-    run: RetrievalRun,
-    recipe_config: EmbeddingIndexInMemoryRecipeConfig,
+    snapshot: RetrievalSnapshot,
+    configuration: EmbeddingIndexInMemoryConfiguration,
     candidates: List[int],
     scores: np.ndarray,
     chunk_records: List[ChunkRecord],
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> List[Evidence]:
     catalog = corpus.load_catalog()
     evidence_items: List[Evidence] = []
@@ -226,7 +255,7 @@ def _build_evidence(
             media_type=media_type,
             extraction_reference=extraction_reference,
         )
-        span_text = _build_snippet(text, (span_start, span_end), recipe_config.snippet_characters)
+        span_text = _build_snippet(text, (span_start, span_end), configuration.snippet_characters)
         if span_text is None:
             span_text = _extract_span_text(text, (span_start, span_end))
         evidence_items.append(
@@ -240,10 +269,10 @@ def _build_evidence(
                 content_ref=None,
                 span_start=span_start,
                 span_end=span_end,
-                stage=EmbeddingIndexInMemoryBackend.backend_id,
+                stage=EmbeddingIndexInMemoryRetriever.retriever_id,
                 stage_scores=None,
-                recipe_id=run.recipe.recipe_id,
-                run_id=run.run_id,
+                configuration_id=snapshot.configuration.configuration_id,
+                snapshot_id=snapshot.snapshot_id,
                 metadata=getattr(catalog_item, "metadata", {}) or {},
                 hash=hash_text(span_text or ""),
             )
@@ -257,7 +286,7 @@ def _load_text_for_evidence(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     from .embedding_index_common import _load_text_from_item

biblicus/retrievers/hybrid.py ADDED Viewed

@@ -0,0 +1,301 @@
+"""
+Hybrid retriever combining lexical and vector results.
+"""
+from __future__ import annotations
+from typing import Dict, List, Optional
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from ..corpus import Corpus
+from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalSnapshot
+from ..retrieval import apply_budget, create_configuration_manifest, create_snapshot_manifest
+from ..time import utc_now_iso
+class HybridConfiguration(BaseModel):
+    """
+    Configuration for hybrid retrieval fusion.
+    :ivar lexical_retriever: Retriever identifier for lexical retrieval.
+    :vartype lexical_retriever: str
+    :ivar embedding_retriever: Retriever identifier for embedding retrieval.
+    :vartype embedding_retriever: str
+    :ivar lexical_weight: Weight for lexical scores.
+    :vartype lexical_weight: float
+    :ivar embedding_weight: Weight for embedding scores.
+    :vartype embedding_weight: float
+    :ivar lexical_configuration: Optional lexical retriever configuration.
+    :vartype lexical_configuration: dict[str, object]
+    :ivar embedding_configuration: Optional embedding retriever configuration.
+    :vartype embedding_configuration: dict[str, object]
+    """
+    model_config = ConfigDict(extra="forbid")
+    lexical_retriever: str = Field(default="sqlite-full-text-search", min_length=1)
+    embedding_retriever: str = Field(default="tf-vector", min_length=1)
+    lexical_weight: float = Field(default=0.5, ge=0, le=1)
+    embedding_weight: float = Field(default=0.5, ge=0, le=1)
+    lexical_configuration: Dict[str, object] = Field(default_factory=dict)
+    embedding_configuration: Dict[str, object] = Field(default_factory=dict)
+    @model_validator(mode="after")
+    def _validate_weights(self) -> "HybridConfiguration":
+        if abs((self.lexical_weight + self.embedding_weight) - 1.0) > 1e-6:
+            raise ValueError("weights must sum to 1")
+        return self
+class HybridRetriever:
+    """
+    Hybrid retriever that fuses lexical and embedding retrieval.
+    :ivar retriever_id: Retriever identifier.
+    :vartype retriever_id: str
+    """
+    retriever_id = "hybrid"
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
+        """
+        Build or register a hybrid retrieval snapshot.
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param configuration_name: Human-readable configuration name.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: RetrievalSnapshot
+        """
+        parsed_config = HybridConfiguration.model_validate(configuration)
+        _ensure_retriever_supported(parsed_config)
+        lexical_retriever = _resolve_retriever(parsed_config.lexical_retriever)
+        embedding_retriever = _resolve_retriever(parsed_config.embedding_retriever)
+        lexical_snapshot = lexical_retriever.build_snapshot(
+            corpus,
+            configuration_name=f"{configuration_name}-lexical",
+            configuration=parsed_config.lexical_configuration,
+        )
+        embedding_snapshot = embedding_retriever.build_snapshot(
+            corpus,
+            configuration_name=f"{configuration_name}-embedding",
+            configuration=parsed_config.embedding_configuration,
+        )
+        configuration_manifest = create_configuration_manifest(
+            retriever_id=self.retriever_id,
+            name=configuration_name,
+            configuration=parsed_config.model_dump(),
+        )
+        stats = {
+            "lexical_snapshot_id": lexical_snapshot.snapshot_id,
+            "embedding_snapshot_id": embedding_snapshot.snapshot_id,
+        }
+        snapshot = create_snapshot_manifest(
+            corpus,
+            configuration=configuration_manifest,
+            stats=stats,
+            snapshot_artifacts=[],
+        )
+        corpus.write_snapshot(snapshot)
+        return snapshot
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        snapshot: RetrievalSnapshot,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Query using both lexical and embedding retrievers and fuse scores.
+        :param corpus: Corpus associated with the snapshot.
+        :type corpus: Corpus
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: RetrievalSnapshot
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: RetrievalResult
+        """
+        configuration = HybridConfiguration.model_validate(snapshot.configuration.configuration)
+        _ensure_retriever_supported(configuration)
+        lexical_retriever = _resolve_retriever(configuration.lexical_retriever)
+        embedding_retriever = _resolve_retriever(configuration.embedding_retriever)
+        lexical_snapshot_id = snapshot.stats.get("lexical_snapshot_id")
+        embedding_snapshot_id = snapshot.stats.get("embedding_snapshot_id")
+        if not lexical_snapshot_id or not embedding_snapshot_id:
+            raise ValueError("Hybrid snapshot missing lexical or embedding snapshot identifiers")
+        lexical_snapshot = corpus.load_snapshot(str(lexical_snapshot_id))
+        embedding_snapshot = corpus.load_snapshot(str(embedding_snapshot_id))
+        component_budget = _expand_component_budget(budget)
+        lexical_result = lexical_retriever.query(
+            corpus, snapshot=lexical_snapshot, query_text=query_text, budget=component_budget
+        )
+        embedding_result = embedding_retriever.query(
+            corpus, snapshot=embedding_snapshot, query_text=query_text, budget=component_budget
+        )
+        candidates = _fuse_evidence(
+            lexical_result.evidence,
+            embedding_result.evidence,
+            lexical_weight=configuration.lexical_weight,
+            embedding_weight=configuration.embedding_weight,
+        )
+        sorted_candidates = sorted(
+            candidates,
+            key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
+        )
+        ranked = [
+            evidence_item.model_copy(
+                update={
+                    "rank": index,
+                    "configuration_id": snapshot.configuration.configuration_id,
+                    "snapshot_id": snapshot.snapshot_id,
+                }
+            )
+            for index, evidence_item in enumerate(sorted_candidates, start=1)
+        ]
+        evidence = apply_budget(ranked, budget)
+        stats = {
+            "candidates": len(sorted_candidates),
+            "returned": len(evidence),
+            "fusion_weights": {
+                "lexical": configuration.lexical_weight,
+                "embedding": configuration.embedding_weight,
+            },
+        }
+        return RetrievalResult(
+            query_text=query_text,
+            budget=budget,
+            snapshot_id=snapshot.snapshot_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            retriever_id=snapshot.configuration.retriever_id,
+            generated_at=utc_now_iso(),
+            evidence=evidence,
+            stats=stats,
+        )
+def _ensure_retriever_supported(configuration: HybridConfiguration) -> None:
+    """
+    Validate that hybrid retrievers do not reference the hybrid retriever itself.
+    :param configuration: Parsed hybrid configuration.
+    :type configuration: HybridConfiguration
+    :return: None.
+    :rtype: None
+    :raises ValueError: If hybrid is used as a component retriever.
+    """
+    if configuration.lexical_retriever == HybridRetriever.retriever_id:
+        raise ValueError("Hybrid retriever cannot use itself as the lexical retriever")
+    if configuration.embedding_retriever == HybridRetriever.retriever_id:
+        raise ValueError("Hybrid retriever cannot use itself as the embedding retriever")
+def _resolve_retriever(retriever_id: str):
+    """
+    Resolve a retriever by identifier.
+    :param retriever_id: Retriever identifier.
+    :type retriever_id: str
+    :return: Retriever instance.
+    :rtype: object
+    """
+    from biblicus.retrievers import get_retriever  # Delayed import to avoid circular import
+    return get_retriever(retriever_id)
+def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
+    """
+    Expand a final budget to collect more candidates for fusion.
+    :param budget: Final evidence budget.
+    :type budget: QueryBudget
+    :param multiplier: Candidate expansion multiplier.
+    :type multiplier: int
+    :return: Expanded budget for component retrievers.
+    :rtype: QueryBudget
+    """
+    maximum_total_characters = budget.maximum_total_characters
+    expanded_characters = (
+        maximum_total_characters * multiplier if maximum_total_characters is not None else None
+    )
+    expanded_max_items_per_source = (
+        budget.max_items_per_source * multiplier
+        if budget.max_items_per_source is not None
+        else None
+    )
+    requested_items = budget.max_total_items + budget.offset
+    return QueryBudget(
+        max_total_items=requested_items * multiplier,
+        offset=0,
+        maximum_total_characters=expanded_characters,
+        max_items_per_source=expanded_max_items_per_source,
+    )
+def _fuse_evidence(
+    lexical: List[Evidence],
+    embedding: List[Evidence],
+    *,
+    lexical_weight: float,
+    embedding_weight: float,
+) -> List[Evidence]:
+    """
+    Fuse lexical and embedding evidence lists into hybrid candidates.
+    :param lexical: Lexical evidence list.
+    :type lexical: list[Evidence]
+    :param embedding: Embedding evidence list.
+    :type embedding: list[Evidence]
+    :param lexical_weight: Lexical score weight.
+    :type lexical_weight: float
+    :param embedding_weight: Embedding score weight.
+    :type embedding_weight: float
+    :return: Hybrid evidence list.
+    :rtype: list[Evidence]
+    """
+    merged: Dict[str, Dict[str, Optional[Evidence]]] = {}
+    for evidence_item in lexical:
+        merged.setdefault(evidence_item.item_id, {})["lexical"] = evidence_item
+    for evidence_item in embedding:
+        merged.setdefault(evidence_item.item_id, {})["embedding"] = evidence_item
+    candidates: List[Evidence] = []
+    for item_id, sources in merged.items():
+        lexical_evidence = sources.get("lexical")
+        embedding_evidence = sources.get("embedding")
+        lexical_score = lexical_evidence.score if lexical_evidence else 0.0
+        embedding_score = embedding_evidence.score if embedding_evidence else 0.0
+        combined_score = (lexical_score * lexical_weight) + (embedding_score * embedding_weight)
+        base_evidence = lexical_evidence or embedding_evidence
+        candidates.append(
+            Evidence(
+                item_id=item_id,
+                source_uri=base_evidence.source_uri,
+                media_type=base_evidence.media_type,
+                score=combined_score,
+                rank=1,
+                text=base_evidence.text,
+                content_ref=base_evidence.content_ref,
+                span_start=base_evidence.span_start,
+                span_end=base_evidence.span_end,
+                stage="hybrid",
+                stage_scores={"lexical": lexical_score, "embedding": embedding_score},
+                configuration_id="",
+                snapshot_id="",
+                metadata=base_evidence.metadata,
+                hash=base_evidence.hash,
+            )
+        )
+    return candidates

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl