PyPI - biblicus - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

biblicus/__init__.py +5 -5
biblicus/analysis/__init__.py +1 -1
biblicus/analysis/base.py +10 -10
biblicus/analysis/markov.py +78 -68
biblicus/analysis/models.py +47 -47
biblicus/analysis/profiling.py +58 -48
biblicus/analysis/topic_modeling.py +56 -51
biblicus/cli.py +224 -177
biblicus/{recipes.py → configuration.py} +14 -14
biblicus/constants.py +2 -2
biblicus/context_engine/assembler.py +49 -19
biblicus/context_engine/retrieval.py +46 -42
biblicus/corpus.py +116 -108
biblicus/errors.py +3 -3
biblicus/evaluation.py +27 -25
biblicus/extraction.py +103 -98
biblicus/extraction_evaluation.py +26 -26
biblicus/extractors/deepgram_stt.py +7 -7
biblicus/extractors/docling_granite_text.py +11 -11
biblicus/extractors/docling_smol_text.py +11 -11
biblicus/extractors/markitdown_text.py +4 -4
biblicus/extractors/openai_stt.py +7 -7
biblicus/extractors/paddleocr_vl_text.py +20 -18
biblicus/extractors/pipeline.py +8 -8
biblicus/extractors/rapidocr_text.py +3 -3
biblicus/extractors/unstructured_text.py +3 -3
biblicus/hooks.py +4 -4
biblicus/knowledge_base.py +33 -31
biblicus/models.py +78 -78
biblicus/retrieval.py +47 -40
biblicus/retrievers/__init__.py +50 -0
biblicus/retrievers/base.py +65 -0
biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
biblicus/retrievers/hybrid.py +301 -0
biblicus/{backends → retrievers}/scan.py +83 -73
biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
biblicus/{backends → retrievers}/tf_vector.py +87 -77
biblicus/text/prompts.py +16 -8
biblicus/text/tool_loop.py +63 -5
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
biblicus-1.1.0.dist-info/RECORD +91 -0
biblicus/backends/__init__.py +0 -50
biblicus/backends/base.py +0 -65
biblicus/backends/hybrid.py +0 -292
biblicus-1.0.0.dist-info/RECORD +0 -91
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0

biblicus/{backends → retrievers}/embedding_index_common.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-Shared primitives for embedding-index retrieval backends.
+Shared primitives for embedding-index retrievers.
 """
 from __future__ import annotations
@@ -12,10 +12,11 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 from ..chunking import ChunkerConfig, TextChunk, TokenizerConfig
-from ..corpus import CORPUS_DIR_NAME, RUNS_DIR_NAME, Corpus
+from ..constants import CORPUS_DIR_NAME, SNAPSHOTS_DIR_NAME
+from ..corpus import Corpus
 from ..embedding_providers import EmbeddingProviderConfig, _l2_normalize_rows
 from ..frontmatter import parse_front_matter
-from ..models import ExtractionRunReference, parse_extraction_run_reference
+from ..models import ExtractionSnapshotReference, parse_extraction_snapshot_reference
 class ChunkRecord(BaseModel):
@@ -43,12 +44,12 @@ class ChunkRecord(BaseModel):
         return self
-class EmbeddingIndexRecipeConfig(BaseModel):
+class EmbeddingIndexConfiguration(BaseModel):
     """
-    Configuration for embedding-index retrieval backends.
+    Configuration for embedding-index retrievers.
-    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
-    :vartype extraction_run: str or None
+    :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
+    :vartype extraction_snapshot: str or None
     :ivar chunker: Chunker configuration.
     :vartype chunker: biblicus.chunking.ChunkerConfig
     :ivar tokenizer: Optional tokenizer configuration.
@@ -68,7 +69,7 @@ class EmbeddingIndexRecipeConfig(BaseModel):
     snippet_characters: Optional[int] = Field(default=None, ge=1)
     maximum_cache_total_items: Optional[int] = Field(default=None, ge=1)
     maximum_cache_total_characters: Optional[int] = Field(default=None, ge=1)
-    extraction_run: Optional[str] = None
+    extraction_snapshot: Optional[str] = None
     chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
     tokenizer: Optional[TokenizerConfig] = None
     embedding_provider: EmbeddingProviderConfig
@@ -102,28 +103,28 @@ def _build_snippet(
 def resolve_extraction_reference(
-    corpus: Corpus, recipe_config: EmbeddingIndexRecipeConfig
-) -> Optional[ExtractionRunReference]:
+    corpus: Corpus, configuration: EmbeddingIndexConfiguration
+) -> Optional[ExtractionSnapshotReference]:
     """
-    Resolve an extraction run reference from an embedding-index recipe config.
+    Resolve an extraction snapshot reference from an embedding-index configuration.
-    :param corpus: Corpus associated with the recipe.
+    :param corpus: Corpus associated with the configuration.
     :type corpus: Corpus
-    :param recipe_config: Parsed embedding-index recipe configuration.
-    :type recipe_config: EmbeddingIndexRecipeConfig
+    :param configuration: Parsed embedding-index configuration.
+    :type configuration: EmbeddingIndexConfiguration
     :return: Parsed extraction reference or None.
-    :rtype: ExtractionRunReference or None
-    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    :rtype: ExtractionSnapshotReference or None
+    :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
     """
-    if not recipe_config.extraction_run:
+    if not configuration.extraction_snapshot:
         return None
-    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
-    run_dir = corpus.extraction_run_dir(
+    extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
+    snapshot_dir = corpus.extraction_snapshot_dir(
         extractor_id=extraction_reference.extractor_id,
-        run_id=extraction_reference.run_id,
+        snapshot_id=extraction_reference.snapshot_id,
     )
-    if not run_dir.is_dir():
-        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    if not snapshot_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
     return extraction_reference
@@ -133,12 +134,12 @@ def _load_text_from_item(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     if extraction_reference:
         extracted_text = corpus.read_extracted_text(
             extractor_id=extraction_reference.extractor_id,
-            run_id=extraction_reference.run_id,
+            snapshot_id=extraction_reference.snapshot_id,
             item_id=item_id,
         )
         if isinstance(extracted_text, str):
@@ -153,7 +154,7 @@ def _load_text_from_item(
 def iter_text_payloads(
-    corpus: Corpus, *, extraction_reference: Optional[ExtractionRunReference]
+    corpus: Corpus, *, extraction_reference: Optional[ExtractionSnapshotReference]
 ) -> Iterator[Tuple[object, str]]:
     """
     Yield catalog items and their text payloads.
@@ -161,7 +162,7 @@ def iter_text_payloads(
     :param corpus: Corpus containing the items.
     :type corpus: Corpus
     :param extraction_reference: Optional extraction reference.
-    :type extraction_reference: ExtractionRunReference or None
+    :type extraction_reference: ExtractionSnapshotReference or None
     :yield: (catalog_item, text) pairs.
     :rtype: Iterator[tuple[object, str]]
     """
@@ -185,21 +186,21 @@ def iter_text_payloads(
 def collect_chunks(
-    corpus: Corpus, *, recipe_config: EmbeddingIndexRecipeConfig
+    corpus: Corpus, *, configuration: EmbeddingIndexConfiguration
 ) -> Tuple[List[TextChunk], int]:
     """
     Collect chunks from text payloads in a corpus.
     :param corpus: Corpus to chunk.
     :type corpus: Corpus
-    :param recipe_config: Parsed embedding-index recipe configuration.
-    :type recipe_config: EmbeddingIndexRecipeConfig
+    :param configuration: Parsed embedding-index configuration.
+    :type configuration: EmbeddingIndexConfiguration
     :return: (chunks, text_item_count)
     :rtype: tuple[list[TextChunk], int]
     """
-    tokenizer = recipe_config.tokenizer.build_tokenizer() if recipe_config.tokenizer else None
-    chunker = recipe_config.chunker.build_chunker(tokenizer=tokenizer)
-    extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+    tokenizer = configuration.tokenizer.build_tokenizer() if configuration.tokenizer else None
+    chunker = configuration.chunker.build_chunker(tokenizer=tokenizer)
+    extraction_reference = resolve_extraction_reference(corpus, configuration)
     chunks: List[TextChunk] = []
     next_chunk_id = 0
@@ -317,18 +318,20 @@ def cosine_similarity_scores(embeddings: np.ndarray, query_vector: np.ndarray) -
     return embeddings @ query_vector
-def artifact_paths_for_run(*, run_id: str, backend_id: str) -> Dict[str, str]:
+def artifact_paths_for_snapshot(*, snapshot_id: str, retriever_id: str) -> Dict[str, str]:
     """
-    Build deterministic artifact relative paths for an embedding index run.
+    Build deterministic artifact relative paths for an embedding index snapshot.
-    :param run_id: Run identifier.
-    :type run_id: str
-    :param backend_id: Backend identifier.
-    :type backend_id: str
+    :param snapshot_id: Snapshot identifier.
+    :type snapshot_id: str
+    :param retriever_id: Retriever identifier.
+    :type retriever_id: str
     :return: Mapping with keys embeddings and chunks.
     :rtype: dict[str, str]
     """
-    prefix = f"{run_id}.{backend_id}"
-    embeddings_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.embeddings.npy")
-    chunks_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.chunks.jsonl")
+    prefix = f"{snapshot_id}.{retriever_id}"
+    embeddings_relpath = str(
+        Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.embeddings.npy"
+    )
+    chunks_relpath = str(Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.chunks.jsonl")
     return {"embeddings": embeddings_relpath, "chunks": chunks_relpath}

biblicus/{backends → retrievers}/embedding_index_file.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-Embedding-index retrieval backend that reads the embedding matrix via memory mapping.
+Embedding-index retriever that reads the embedding matrix via memory mapping.
 """
 from __future__ import annotations
@@ -10,15 +10,26 @@ from typing import Dict, List, Optional
 import numpy as np
 from ..corpus import Corpus
-from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
-from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..models import (
+    Evidence,
+    ExtractionSnapshotReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalSnapshot,
+)
+from ..retrieval import (
+    apply_budget,
+    create_configuration_manifest,
+    create_snapshot_manifest,
+    hash_text,
+)
 from ..time import utc_now_iso
 from .embedding_index_common import (
     ChunkRecord,
-    EmbeddingIndexRecipeConfig,
+    EmbeddingIndexConfiguration,
     _build_snippet,
     _extract_span_text,
-    artifact_paths_for_run,
+    artifact_paths_for_snapshot,
     chunks_to_records,
     collect_chunks,
     cosine_similarity_scores,
@@ -30,45 +41,52 @@ from .embedding_index_common import (
 )
-class EmbeddingIndexFileBackend:
+class EmbeddingIndexFileRetriever:
     """
-    Embedding retrieval backend using memory-mapped similarity scanning.
+    Embedding retrieval retriever using memory-mapped similarity scanning.
     """
-    backend_id = "embedding-index-file"
+    retriever_id = "embedding-index-file"
-    def build_run(
-        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
-    ) -> RetrievalRun:
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
         """
-        Build an embedding index run by chunking text payloads and materializing embeddings.
+        Build an embedding index snapshot by chunking text payloads and materializing embeddings.
         :param corpus: Corpus to build against.
         :type corpus: Corpus
-        :param recipe_name: Human-readable recipe name.
-        :type recipe_name: str
-        :param config: Backend-specific configuration values.
-        :type config: dict[str, object]
-        :return: Run manifest describing the build.
-        :rtype: biblicus.models.RetrievalRun
+        :param configuration_name: Human-readable configuration name.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: biblicus.models.RetrievalSnapshot
         """
-        recipe_config = EmbeddingIndexRecipeConfig.model_validate(config)
-        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
+        parsed_config = EmbeddingIndexConfiguration.model_validate(configuration)
+        chunks, text_items = collect_chunks(corpus, configuration=parsed_config)
-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)
-        recipe = create_recipe_manifest(
-            backend_id=self.backend_id,
-            name=recipe_name,
-            config=recipe_config.model_dump(),
+        configuration_manifest = create_configuration_manifest(
+            retriever_id=self.retriever_id,
+            name=configuration_name,
+            configuration=parsed_config.model_dump(),
+        )
+        snapshot = create_snapshot_manifest(
+            corpus,
+            configuration=configuration_manifest,
+            stats={},
+            snapshot_artifacts=[],
         )
-        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
-        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+        corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
         write_embeddings(embeddings_path, embeddings)
         write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
@@ -80,30 +98,33 @@ class EmbeddingIndexFileBackend:
             "dimensions": (
                 int(embeddings.shape[1])
                 if embeddings.size
-                else recipe_config.embedding_provider.dimensions
+                else parsed_config.embedding_provider.dimensions
             ),
         }
-        run = run.model_copy(
-            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
+        snapshot = snapshot.model_copy(
+            update={
+                "snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
+                "stats": stats,
+            }
         )
-        corpus.write_run(run)
-        return run
+        corpus.write_snapshot(snapshot)
+        return snapshot
     def query(
         self,
         corpus: Corpus,
         *,
-        run: RetrievalRun,
+        snapshot: RetrievalSnapshot,
         query_text: str,
         budget: QueryBudget,
     ) -> RetrievalResult:
         """
-        Query an embedding index run and return ranked evidence.
+        Query an embedding index snapshot and return ranked evidence.
-        :param corpus: Corpus associated with the run.
+        :param corpus: Corpus associated with the snapshot.
         :type corpus: Corpus
-        :param run: Run manifest to use for querying.
-        :type run: biblicus.models.RetrievalRun
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: biblicus.models.RetrievalSnapshot
         :param query_text: Query text to embed.
         :type query_text: str
         :param budget: Evidence selection budget.
@@ -111,14 +132,18 @@ class EmbeddingIndexFileBackend:
         :return: Retrieval results containing evidence.
         :rtype: biblicus.models.RetrievalResult
         """
-        recipe_config = EmbeddingIndexRecipeConfig.model_validate(run.recipe.config)
-        extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+        parsed_config = EmbeddingIndexConfiguration.model_validate(
+            snapshot.configuration.configuration
+        )
+        extraction_reference = resolve_extraction_reference(corpus, parsed_config)
-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
         if not embeddings_path.is_file() or not chunks_path.is_file():
-            raise FileNotFoundError("Embedding index artifacts are missing for this run")
+            raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")
         embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
         chunk_records = read_chunks_jsonl(chunks_path)
@@ -128,12 +153,12 @@ class EmbeddingIndexFileBackend:
                 "embeddings row count does not match chunk record count"
             )
-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         query_embedding = provider.embed_texts([query_text]).astype(np.float32)
         if query_embedding.shape[0] != 1:
             raise ValueError("Embedding provider returned an invalid query embedding shape")
-        batch_rows = recipe_config.maximum_cache_total_items or 4096
+        batch_rows = parsed_config.maximum_cache_total_items or 4096
         candidates = _top_indices_batched(
             embeddings=embeddings,
             query_vector=query_embedding[0],
@@ -142,8 +167,8 @@ class EmbeddingIndexFileBackend:
         )
         evidence_items = _build_evidence(
             corpus,
-            run=run,
-            recipe_config=recipe_config,
+            snapshot=snapshot,
+            configuration=parsed_config,
             candidates=candidates,
             embeddings=embeddings,
             query_vector=query_embedding[0],
@@ -152,7 +177,11 @@ class EmbeddingIndexFileBackend:
         )
         ranked = [
             item.model_copy(
-                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
+                update={
+                    "rank": index,
+                    "configuration_id": snapshot.configuration.configuration_id,
+                    "snapshot_id": snapshot.snapshot_id,
+                }
             )
             for index, item in enumerate(evidence_items, start=1)
         ]
@@ -160,9 +189,9 @@ class EmbeddingIndexFileBackend:
         return RetrievalResult(
             query_text=query_text,
             budget=budget,
-            run_id=run.run_id,
-            recipe_id=run.recipe.recipe_id,
-            backend_id=self.backend_id,
+            snapshot_id=snapshot.snapshot_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            retriever_id=snapshot.configuration.retriever_id,
             generated_at=utc_now_iso(),
             evidence=evidence,
             stats={"candidates": len(evidence_items), "returned": len(evidence)},
@@ -205,13 +234,13 @@ def _top_indices_batched(
 def _build_evidence(
     corpus: Corpus,
     *,
-    run: RetrievalRun,
-    recipe_config: EmbeddingIndexRecipeConfig,
+    snapshot: RetrievalSnapshot,
+    configuration: EmbeddingIndexConfiguration,
     candidates: List[int],
     embeddings: np.ndarray,
     query_vector: np.ndarray,
     chunk_records: List[ChunkRecord],
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> List[Evidence]:
     catalog = corpus.load_catalog()
     evidence_items: List[Evidence] = []
@@ -226,7 +255,7 @@ def _build_evidence(
             extraction_reference=extraction_reference,
         )
         span_text = _build_snippet(
-            text, (record.span_start, record.span_end), recipe_config.snippet_characters
+            text, (record.span_start, record.span_end), configuration.snippet_characters
         )
         if span_text is None:
             span_text = _extract_span_text(text, (record.span_start, record.span_end))
@@ -242,10 +271,10 @@ def _build_evidence(
                 content_ref=None,
                 span_start=record.span_start,
                 span_end=record.span_end,
-                stage=EmbeddingIndexFileBackend.backend_id,
+                stage=EmbeddingIndexFileRetriever.retriever_id,
                 stage_scores=None,
-                recipe_id=run.recipe.recipe_id,
-                run_id=run.run_id,
+                configuration_id=snapshot.configuration.configuration_id,
+                snapshot_id=snapshot.snapshot_id,
                 metadata=getattr(catalog_item, "metadata", {}) or {},
                 hash=hash_text(span_text or ""),
             )
@@ -259,7 +288,7 @@ def _load_text_for_evidence(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     from .embedding_index_common import _load_text_from_item

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl