PyPI - biblicus - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

biblicus/__init__.py +5 -5
biblicus/analysis/__init__.py +1 -1
biblicus/analysis/base.py +10 -10
biblicus/analysis/markov.py +78 -68
biblicus/analysis/models.py +47 -47
biblicus/analysis/profiling.py +58 -48
biblicus/analysis/topic_modeling.py +56 -51
biblicus/cli.py +224 -177
biblicus/{recipes.py → configuration.py} +14 -14
biblicus/constants.py +2 -2
biblicus/context_engine/assembler.py +49 -19
biblicus/context_engine/retrieval.py +46 -42
biblicus/corpus.py +116 -108
biblicus/errors.py +3 -3
biblicus/evaluation.py +27 -25
biblicus/extraction.py +103 -98
biblicus/extraction_evaluation.py +26 -26
biblicus/extractors/deepgram_stt.py +7 -7
biblicus/extractors/docling_granite_text.py +11 -11
biblicus/extractors/docling_smol_text.py +11 -11
biblicus/extractors/markitdown_text.py +4 -4
biblicus/extractors/openai_stt.py +7 -7
biblicus/extractors/paddleocr_vl_text.py +20 -18
biblicus/extractors/pipeline.py +8 -8
biblicus/extractors/rapidocr_text.py +3 -3
biblicus/extractors/unstructured_text.py +3 -3
biblicus/hooks.py +4 -4
biblicus/knowledge_base.py +33 -31
biblicus/models.py +78 -78
biblicus/retrieval.py +47 -40
biblicus/retrievers/__init__.py +50 -0
biblicus/retrievers/base.py +65 -0
biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
biblicus/retrievers/hybrid.py +301 -0
biblicus/{backends → retrievers}/scan.py +83 -73
biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
biblicus/{backends → retrievers}/tf_vector.py +87 -77
biblicus/text/prompts.py +16 -8
biblicus/text/tool_loop.py +63 -5
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
biblicus-1.1.0.dist-info/RECORD +91 -0
biblicus/backends/__init__.py +0 -50
biblicus/backends/base.py +0 -65
biblicus/backends/hybrid.py +0 -292
biblicus-1.0.0.dist-info/RECORD +0 -91
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0

biblicus/{backends → retrievers}/scan.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-Naive full-scan retrieval backend.
+Naive full-scan retriever.
 """
 from __future__ import annotations
@@ -12,87 +12,97 @@ from ..corpus import Corpus
 from ..frontmatter import parse_front_matter
 from ..models import (
     Evidence,
-    ExtractionRunReference,
+    ExtractionSnapshotReference,
     QueryBudget,
     RetrievalResult,
-    RetrievalRun,
-    parse_extraction_run_reference,
+    RetrievalSnapshot,
+    parse_extraction_snapshot_reference,
+)
+from ..retrieval import (
+    apply_budget,
+    create_configuration_manifest,
+    create_snapshot_manifest,
+    hash_text,
 )
-from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
 from ..time import utc_now_iso
-class ScanRecipeConfig(BaseModel):
+class ScanConfiguration(BaseModel):
     """
-    Configuration for the naive scan backend.
+    Configuration for the naive scan retriever.
     :ivar snippet_characters: Maximum characters to include in evidence snippets.
     :vartype snippet_characters: int
-    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
-    :vartype extraction_run: str or None
+    :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
+    :vartype extraction_snapshot: str or None
     """
     model_config = ConfigDict(extra="forbid")
     snippet_characters: int = Field(default=400, ge=1)
-    extraction_run: Optional[str] = None
+    extraction_snapshot: Optional[str] = None
-class ScanBackend:
+class ScanRetriever:
     """
-    Naive backend that scans all text items at query time.
+    Naive retriever that scans all text items at query time.
-    :ivar backend_id: Backend identifier.
-    :vartype backend_id: str
+    :ivar retriever_id: Retriever identifier.
+    :vartype retriever_id: str
     """
-    backend_id = "scan"
+    retriever_id = "scan"
-    def build_run(
-        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
-    ) -> RetrievalRun:
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
         """
-        Register a scan backend run (no materialization).
+        Register a scan retriever snapshot (no snapshot artifacts).
         :param corpus: Corpus to build against.
         :type corpus: Corpus
-        :param recipe_name: Human-readable recipe name.
-        :type recipe_name: str
-        :param config: Backend-specific configuration values.
-        :type config: dict[str, object]
-        :return: Run manifest describing the build.
-        :rtype: RetrievalRun
+        :param configuration_name: Human-readable configuration name.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: RetrievalSnapshot
         """
-        recipe_config = ScanRecipeConfig.model_validate(config)
+        parsed_config = ScanConfiguration.model_validate(configuration)
         catalog = corpus.load_catalog()
-        recipe = create_recipe_manifest(
-            backend_id=self.backend_id,
-            name=recipe_name,
-            config=recipe_config.model_dump(),
+        configuration_manifest = create_configuration_manifest(
+            retriever_id=self.retriever_id,
+            name=configuration_name,
+            configuration=parsed_config.model_dump(),
         )
         stats = {
             "items": len(catalog.items),
-            "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
+            "text_items": _count_text_items(corpus, catalog.items.values(), parsed_config),
         }
-        run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
-        corpus.write_run(run)
-        return run
+        snapshot = create_snapshot_manifest(
+            corpus,
+            configuration=configuration_manifest,
+            stats=stats,
+            snapshot_artifacts=[],
+        )
+        corpus.write_snapshot(snapshot)
+        return snapshot
     def query(
         self,
         corpus: Corpus,
         *,
-        run: RetrievalRun,
+        snapshot: RetrievalSnapshot,
         query_text: str,
         budget: QueryBudget,
     ) -> RetrievalResult:
         """
         Query the corpus with a full scan.
-        :param corpus: Corpus associated with the run.
+        :param corpus: Corpus associated with the snapshot.
         :type corpus: Corpus
-        :param run: Run manifest to use for querying.
-        :type run: RetrievalRun
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: RetrievalSnapshot
         :param query_text: Query text to execute.
         :type query_text: str
         :param budget: Evidence selection budget.
@@ -100,15 +110,15 @@ class ScanBackend:
         :return: Retrieval results containing evidence.
         :rtype: RetrievalResult
         """
-        recipe_config = ScanRecipeConfig.model_validate(run.recipe.config)
+        parsed_config = ScanConfiguration.model_validate(snapshot.configuration.configuration)
         catalog = corpus.load_catalog()
-        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+        extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
         query_tokens = _tokenize_query(query_text)
         scored_candidates = _score_items(
             corpus,
             catalog.items.values(),
             query_tokens,
-            recipe_config.snippet_characters,
+            parsed_config.snippet_characters,
             extraction_reference=extraction_reference,
         )
         sorted_candidates = sorted(
@@ -119,8 +129,8 @@ class ScanBackend:
             evidence_item.model_copy(
                 update={
                     "rank": index,
-                    "recipe_id": run.recipe.recipe_id,
-                    "run_id": run.run_id,
+                    "configuration_id": snapshot.configuration.configuration_id,
+                    "snapshot_id": snapshot.snapshot_id,
                 }
             )
             for index, evidence_item in enumerate(sorted_candidates, start=1)
@@ -130,9 +140,9 @@ class ScanBackend:
         return RetrievalResult(
             query_text=query_text,
             budget=budget,
-            run_id=run.run_id,
-            recipe_id=run.recipe.recipe_id,
-            backend_id=self.backend_id,
+            snapshot_id=snapshot.snapshot_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            retriever_id=snapshot.configuration.retriever_id,
             generated_at=utc_now_iso(),
             evidence=evidence,
             stats=stats,
@@ -140,56 +150,56 @@ class ScanBackend:
 def _resolve_extraction_reference(
-    corpus: Corpus, recipe_config: ScanRecipeConfig
-) -> Optional[ExtractionRunReference]:
+    corpus: Corpus, configuration: ScanConfiguration
+) -> Optional[ExtractionSnapshotReference]:
     """
-    Resolve an extraction run reference from a recipe config.
+    Resolve an extraction snapshot reference from a configuration.
-    :param corpus: Corpus associated with the recipe.
+    :param corpus: Corpus associated with the configuration.
     :type corpus: Corpus
-    :param recipe_config: Parsed scan recipe configuration.
-    :type recipe_config: ScanRecipeConfig
+    :param configuration: Parsed scan configuration.
+    :type configuration: ScanConfiguration
     :return: Parsed extraction reference or None.
-    :rtype: ExtractionRunReference or None
-    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    :rtype: ExtractionSnapshotReference or None
+    :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
     """
-    if not recipe_config.extraction_run:
+    if not configuration.extraction_snapshot:
         return None
-    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
-    run_dir = corpus.extraction_run_dir(
+    extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
+    snapshot_dir = corpus.extraction_snapshot_dir(
         extractor_id=extraction_reference.extractor_id,
-        run_id=extraction_reference.run_id,
+        snapshot_id=extraction_reference.snapshot_id,
     )
-    if not run_dir.is_dir():
-        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    if not snapshot_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
     return extraction_reference
 def _count_text_items(
-    corpus: Corpus, items: Iterable[object], recipe_config: ScanRecipeConfig
+    corpus: Corpus, items: Iterable[object], configuration: ScanConfiguration
 ) -> int:
     """
     Count catalog items that represent text content.
-    When an extraction run is configured, extracted artifacts are treated as text.
+    When an extraction snapshot is configured, extracted artifacts are treated as text.
     :param corpus: Corpus containing the items.
     :type corpus: Corpus
     :param items: Catalog items to inspect.
     :type items: Iterable[object]
-    :param recipe_config: Parsed scan recipe configuration.
-    :type recipe_config: ScanRecipeConfig
+    :param configuration: Parsed scan configuration.
+    :type configuration: ScanConfiguration
     :return: Number of text items.
     :rtype: int
     """
     text_item_count = 0
-    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+    extraction_reference = _resolve_extraction_reference(corpus, configuration)
     for catalog_item in items:
         item_id = str(getattr(catalog_item, "id", ""))
         if extraction_reference and item_id:
             extracted_text = corpus.read_extracted_text(
                 extractor_id=extraction_reference.extractor_id,
-                run_id=extraction_reference.run_id,
+                snapshot_id=extraction_reference.snapshot_id,
                 item_id=item_id,
             )
             if isinstance(extracted_text, str) and extracted_text.strip():
@@ -219,7 +229,7 @@ def _load_text_from_item(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     """
     Load a text payload from a catalog item.
@@ -232,15 +242,15 @@ def _load_text_from_item(
     :type relpath: str
     :param media_type: Media type for the stored content.
     :type media_type: str
-    :param extraction_reference: Optional extraction run reference.
-    :type extraction_reference: ExtractionRunReference or None
+    :param extraction_reference: Optional extraction snapshot reference.
+    :type extraction_reference: ExtractionSnapshotReference or None
     :return: Text payload or None if not decodable as text.
     :rtype: str or None
     """
     if extraction_reference:
         extracted_text = corpus.read_extracted_text(
             extractor_id=extraction_reference.extractor_id,
-            run_id=extraction_reference.run_id,
+            snapshot_id=extraction_reference.snapshot_id,
             item_id=item_id,
         )
         if isinstance(extracted_text, str) and extracted_text.strip():
@@ -316,7 +326,7 @@ def _score_items(
     tokens: List[str],
     snippet_characters: int,
     *,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> List[Evidence]:
     """
     Score catalog items by token frequency and return evidence candidates.
@@ -366,8 +376,8 @@ def _score_items(
                 span_start=span_start,
                 span_end=span_end,
                 stage="scan",
-                recipe_id="",
-                run_id="",
+                configuration_id="",
+                snapshot_id="",
                 metadata=getattr(catalog_item, "metadata", {}) or {},
                 hash=hash_text(snippet),
             )

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl