PyPI - biblicus - Versions diffs - 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

biblicus/__init__.py +21 -1
biblicus/analysis/markov.py +35 -3
biblicus/backends/__init__.py +6 -2
biblicus/backends/embedding_index_common.py +334 -0
biblicus/backends/embedding_index_file.py +272 -0
biblicus/backends/embedding_index_inmemory.py +270 -0
biblicus/backends/hybrid.py +8 -5
biblicus/backends/scan.py +1 -0
biblicus/backends/sqlite_full_text_search.py +1 -1
biblicus/backends/{vector.py → tf_vector.py} +28 -35
biblicus/chunking.py +396 -0
biblicus/cli.py +75 -25
biblicus/context.py +27 -12
biblicus/context_engine/__init__.py +53 -0
biblicus/context_engine/assembler.py +1060 -0
biblicus/context_engine/compaction.py +110 -0
biblicus/context_engine/models.py +423 -0
biblicus/context_engine/retrieval.py +129 -0
biblicus/corpus.py +117 -16
biblicus/embedding_providers.py +122 -0
biblicus/errors.py +24 -0
biblicus/frontmatter.py +2 -0
biblicus/knowledge_base.py +1 -1
biblicus/models.py +15 -3
biblicus/retrieval.py +7 -2
biblicus/sources.py +46 -11
biblicus/text/link.py +6 -0
biblicus/text/prompts.py +2 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0

biblicus/cli.py CHANGED Viewed

@@ -8,7 +8,7 @@ import argparse
 import json
 import sys
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional
 from pydantic import ValidationError
@@ -24,7 +24,7 @@ from .context import (
 )
 from .corpus import Corpus
 from .crawl import CrawlRequest, crawl_into_corpus
-from .errors import ExtractionRunFatalError
+from .errors import ExtractionRunFatalError, IngestCollisionError
 from .evaluation import evaluate_run, load_dataset
 from .evidence_processing import apply_evidence_filter, apply_evidence_reranker
 from .extraction import build_extraction_run
@@ -117,18 +117,28 @@ def cmd_ingest(arguments: argparse.Namespace) -> int:
     results = []
-    if arguments.note is not None or arguments.stdin:
-        text = arguments.note if arguments.note is not None else sys.stdin.read()
-        ingest_result = corpus.ingest_note(
-            text,
-            title=arguments.title,
-            tags=tags,
-            source_uri="stdin" if arguments.stdin else "text",
+    try:
+        if arguments.note is not None or arguments.stdin:
+            text = arguments.note if arguments.note is not None else sys.stdin.read()
+            ingest_result = corpus.ingest_note(
+                text,
+                title=arguments.title,
+                tags=tags,
+                source_uri=None if arguments.stdin else None,
+            )
+            results.append(ingest_result)
+        for source_path in arguments.files or []:
+            results.append(corpus.ingest_source(source_path, tags=tags))
+    except IngestCollisionError as error:
+        print(
+            "Ingest failed: source already ingested\n"
+            f"source_uri: {error.source_uri}\n"
+            f"existing_item_id: {error.existing_item_id}\n"
+            f"existing_relpath: {error.existing_relpath}",
+            file=sys.stderr,
         )
-        results.append(ingest_result)
-    for source_path in arguments.files or []:
-        results.append(corpus.ingest_source(source_path, tags=tags))
+        return 3
     if not results:
         print("Nothing to ingest: provide file paths, --note, or --stdin", file=sys.stderr)
@@ -239,15 +249,23 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
     return 0
-def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
+def _parse_config_pairs(pairs: Optional[Iterable[str]]) -> Dict[str, object]:
     """
-    Parse repeated key=value config pairs.
+    Parse key=value pairs into a configuration mapping.
-    :param pairs: Config pairs supplied via the command-line interface.
-    :type pairs: list[str] or None
-    :return: Parsed config mapping.
+    This is used by a few command-line options that accept repeated key=value items.
+    Values are coerced to useful types in a predictable way:
+    - JSON objects/arrays (leading ``{`` or ``[``) are parsed as JSON.
+    - Whole numbers are parsed as integers.
+    - Other numeric forms are parsed as floats.
+    - Everything else remains a string.
+    :param pairs: Iterable of key=value strings.
+    :type pairs: Iterable[str] or None
+    :return: Parsed configuration mapping.
     :rtype: dict[str, object]
-    :raises ValueError: If any entry is not key=value.
+    :raises ValueError: If any entry is not a key=value pair or values are invalid.
     """
     config: Dict[str, object] = {}
     for item in pairs or []:
@@ -257,8 +275,14 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
         key = key.strip()
         if not key:
             raise ValueError("Config keys must be non-empty")
+        raw = raw.strip()
         value: object = raw
-        if raw.isdigit():
+        if raw.startswith("{") or raw.startswith("["):
+            try:
+                value = json.loads(raw)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"Config value must be valid JSON for key {key!r}") from exc
+        elif raw.isdigit():
             value = int(raw)
         else:
             try:
@@ -359,7 +383,8 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
     """
     return QueryBudget(
         max_total_items=arguments.max_total_items,
-        max_total_characters=arguments.max_total_characters,
+        offset=getattr(arguments, "offset", 0),
+        maximum_total_characters=arguments.maximum_total_characters,
         max_items_per_source=arguments.max_items_per_source,
     )
@@ -373,13 +398,26 @@ def cmd_build(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
+    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
     backend = get_backend(arguments.backend)
-    config = _parse_config_pairs(arguments.config)
+    base_config: Dict[str, object] = {}
+    if getattr(arguments, "recipe", None):
+        base_config = load_recipe_view(
+            arguments.recipe,
+            recipe_label="Recipe file",
+            mapping_error_message="Retrieval build recipe must be a mapping/object",
+        )
+    overrides = parse_dotted_overrides(arguments.config)
+    config = apply_dotted_overrides(base_config, overrides)
     run = backend.build_run(corpus, recipe_name=arguments.recipe_name, config=config)
     print(run.model_dump_json(indent=2))
     return 0
@@ -947,11 +985,17 @@ def build_parser() -> argparse.ArgumentParser:
         help="Backend identifier (for example, scan, sqlite-full-text-search).",
     )
     p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
+    p_build.add_argument(
+        "--recipe",
+        default=None,
+        action="append",
+        help="Path to YAML recipe file (repeatable). If provided, recipes are composed in precedence order.",
+    )
     p_build.add_argument(
         "--config",
         action="append",
         default=None,
-        help="Backend config as key=value (repeatable).",
+        help="Backend config override as key=value (repeatable). Dotted keys create nested config mappings.",
     )
     p_build.set_defaults(func=cmd_build)
@@ -1030,8 +1074,14 @@ def build_parser() -> argparse.ArgumentParser:
     p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
     p_query.add_argument("--backend", default=None, help="Validate backend identifier.")
     p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
+    p_query.add_argument(
+        "--offset",
+        type=int,
+        default=0,
+        help="Skip this many ranked candidates before selecting evidence (pagination).",
+    )
     p_query.add_argument("--max-total-items", type=int, default=5)
-    p_query.add_argument("--max-total-characters", type=int, default=2000)
+    p_query.add_argument("--maximum-total-characters", type=int, default=2000)
     p_query.add_argument("--max-items-per-source", type=int, default=5)
     p_query.add_argument(
         "--reranker-id",
@@ -1091,7 +1141,7 @@ def build_parser() -> argparse.ArgumentParser:
         help="Path to dataset JavaScript Object Notation file.",
     )
     p_eval.add_argument("--max-total-items", type=int, default=5)
-    p_eval.add_argument("--max-total-characters", type=int, default=2000)
+    p_eval.add_argument("--maximum-total-characters", type=int, default=2000)
     p_eval.add_argument("--max-items-per-source", type=int, default=5)
     p_eval.set_defaults(func=cmd_eval)

biblicus/context.py CHANGED Viewed

@@ -25,6 +25,8 @@ class ContextPackPolicy(BaseModel):
     :vartype ordering: str
     :ivar include_metadata: Whether to include evidence metadata lines in each block.
     :vartype include_metadata: bool
+    :ivar metadata_fields: Optional evidence metadata fields to include.
+    :vartype metadata_fields: list[str] or None
     """
     model_config = ConfigDict(extra="forbid")
@@ -32,6 +34,7 @@ class ContextPackPolicy(BaseModel):
     join_with: str = Field(default="\n\n")
     ordering: str = Field(default="rank", min_length=1)
     include_metadata: bool = Field(default=False)
+    metadata_fields: Optional[List[str]] = None
 class ContextPack(BaseModel):
@@ -132,7 +135,9 @@ def build_context_pack(result: RetrievalResult, *, policy: ContextPackPolicy) ->
         trimmed_text = evidence.text.strip()
         if not trimmed_text:
             continue
-        metadata = _metadata_for_evidence(evidence) if policy.include_metadata else None
+        metadata = (
+            _metadata_for_evidence(evidence, policy=policy) if policy.include_metadata else None
+        )
         block_text = _format_block_text(trimmed_text, metadata=metadata)
         selected_blocks.append(
             ContextPackBlock(
@@ -276,7 +281,11 @@ def _order_evidence(
     raise ValueError(f"Unknown context pack ordering: {policy.ordering}")
-def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
+def _metadata_for_evidence(
+    evidence: Evidence,
+    *,
+    policy: ContextPackPolicy,
+) -> Dict[str, object]:
     """
     Build metadata for a context pack block.
@@ -285,12 +294,19 @@ def _metadata_for_evidence(evidence: Evidence) -> Dict[str, object]:
     :return: Metadata mapping.
     :rtype: dict[str, object]
     """
-    return {
+    metadata = {
         "item_id": evidence.item_id,
         "source_uri": evidence.source_uri or "none",
         "score": evidence.score,
         "stage": evidence.stage,
     }
+    extra = evidence.metadata or {}
+    if policy.metadata_fields is not None:
+        extra = {key: extra.get(key) for key in policy.metadata_fields if key in extra}
+    for key, value in extra.items():
+        if key not in metadata:
+            metadata[key] = value
+    return metadata
 def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> str:
@@ -306,12 +322,11 @@ def _format_block_text(text: str, *, metadata: Optional[Dict[str, object]]) -> s
     """
     if not metadata:
         return text
-    metadata_lines = "\n".join(
-        [
-            f"item_id: {metadata['item_id']}",
-            f"source_uri: {metadata['source_uri']}",
-            f"score: {metadata['score']}",
-            f"stage: {metadata['stage']}",
-        ]
-    )
-    return f"{metadata_lines}\n{text}"
+    ordered_keys = ["item_id", "source_uri", "score", "stage"]
+    metadata_lines = [f"{key}: {metadata[key]}" for key in ordered_keys if key in metadata]
+    for key in sorted(metadata.keys()):
+        if key in ordered_keys:
+            continue
+        metadata_lines.append(f"{key}: {metadata[key]}")
+    metadata_text = "\n".join(metadata_lines)
+    return f"{metadata_text}\n{text}"

biblicus/context_engine/__init__.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""
+Public interface for the Biblicus Context Engine.
+"""
+from .assembler import ContextAssembler, ContextAssemblyResult
+from .compaction import BaseCompactor, CompactionRequest, SummaryCompactor, TruncateCompactor
+from .models import (
+    AssistantMessageSpec,
+    CompactorDeclaration,
+    ContextBudgetSpec,
+    ContextDeclaration,
+    ContextExpansionSpec,
+    ContextInsertSpec,
+    ContextMessageSpec,
+    ContextPackBudgetSpec,
+    ContextPackSpec,
+    ContextPolicySpec,
+    ContextRetrieverRequest,
+    ContextTemplateSpec,
+    CorpusDeclaration,
+    HistoryInsertSpec,
+    RetrieverDeclaration,
+    SystemMessageSpec,
+    UserMessageSpec,
+)
+from .retrieval import retrieve_context_pack
+__all__ = [
+    "ContextAssembler",
+    "ContextAssemblyResult",
+    "BaseCompactor",
+    "CompactionRequest",
+    "SummaryCompactor",
+    "TruncateCompactor",
+    "ContextBudgetSpec",
+    "ContextDeclaration",
+    "ContextExpansionSpec",
+    "ContextInsertSpec",
+    "ContextMessageSpec",
+    "ContextPackBudgetSpec",
+    "ContextPackSpec",
+    "ContextPolicySpec",
+    "ContextRetrieverRequest",
+    "ContextTemplateSpec",
+    "CorpusDeclaration",
+    "RetrieverDeclaration",
+    "CompactorDeclaration",
+    "HistoryInsertSpec",
+    "SystemMessageSpec",
+    "UserMessageSpec",
+    "AssistantMessageSpec",
+    "retrieve_context_pack",
+]

biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl