PyPI - biblicus - Versions diffs - 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

biblicus/__init__.py +21 -1
biblicus/analysis/markov.py +35 -3
biblicus/backends/__init__.py +6 -2
biblicus/backends/embedding_index_common.py +334 -0
biblicus/backends/embedding_index_file.py +272 -0
biblicus/backends/embedding_index_inmemory.py +270 -0
biblicus/backends/hybrid.py +8 -5
biblicus/backends/scan.py +1 -0
biblicus/backends/sqlite_full_text_search.py +1 -1
biblicus/backends/{vector.py → tf_vector.py} +28 -35
biblicus/chunking.py +396 -0
biblicus/cli.py +75 -25
biblicus/context.py +27 -12
biblicus/context_engine/__init__.py +53 -0
biblicus/context_engine/assembler.py +1060 -0
biblicus/context_engine/compaction.py +110 -0
biblicus/context_engine/models.py +423 -0
biblicus/context_engine/retrieval.py +129 -0
biblicus/corpus.py +117 -16
biblicus/embedding_providers.py +122 -0
biblicus/errors.py +24 -0
biblicus/frontmatter.py +2 -0
biblicus/knowledge_base.py +1 -1
biblicus/models.py +15 -3
biblicus/retrieval.py +7 -2
biblicus/sources.py +46 -11
biblicus/text/link.py +6 -0
biblicus/text/prompts.py +2 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0

biblicus/corpus.py CHANGED Viewed

@@ -11,6 +11,7 @@ import shutil
 import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence
+from urllib.parse import quote, unquote, urlparse
 import yaml
 from pydantic import ValidationError
@@ -24,6 +25,7 @@ from .constants import (
     SCHEMA_VERSION,
     SIDECAR_SUFFIX,
 )
+from .errors import IngestCollisionError
 from .frontmatter import parse_front_matter, render_front_matter
 from .hook_manager import HookManager
 from .hooks import HookPoint
@@ -110,7 +112,10 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
     """
     media_type_overrides = {
         "image/jpeg": ".jpg",
+        "audio/mpeg": ".mp3",
         "audio/ogg": ".ogg",
+        "audio/wav": ".wav",
+        "audio/x-wav": ".wav",
     }
     if media_type in media_type_overrides:
         return media_type_overrides[media_type]
@@ -136,7 +141,16 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
         return raw_name + ".md"
     if Path(raw_name).suffix:
-        return raw_name
+        if "%2F" in raw_name or "%3A" in raw_name:
+            decoded = unquote(raw_name)
+            parsed = urlparse(decoded)
+            decoded_path = parsed.path if parsed.scheme else decoded
+            if not Path(decoded_path).suffix:
+                pass
+            else:
+                return raw_name
+        else:
+            return raw_name
     ext = _preferred_extension_for_media_type(media_type)
     if not ext:
@@ -144,6 +158,55 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
     return raw_name + ext
+def _encode_source_uri_for_filename(source_uri: str) -> str:
+    """
+    Percent-encode a source uniform resource identifier for filename use.
+    :param source_uri: Source uniform resource identifier to encode.
+    :type source_uri: str
+    :return: Percent-encoded uniform resource identifier safe for filenames.
+    :rtype: str
+    """
+    return quote(source_uri, safe="")
+def _storage_filename_for_ingest(
+    *, filename: Optional[str], media_type: str, source_uri: Optional[str]
+) -> str:
+    """
+    Derive a collision-safe filename for corpus storage.
+    If a source uniform resource identifier is provided, the full uniform resource identifier is
+    percent-encoded to namespace the stored file, preventing collisions between identical basenames
+    from different sources. When no uniform resource identifier is available, fall back to a
+    sanitized filename.
+    :param filename: Optional filename hint from the caller.
+    :type filename: str or None
+    :param media_type: Media type of the payload.
+    :type media_type: str
+    :param source_uri: Optional source uniform resource identifier for provenance.
+    :type source_uri: str or None
+    :return: Storage filename with an appropriate extension, or an empty string when no hint exists.
+    :rtype: str
+    """
+    base_name = ""
+    if source_uri:
+        base_name = _encode_source_uri_for_filename(source_uri)
+        if filename and not source_uri.startswith("file:"):
+            sanitized = _sanitize_filename(filename)
+            if sanitized:
+                base_name = f"{base_name}--{sanitized}"
+    if not base_name and filename:
+        base_name = _sanitize_filename(filename)
+    if not base_name:
+        return ""
+    if len(base_name) > 180:
+        digest = hashlib.sha256(base_name.encode("utf-8")).hexdigest()
+        base_name = f"hash-{digest}"
+    return _ensure_filename_extension(base_name, media_type=media_type)
 def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
     """
     Merge tags from explicit input and front matter values.
@@ -520,6 +583,24 @@ class Corpus:
         temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
         temp_path.replace(self.catalog_path)
+    def _find_item_by_source_uri(self, source_uri: str) -> Optional[CatalogItem]:
+        """
+        Locate an existing catalog item by source uniform resource identifier.
+        :param source_uri: Source uniform resource identifier to search for.
+        :type source_uri: str
+        :return: Matching catalog item or None.
+        :rtype: CatalogItem or None
+        """
+        if not source_uri:
+            return None
+        self._init_catalog()
+        catalog = self._load_catalog()
+        for item in catalog.items.values():
+            if item.source_uri == source_uri:
+                return item
+        return None
     @property
     def runs_dir(self) -> Path:
         """
@@ -817,18 +898,26 @@ class Corpus:
         :return: Ingestion result summary.
         :rtype: IngestResult
         :raises ValueError: If markdown is not Unicode Transformation Format 8.
+        :raises IngestCollisionError: If a source uniform resource identifier is already ingested.
         """
-        item_id = str(uuid.uuid4())
-        safe_filename = _sanitize_filename(filename) if filename else ""
+        existing_item = self._find_item_by_source_uri(source_uri)
+        if existing_item is not None:
+            raise IngestCollisionError(
+                source_uri=source_uri,
+                existing_item_id=existing_item.id,
+                existing_relpath=existing_item.relpath,
+            )
-        if safe_filename:
-            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+        item_id = str(uuid.uuid4())
+        storage_filename = _storage_filename_for_ingest(
+            filename=filename, media_type=media_type, source_uri=source_uri
+        )
         if media_type == "text/markdown":
-            output_name = f"{item_id}--{safe_filename}" if safe_filename else f"{item_id}.md"
+            output_name = f"{item_id}--{storage_filename}" if storage_filename else f"{item_id}.md"
         else:
-            if safe_filename:
-                output_name = f"{item_id}--{safe_filename}"
+            if storage_filename:
+                output_name = f"{item_id}--{storage_filename}"
             else:
                 extension = _preferred_extension_for_media_type(media_type) or ""
                 output_name = f"{item_id}{extension}" if extension else f"{item_id}"
@@ -991,13 +1080,21 @@ class Corpus:
         if media_type == "text/markdown":
             raise ValueError("Stream ingestion is not supported for Markdown")
+        existing_item = self._find_item_by_source_uri(source_uri)
+        if existing_item is not None:
+            raise IngestCollisionError(
+                source_uri=source_uri,
+                existing_item_id=existing_item.id,
+                existing_relpath=existing_item.relpath,
+            )
         item_id = str(uuid.uuid4())
-        safe_filename = _sanitize_filename(filename) if filename else ""
-        if safe_filename:
-            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)
+        storage_filename = _storage_filename_for_ingest(
+            filename=filename, media_type=media_type, source_uri=source_uri
+        )
-        if safe_filename:
-            output_name = f"{item_id}--{safe_filename}"
+        if storage_filename:
+            output_name = f"{item_id}--{storage_filename}"
         else:
             extension = _preferred_extension_for_media_type(media_type) or ""
             output_name = f"{item_id}{extension}" if extension else f"{item_id}"
@@ -1085,7 +1182,7 @@ class Corpus:
         *,
         title: Optional[str] = None,
         tags: Sequence[str] = (),
-        source_uri: str = "text",
+        source_uri: Optional[str] = None,
     ) -> IngestResult:
         """
         Ingest a text note as Markdown.
@@ -1096,11 +1193,15 @@ class Corpus:
         :type title: str or None
         :param tags: Tags to associate with the note.
         :type tags: Sequence[str]
-        :param source_uri: Source uniform resource identifier for provenance.
-        :type source_uri: str
+        :param source_uri: Optional source uniform resource identifier for provenance.
+        :type source_uri: str or None
         :return: Ingestion result summary.
         :rtype: IngestResult
         """
+        if source_uri is None:
+            digest_source = (title or "") + "\n" + text
+            digest = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()
+            source_uri = f"text:{digest}"
         data = text.encode("utf-8")
         return self.ingest_item(
             data,

biblicus/embedding_providers.py ADDED Viewed

@@ -0,0 +1,122 @@
+"""
+Embedding provider interfaces for retrieval backends.
+"""
+from __future__ import annotations
+import hashlib
+from abc import ABC, abstractmethod
+from typing import Optional, Sequence
+import numpy as np
+from pydantic import BaseModel, ConfigDict, Field
+class EmbeddingProvider(ABC):
+    """
+    Interface for producing dense embedding vectors from text.
+    :ivar provider_id: Provider identifier.
+    :vartype provider_id: str
+    """
+    provider_id: str
+    @abstractmethod
+    def embed_texts(self, texts: Sequence[str]) -> np.ndarray:
+        """
+        Embed a batch of texts.
+        :param texts: Text inputs.
+        :type texts: Sequence[str]
+        :return: 2D float array with shape (len(texts), dimensions).
+        :rtype: numpy.ndarray
+        """
+        raise NotImplementedError
+def _l2_normalize_rows(matrix: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
+    norms = np.where(norms == 0, 1.0, norms)
+    return matrix / norms
+class HashEmbeddingProvider(EmbeddingProvider):
+    """
+    Deterministic embedding provider for tests and demos.
+    The output vectors are stable across runs and require no external services.
+    """
+    provider_id = "hash-embedding"
+    def __init__(self, *, dimensions: int, seed: str = "biblicus") -> None:
+        self._dimensions = int(dimensions)
+        self._seed = str(seed)
+        if self._dimensions <= 0:
+            raise ValueError("dimensions must be greater than 0")
+    def embed_texts(self, texts: Sequence[str]) -> np.ndarray:
+        """
+        Embed a batch of texts deterministically.
+        :param texts: Text inputs.
+        :type texts: Sequence[str]
+        :return: Normalized embedding matrix.
+        :rtype: numpy.ndarray
+        """
+        items = list(texts)
+        if not items:
+            return np.zeros((0, self._dimensions), dtype=np.float32)
+        vectors = np.zeros((len(items), self._dimensions), dtype=np.float32)
+        for row_index, text in enumerate(items):
+            vectors[row_index] = self._hash_to_vector(text)
+        return _l2_normalize_rows(vectors)
+    def _hash_to_vector(self, text: str) -> np.ndarray:
+        output = np.empty((self._dimensions,), dtype=np.float32)
+        remaining = self._dimensions
+        offset = 0
+        counter = 0
+        while remaining > 0:
+            digest = hashlib.sha256(f"{self._seed}:{counter}:{text}".encode("utf-8")).digest()
+            raw = np.frombuffer(digest, dtype=np.uint8).astype(np.float32)
+            raw = (raw / 255.0) * 2.0 - 1.0
+            take = min(remaining, raw.shape[0])
+            output[offset : offset + take] = raw[:take]
+            remaining -= take
+            offset += take
+            counter += 1
+        return output
+class EmbeddingProviderConfig(BaseModel):
+    """
+    Configuration for embedding provider selection.
+    :ivar provider_id: Provider identifier.
+    :vartype provider_id: str
+    :ivar dimensions: Dimensionality of produced vectors.
+    :vartype dimensions: int
+    :ivar seed: Optional deterministic seed for test providers.
+    :vartype seed: str or None
+    """
+    model_config = ConfigDict(extra="forbid")
+    provider_id: str = Field(min_length=1)
+    dimensions: int = Field(ge=1)
+    seed: Optional[str] = None
+    def build_provider(self) -> EmbeddingProvider:
+        """
+        Build an embedding provider instance from this configuration.
+        :return: Embedding provider instance.
+        :rtype: EmbeddingProvider
+        :raises ValueError: If the provider identifier is unknown.
+        """
+        if self.provider_id == HashEmbeddingProvider.provider_id:
+            return HashEmbeddingProvider(dimensions=self.dimensions, seed=self.seed or "biblicus")
+        raise ValueError(f"Unknown embedding provider_id: {self.provider_id!r}")

biblicus/errors.py CHANGED Viewed

@@ -13,3 +13,27 @@ class ExtractionRunFatalError(RuntimeError):
     rather than a per-item extraction failure. For example, a selection extractor that depends
     on referenced extraction run manifests treats missing manifests as fatal.
     """
+class IngestCollisionError(RuntimeError):
+    """
+    Ingest collision for an already ingested source.
+    :param source_uri: Source uniform resource identifier that caused the collision.
+    :type source_uri: str
+    :param existing_item_id: Identifier of the existing catalog item.
+    :type existing_item_id: str
+    :param existing_relpath: Raw storage relpath of the existing item.
+    :type existing_relpath: str
+    """
+    def __init__(self, *, source_uri: str, existing_item_id: str, existing_relpath: str) -> None:
+        self.source_uri = source_uri
+        self.existing_item_id = existing_item_id
+        self.existing_relpath = existing_relpath
+        message = (
+            "Source already ingested"
+            f": source_uri={source_uri} existing_item_id={existing_item_id}"
+            f" existing_relpath={existing_relpath}"
+        )
+        super().__init__(message)

biblicus/frontmatter.py CHANGED Viewed

@@ -44,6 +44,8 @@ def parse_front_matter(text: str) -> FrontMatterDocument:
     raw_yaml = text[4:front_matter_end]
     body = text[front_matter_end + len("\n---\n") :]
+    if body.startswith("\n"):
+        body = body[1:]
     metadata = yaml.safe_load(raw_yaml) or {}
     if not isinstance(metadata, dict):

biblicus/knowledge_base.py CHANGED Viewed

@@ -44,7 +44,7 @@ class KnowledgeBaseDefaults(BaseModel):
     query_budget: QueryBudget = Field(
         default_factory=lambda: QueryBudget(
             max_total_items=5,
-            max_total_characters=2000,
+            maximum_total_characters=2000,
             max_items_per_source=None,
         )
     )

biblicus/models.py CHANGED Viewed

@@ -224,10 +224,18 @@ class QueryBudget(BaseModel):
     """
     Evidence selection budget for retrieval.
+    The budget constrains the *returned* evidence. It intentionally does not
+    change how a backend scores candidates, only how many evidence items are
+    selected and how much text is allowed through.
     :ivar max_total_items: Maximum number of evidence items to return.
     :vartype max_total_items: int
-    :ivar max_total_characters: Optional maximum total characters across evidence text.
-    :vartype max_total_characters: int or None
+    :ivar offset: Number of ranked candidates to skip before selecting evidence.
+        This enables simple pagination by re-running the same query with a
+        higher offset.
+    :vartype offset: int
+    :ivar maximum_total_characters: Optional maximum total characters across evidence text.
+    :vartype maximum_total_characters: int or None
     :ivar max_items_per_source: Optional cap per source uniform resource identifier.
     :vartype max_items_per_source: int or None
     """
@@ -235,7 +243,8 @@ class QueryBudget(BaseModel):
     model_config = ConfigDict(extra="forbid")
     max_total_items: int = Field(ge=1)
-    max_total_characters: Optional[int] = Field(default=None, ge=1)
+    offset: int = Field(default=0, ge=0)
+    maximum_total_characters: Optional[int] = Field(default=None, ge=1)
     max_items_per_source: Optional[int] = Field(default=None, ge=1)
@@ -269,6 +278,8 @@ class Evidence(BaseModel):
     :vartype recipe_id: str
     :ivar run_id: Retrieval run identifier.
     :vartype run_id: str
+    :ivar metadata: Optional metadata payload from the catalog item.
+    :vartype metadata: dict[str, Any]
     :ivar hash: Optional content hash for provenance.
     :vartype hash: str or None
     """
@@ -288,6 +299,7 @@ class Evidence(BaseModel):
     stage_scores: Optional[Dict[str, float]] = None
     recipe_id: str
     run_id: str
+    metadata: Dict[str, Any] = Field(default_factory=dict)
     hash: Optional[str] = None
     @model_validator(mode="after")

biblicus/retrieval.py CHANGED Viewed

@@ -108,8 +108,13 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
     selected_evidence: List[Evidence] = []
     source_counts: Dict[str, int] = {}
     total_characters = 0
+    skipped = 0
     for candidate_evidence in evidence:
+        if skipped < budget.offset:
+            skipped += 1
+            continue
         if len(selected_evidence) >= budget.max_total_items:
             break
@@ -119,8 +124,8 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
                 continue
         text_character_count = len(candidate_evidence.text or "")
-        if budget.max_total_characters is not None:
-            if total_characters + text_character_count > budget.max_total_characters:
+        if budget.maximum_total_characters is not None:
+            if total_characters + text_character_count > budget.maximum_total_characters:
                 continue
         selected_evidence.append(candidate_evidence)

biblicus/sources.py CHANGED Viewed

@@ -8,7 +8,7 @@ import mimetypes
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
-from urllib.parse import unquote, urlparse
+from urllib.parse import quote, unquote, urlparse
 from urllib.request import Request, urlopen
@@ -37,6 +37,27 @@ def _filename_from_url_path(path: str) -> str:
     return filename or "download"
+def _sanitize_filename_component(name: str) -> str:
+    allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
+    sanitized_name = "".join(
+        (character if character in allowed_characters else "_") for character in name
+    ).strip()
+    return sanitized_name or "file"
+def _namespaced_filename(
+    *, source_uri: Optional[str], fallback_name: Optional[str], media_type: str
+) -> str:
+    base_name = ""
+    if source_uri:
+        base_name = quote(source_uri, safe="")
+    if not base_name and fallback_name:
+        base_name = _sanitize_filename_component(fallback_name)
+    if not base_name:
+        base_name = "file"
+    return _ensure_extension_for_media_type(base_name, media_type)
 def _media_type_from_filename(name: str) -> str:
     """
     Guess media type from a filename.
@@ -119,8 +140,16 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
     """
     if Path(filename).suffix:
         return filename
-    if media_type == "audio/ogg":
-        ext = ".ogg"
+    media_type_overrides = {
+        "audio/mpeg": ".mp3",
+        "audio/ogg": ".ogg",
+        "audio/wav": ".wav",
+        "audio/x-wav": ".wav",
+        "image/jpeg": ".jpg",
+        "text/html": ".html",
+    }
+    if media_type in media_type_overrides:
+        ext = media_type_overrides[media_type]
     else:
         ext = mimetypes.guess_extension(media_type) or ""
     return filename + ext if ext else filename
@@ -165,11 +194,12 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
         media_type = _media_type_from_filename(path.name)
         if path.suffix.lower() in {".md", ".markdown"}:
             media_type = "text/markdown"
+        resolved_source_uri = source_uri or path.as_uri()
         return SourcePayload(
             data=path.read_bytes(),
             filename=path.name,
             media_type=media_type,
-            source_uri=source_uri or path.as_uri(),
+            source_uri=resolved_source_uri,
         )
     if _looks_like_uri(source):
@@ -187,21 +217,26 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
             with urlopen(request, timeout=30) as response:
                 response_bytes = response.read()
                 content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
-                filename = _filename_from_url_path(parsed.path)
-                media_type = content_type or _media_type_from_filename(filename)
+                fallback_filename = _filename_from_url_path(parsed.path)
+                media_type = content_type or _media_type_from_filename(fallback_filename)
                 if media_type == "application/octet-stream":
                     sniffed = _sniff_media_type_from_bytes(response_bytes)
                     if sniffed:
                         media_type = sniffed
-                        filename = _ensure_extension_for_media_type(filename, media_type)
-                media_type = _normalize_media_type(filename=filename, media_type=media_type)
-                if Path(filename).suffix.lower() in {".md", ".markdown"}:
+                        fallback_filename = _ensure_extension_for_media_type(
+                            fallback_filename, media_type
+                        )
+                media_type = _normalize_media_type(
+                    filename=fallback_filename, media_type=media_type
+                )
+                if Path(fallback_filename).suffix.lower() in {".md", ".markdown"}:
                     media_type = "text/markdown"
+                resolved_source_uri = source_uri or source
                 return SourcePayload(
                     data=response_bytes,
-                    filename=filename,
+                    filename=fallback_filename,
                     media_type=media_type,
-                    source_uri=source_uri or source,
+                    source_uri=resolved_source_uri,
                 )
         raise NotImplementedError(

biblicus/text/link.py CHANGED Viewed

@@ -159,6 +159,8 @@ def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
 def _validate_replace_text(old_str: str, new_str: str) -> None:
+    if "<span" in old_str or "</span>" in old_str:
+        raise ValueError("Text link replacements must target plain text without span tags")
     if strip_span_tags(old_str) != strip_span_tags(new_str):
         raise ValueError("Text link replacements may only insert span tags")
@@ -460,12 +462,16 @@ def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: st
     error_lines = "\n".join(f"- {error}" for error in errors)
     context_section = build_span_context_section(current_text, errors)
     coverage_guidance = _build_coverage_guidance(errors)
+    nested_guidance = ""
+    if any("nested span" in error for error in errors):
+        nested_guidance = "Do not create nested or overlapping spans. Remove nested spans and wrap only bare text.\n"
     return (
         "Your last edit did not validate.\n"
         "Issues:\n"
         f"{error_lines}\n\n"
         f"{context_section}"
         f"{coverage_guidance}"
+        f"{nested_guidance}"
         "Please fix the markup using str_replace. Use id for first mentions and ref for repeats. "
         "Reuse the same id for identical names and do not assign multiple ids to the same name. "
         f"Ids must start with '{id_prefix}'. Try again.\n"

biblicus/text/prompts.py CHANGED Viewed

@@ -57,6 +57,8 @@ DEFAULT_ANNOTATE_SYSTEM_PROMPT = (
     "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
     "- Do not include <span or </span> inside old_str or new_str.\n"
     "- Do not insert nested spans.\n"
+    "- Do not wrap text that is already inside a span; spans must never overlap.\n"
+    "- If a name appears inside an existing span, leave it alone and wrap only bare text.\n"
     "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
     "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
     "- Do not delete, reorder, paraphrase, or label text beyond the span attributes.\n\n"

{biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.15.1
+Version: 1.0.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -11,6 +11,7 @@ Requires-Dist: PyYAML>=6.0
 Requires-Dist: pypdf>=4.0
 Requires-Dist: Jinja2>=3.1
 Requires-Dist: dotyaml>=0.1.3
+Requires-Dist: numpy>=1.24
 Provides-Extra: dev
 Requires-Dist: behave>=1.2.6; extra == "dev"
 Requires-Dist: coverage[toml]>=7.0; extra == "dev"
@@ -292,7 +293,7 @@ for note_title, note_text in notes:
 backend = get_backend("scan")
 run = backend.build_run(corpus, recipe_name="Story demo", config={})
-budget = QueryBudget(max_total_items=5, max_total_characters=2000, max_items_per_source=None)
+budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
 result = backend.query(
     corpus,
     run=run,
@@ -332,7 +333,7 @@ Example output:
   "query_text": "Primary button style preference",
   "budget": {
     "max_total_items": 5,
-    "max_total_characters": 2000,
+    "maximum_total_characters": 2000,
     "max_items_per_source": null
   },
   "run_id": "RUN_ID",

biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl