PyPI - biblicus - Versions diffs - 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

biblicus 0.16.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

biblicus/__init__.py +25 -5
biblicus/analysis/__init__.py +1 -1
biblicus/analysis/base.py +10 -10
biblicus/analysis/markov.py +78 -68
biblicus/analysis/models.py +47 -47
biblicus/analysis/profiling.py +58 -48
biblicus/analysis/topic_modeling.py +56 -51
biblicus/cli.py +248 -191
biblicus/{recipes.py → configuration.py} +14 -14
biblicus/constants.py +2 -2
biblicus/context.py +27 -12
biblicus/context_engine/__init__.py +53 -0
biblicus/context_engine/assembler.py +1090 -0
biblicus/context_engine/compaction.py +110 -0
biblicus/context_engine/models.py +423 -0
biblicus/context_engine/retrieval.py +133 -0
biblicus/corpus.py +233 -124
biblicus/errors.py +27 -3
biblicus/evaluation.py +27 -25
biblicus/extraction.py +103 -98
biblicus/extraction_evaluation.py +26 -26
biblicus/extractors/deepgram_stt.py +7 -7
biblicus/extractors/docling_granite_text.py +11 -11
biblicus/extractors/docling_smol_text.py +11 -11
biblicus/extractors/markitdown_text.py +4 -4
biblicus/extractors/openai_stt.py +7 -7
biblicus/extractors/paddleocr_vl_text.py +20 -18
biblicus/extractors/pipeline.py +8 -8
biblicus/extractors/rapidocr_text.py +3 -3
biblicus/extractors/unstructured_text.py +3 -3
biblicus/hooks.py +4 -4
biblicus/knowledge_base.py +34 -32
biblicus/models.py +84 -81
biblicus/retrieval.py +49 -42
biblicus/retrievers/__init__.py +50 -0
biblicus/retrievers/base.py +65 -0
biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
biblicus/retrievers/hybrid.py +301 -0
biblicus/{backends → retrievers}/scan.py +84 -73
biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
biblicus/{backends → retrievers}/tf_vector.py +103 -100
biblicus/sources.py +46 -11
biblicus/text/link.py +6 -0
biblicus/text/prompts.py +18 -8
biblicus/text/tool_loop.py +63 -5
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
biblicus-1.1.0.dist-info/RECORD +91 -0
biblicus/backends/__init__.py +0 -50
biblicus/backends/base.py +0 -65
biblicus/backends/hybrid.py +0 -291
biblicus-0.16.0.dist-info/RECORD +0 -86
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0

biblicus/{backends → retrievers}/tf_vector.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-Deterministic term-frequency vector retrieval backend.
+Deterministic term-frequency vector retriever.
 """
 from __future__ import annotations
@@ -8,93 +8,103 @@ import math
 import re
 from typing import Dict, Iterable, List, Optional, Tuple
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict
 from ..corpus import Corpus
 from ..frontmatter import parse_front_matter
 from ..models import (
     Evidence,
-    ExtractionRunReference,
+    ExtractionSnapshotReference,
     QueryBudget,
     RetrievalResult,
-    RetrievalRun,
-    parse_extraction_run_reference,
+    RetrievalSnapshot,
+    parse_extraction_snapshot_reference,
+)
+from ..retrieval import (
+    apply_budget,
+    create_configuration_manifest,
+    create_snapshot_manifest,
+    hash_text,
 )
-from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
 from ..time import utc_now_iso
-class TfVectorRecipeConfig(BaseModel):
+class TfVectorConfiguration(BaseModel):
     """
-    Configuration for the term-frequency vector retrieval backend.
+    Configuration for the term-frequency vector retriever.
-    :ivar snippet_characters: Maximum characters to include in evidence snippets.
-    :vartype snippet_characters: int
-    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
-    :vartype extraction_run: str or None
+    :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
+    :vartype extraction_snapshot: str or None
+    :ivar snippet_characters: Optional maximum character count for returned evidence text.
+    :vartype snippet_characters: int or None
     """
     model_config = ConfigDict(extra="forbid")
-    snippet_characters: int = Field(default=400, ge=1)
-    extraction_run: Optional[str] = None
+    extraction_snapshot: Optional[str] = None
+    snippet_characters: Optional[int] = None
-class TfVectorBackend:
+class TfVectorRetriever:
     """
-    Deterministic vector backend using term-frequency cosine similarity.
+    Deterministic vector retriever using term-frequency cosine similarity.
-    :ivar backend_id: Backend identifier.
-    :vartype backend_id: str
+    :ivar retriever_id: Retriever identifier.
+    :vartype retriever_id: str
     """
-    backend_id = "tf-vector"
+    retriever_id = "tf-vector"
-    def build_run(
-        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
-    ) -> RetrievalRun:
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
         """
-        Register a vector backend run (no materialization).
+        Register a vector retriever snapshot (no snapshot artifacts).
         :param corpus: Corpus to build against.
         :type corpus: Corpus
-        :param recipe_name: Human-readable recipe name.
-        :type recipe_name: str
-        :param config: Backend-specific configuration values.
-        :type config: dict[str, object]
-        :return: Run manifest describing the build.
-        :rtype: RetrievalRun
+        :param configuration_name: Human-readable configuration name.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: RetrievalSnapshot
         """
-        recipe_config = TfVectorRecipeConfig.model_validate(config)
+        parsed_config = TfVectorConfiguration.model_validate(configuration)
         catalog = corpus.load_catalog()
-        recipe = create_recipe_manifest(
-            backend_id=self.backend_id,
-            name=recipe_name,
-            config=recipe_config.model_dump(),
+        configuration_manifest = create_configuration_manifest(
+            retriever_id=self.retriever_id,
+            name=configuration_name,
+            configuration=parsed_config.model_dump(),
         )
         stats = {
             "items": len(catalog.items),
-            "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
+            "text_items": _count_text_items(corpus, catalog.items.values(), parsed_config),
         }
-        run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
-        corpus.write_run(run)
-        return run
+        snapshot = create_snapshot_manifest(
+            corpus,
+            configuration=configuration_manifest,
+            stats=stats,
+            snapshot_artifacts=[],
+        )
+        corpus.write_snapshot(snapshot)
+        return snapshot
     def query(
         self,
         corpus: Corpus,
         *,
-        run: RetrievalRun,
+        snapshot: RetrievalSnapshot,
         query_text: str,
         budget: QueryBudget,
     ) -> RetrievalResult:
         """
         Query the corpus using term-frequency cosine similarity.
-        :param corpus: Corpus associated with the run.
+        :param corpus: Corpus associated with the snapshot.
         :type corpus: Corpus
-        :param run: Run manifest to use for querying.
-        :type run: RetrievalRun
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: RetrievalSnapshot
         :param query_text: Query text to execute.
         :type query_text: str
         :param budget: Evidence selection budget.
@@ -102,15 +112,15 @@ class TfVectorBackend:
         :return: Retrieval results containing evidence.
         :rtype: RetrievalResult
         """
-        recipe_config = TfVectorRecipeConfig.model_validate(run.recipe.config)
+        parsed_config = TfVectorConfiguration.model_validate(snapshot.configuration.configuration)
         query_tokens = _tokenize_text(query_text)
         if not query_tokens:
             return RetrievalResult(
                 query_text=query_text,
                 budget=budget,
-                run_id=run.run_id,
-                recipe_id=run.recipe.recipe_id,
-                backend_id=self.backend_id,
+                snapshot_id=snapshot.snapshot_id,
+                configuration_id=snapshot.configuration.configuration_id,
+                retriever_id=snapshot.configuration.retriever_id,
                 generated_at=utc_now_iso(),
                 evidence=[],
                 stats={"candidates": 0, "returned": 0},
@@ -118,15 +128,15 @@ class TfVectorBackend:
         query_vector = _term_frequencies(query_tokens)
         query_norm = _vector_norm(query_vector)
         catalog = corpus.load_catalog()
-        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+        extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
         scored_candidates = _score_items(
             corpus,
             catalog.items.values(),
             query_tokens=query_tokens,
             query_vector=query_vector,
             query_norm=query_norm,
-            snippet_characters=recipe_config.snippet_characters,
             extraction_reference=extraction_reference,
+            snippet_characters=parsed_config.snippet_characters,
         )
         sorted_candidates = sorted(
             scored_candidates,
@@ -136,8 +146,8 @@ class TfVectorBackend:
             evidence_item.model_copy(
                 update={
                     "rank": index,
-                    "recipe_id": run.recipe.recipe_id,
-                    "run_id": run.run_id,
+                    "configuration_id": snapshot.configuration.configuration_id,
+                    "snapshot_id": snapshot.snapshot_id,
                 }
             )
             for index, evidence_item in enumerate(sorted_candidates, start=1)
@@ -147,9 +157,9 @@ class TfVectorBackend:
         return RetrievalResult(
             query_text=query_text,
             budget=budget,
-            run_id=run.run_id,
-            recipe_id=run.recipe.recipe_id,
-            backend_id=self.backend_id,
+            snapshot_id=snapshot.snapshot_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            retriever_id=snapshot.configuration.retriever_id,
             generated_at=utc_now_iso(),
             evidence=evidence,
             stats=stats,
@@ -157,33 +167,33 @@ class TfVectorBackend:
 def _resolve_extraction_reference(
-    corpus: Corpus, recipe_config: TfVectorRecipeConfig
-) -> Optional[ExtractionRunReference]:
+    corpus: Corpus, configuration: TfVectorConfiguration
+) -> Optional[ExtractionSnapshotReference]:
     """
-    Resolve an extraction run reference from a recipe config.
+    Resolve an extraction snapshot reference from a configuration.
-    :param corpus: Corpus associated with the recipe.
+    :param corpus: Corpus associated with the configuration.
     :type corpus: Corpus
-    :param recipe_config: Parsed vector recipe configuration.
-    :type recipe_config: TfVectorRecipeConfig
+    :param configuration: Parsed vector configuration.
+    :type configuration: TfVectorConfiguration
     :return: Parsed extraction reference or None.
-    :rtype: ExtractionRunReference or None
-    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    :rtype: ExtractionSnapshotReference or None
+    :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
     """
-    if not recipe_config.extraction_run:
+    if not configuration.extraction_snapshot:
         return None
-    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
-    run_dir = corpus.extraction_run_dir(
+    extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
+    snapshot_dir = corpus.extraction_snapshot_dir(
         extractor_id=extraction_reference.extractor_id,
-        run_id=extraction_reference.run_id,
+        snapshot_id=extraction_reference.snapshot_id,
     )
-    if not run_dir.is_dir():
-        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    if not snapshot_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
     return extraction_reference
 def _count_text_items(
-    corpus: Corpus, items: Iterable[object], recipe_config: TfVectorRecipeConfig
+    corpus: Corpus, items: Iterable[object], configuration: TfVectorConfiguration
 ) -> int:
     """
     Count catalog items that represent text content.
@@ -192,19 +202,19 @@ def _count_text_items(
     :type corpus: Corpus
     :param items: Catalog items to inspect.
     :type items: Iterable[object]
-    :param recipe_config: Parsed vector recipe configuration.
-    :type recipe_config: TfVectorRecipeConfig
+    :param configuration: Parsed vector configuration.
+    :type configuration: TfVectorConfiguration
     :return: Number of text items.
     :rtype: int
     """
     text_item_count = 0
-    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+    extraction_reference = _resolve_extraction_reference(corpus, configuration)
     for catalog_item in items:
         item_id = str(getattr(catalog_item, "id", ""))
         if extraction_reference and item_id:
             extracted_text = corpus.read_extracted_text(
                 extractor_id=extraction_reference.extractor_id,
-                run_id=extraction_reference.run_id,
+                snapshot_id=extraction_reference.snapshot_id,
                 item_id=item_id,
             )
             if isinstance(extracted_text, str) and extracted_text.strip():
@@ -292,7 +302,7 @@ def _load_text_from_item(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     """
     Load a text payload from a catalog item.
@@ -305,15 +315,15 @@ def _load_text_from_item(
     :type relpath: str
     :param media_type: Media type for the stored content.
     :type media_type: str
-    :param extraction_reference: Optional extraction run reference.
-    :type extraction_reference: ExtractionRunReference or None
+    :param extraction_reference: Optional extraction snapshot reference.
+    :type extraction_reference: ExtractionSnapshotReference or None
     :return: Text payload or None if not decodable as text.
     :rtype: str or None
     """
     if extraction_reference:
         extracted_text = corpus.read_extracted_text(
             extractor_id=extraction_reference.extractor_id,
-            run_id=extraction_reference.run_id,
+            snapshot_id=extraction_reference.snapshot_id,
             item_id=item_id,
         )
         if isinstance(extracted_text, str) and extracted_text.strip():
@@ -359,21 +369,13 @@ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]
     return best_start, best_end
-def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
-    """
-    Build a snippet around a match span, constrained by a character budget.
-    :param text: Source text to slice.
-    :type text: str
-    :param span: Match span to center on.
-    :type span: tuple[int, int] or None
-    :param max_chars: Maximum snippet length.
-    :type max_chars: int
-    :return: Snippet text.
-    :rtype: str
-    """
+def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: Optional[int]) -> str:
+    if max_chars is None:
+        return text
     if not text:
         return ""
+    if max_chars <= 0:
+        return ""
     if span is None:
         return text[:max_chars]
     span_start, span_end = span
@@ -390,8 +392,8 @@ def _score_items(
     query_tokens: List[str],
     query_vector: Dict[str, float],
     query_norm: float,
-    snippet_characters: int,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
+    snippet_characters: Optional[int],
 ) -> List[Evidence]:
     """
     Score catalog items and return evidence candidates.
@@ -406,10 +408,10 @@ def _score_items(
     :type query_vector: dict[str, float]
     :param query_norm: Query vector norm.
     :type query_norm: float
-    :param snippet_characters: Snippet length budget.
-    :type snippet_characters: int
-    :param extraction_reference: Optional extraction run reference.
-    :type extraction_reference: ExtractionRunReference or None
+    :param extraction_reference: Optional extraction snapshot reference.
+    :type extraction_reference: ExtractionSnapshotReference or None
+    :param snippet_characters: Optional maximum character count for returned evidence text.
+    :type snippet_characters: int or None
     :return: Evidence candidates with provisional ranks.
     :rtype: list[Evidence]
     """
@@ -437,9 +439,9 @@ def _score_items(
         if similarity <= 0:
             continue
         span = _find_first_match(item_text, query_tokens)
-        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
         span_start = span[0] if span else None
         span_end = span[1] if span else None
+        evidence_text = _build_snippet(item_text, span, max_chars=snippet_characters)
         evidence_items.append(
             Evidence(
                 item_id=str(getattr(catalog_item, "id")),
@@ -447,14 +449,15 @@ def _score_items(
                 media_type=str(media_type),
                 score=float(similarity),
                 rank=1,
-                text=snippet,
+                text=evidence_text,
                 content_ref=None,
                 span_start=span_start,
                 span_end=span_end,
                 stage="tf-vector",
-                recipe_id="",
-                run_id="",
-                hash=hash_text(snippet),
+                configuration_id="",
+                snapshot_id="",
+                metadata=getattr(catalog_item, "metadata", {}) or {},
+                hash=hash_text(evidence_text or ""),
             )
         )
     return evidence_items

biblicus/sources.py CHANGED Viewed

@@ -8,7 +8,7 @@ import mimetypes
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
-from urllib.parse import unquote, urlparse
+from urllib.parse import quote, unquote, urlparse
 from urllib.request import Request, urlopen
@@ -37,6 +37,27 @@ def _filename_from_url_path(path: str) -> str:
     return filename or "download"
+def _sanitize_filename_component(name: str) -> str:
+    allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
+    sanitized_name = "".join(
+        (character if character in allowed_characters else "_") for character in name
+    ).strip()
+    return sanitized_name or "file"
+def _namespaced_filename(
+    *, source_uri: Optional[str], fallback_name: Optional[str], media_type: str
+) -> str:
+    base_name = ""
+    if source_uri:
+        base_name = quote(source_uri, safe="")
+    if not base_name and fallback_name:
+        base_name = _sanitize_filename_component(fallback_name)
+    if not base_name:
+        base_name = "file"
+    return _ensure_extension_for_media_type(base_name, media_type)
 def _media_type_from_filename(name: str) -> str:
     """
     Guess media type from a filename.
@@ -119,8 +140,16 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
     """
     if Path(filename).suffix:
         return filename
-    if media_type == "audio/ogg":
-        ext = ".ogg"
+    media_type_overrides = {
+        "audio/mpeg": ".mp3",
+        "audio/ogg": ".ogg",
+        "audio/wav": ".wav",
+        "audio/x-wav": ".wav",
+        "image/jpeg": ".jpg",
+        "text/html": ".html",
+    }
+    if media_type in media_type_overrides:
+        ext = media_type_overrides[media_type]
     else:
         ext = mimetypes.guess_extension(media_type) or ""
     return filename + ext if ext else filename
@@ -165,11 +194,12 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
         media_type = _media_type_from_filename(path.name)
         if path.suffix.lower() in {".md", ".markdown"}:
             media_type = "text/markdown"
+        resolved_source_uri = source_uri or path.as_uri()
         return SourcePayload(
             data=path.read_bytes(),
             filename=path.name,
             media_type=media_type,
-            source_uri=source_uri or path.as_uri(),
+            source_uri=resolved_source_uri,
         )
     if _looks_like_uri(source):
@@ -187,21 +217,26 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
             with urlopen(request, timeout=30) as response:
                 response_bytes = response.read()
                 content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
-                filename = _filename_from_url_path(parsed.path)
-                media_type = content_type or _media_type_from_filename(filename)
+                fallback_filename = _filename_from_url_path(parsed.path)
+                media_type = content_type or _media_type_from_filename(fallback_filename)
                 if media_type == "application/octet-stream":
                     sniffed = _sniff_media_type_from_bytes(response_bytes)
                     if sniffed:
                         media_type = sniffed
-                        filename = _ensure_extension_for_media_type(filename, media_type)
-                media_type = _normalize_media_type(filename=filename, media_type=media_type)
-                if Path(filename).suffix.lower() in {".md", ".markdown"}:
+                        fallback_filename = _ensure_extension_for_media_type(
+                            fallback_filename, media_type
+                        )
+                media_type = _normalize_media_type(
+                    filename=fallback_filename, media_type=media_type
+                )
+                if Path(fallback_filename).suffix.lower() in {".md", ".markdown"}:
                     media_type = "text/markdown"
+                resolved_source_uri = source_uri or source
                 return SourcePayload(
                     data=response_bytes,
-                    filename=filename,
+                    filename=fallback_filename,
                     media_type=media_type,
-                    source_uri=source_uri or source,
+                    source_uri=resolved_source_uri,
                 )
         raise NotImplementedError(

biblicus/text/link.py CHANGED Viewed

@@ -159,6 +159,8 @@ def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
 def _validate_replace_text(old_str: str, new_str: str) -> None:
+    if "<span" in old_str or "</span>" in old_str:
+        raise ValueError("Text link replacements must target plain text without span tags")
     if strip_span_tags(old_str) != strip_span_tags(new_str):
         raise ValueError("Text link replacements may only insert span tags")
@@ -460,12 +462,16 @@ def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: st
     error_lines = "\n".join(f"- {error}" for error in errors)
     context_section = build_span_context_section(current_text, errors)
     coverage_guidance = _build_coverage_guidance(errors)
+    nested_guidance = ""
+    if any("nested span" in error for error in errors):
+        nested_guidance = "Do not create nested or overlapping spans. Remove nested spans and wrap only bare text.\n"
     return (
         "Your last edit did not validate.\n"
         "Issues:\n"
         f"{error_lines}\n\n"
         f"{context_section}"
         f"{coverage_guidance}"
+        f"{nested_guidance}"
         "Please fix the markup using str_replace. Use id for first mentions and ref for repeats. "
         "Reuse the same id for identical names and do not assign multiple ids to the same name. "
         f"Ids must start with '{id_prefix}'. Try again.\n"

biblicus/text/prompts.py CHANGED Viewed

@@ -11,14 +11,16 @@ DEFAULT_EXTRACT_SYSTEM_PROMPT = (
     "Interpret the word 'return' in the user's request as: wrap the returned text with "
     "<span>...</span> in-place in the current text.\n\n"
     "Use the str_replace tool to insert <span>...</span> tags and the done tool when finished.\n"
+    "For long spans, insert <span> and </span> using separate str_replace calls. "
+    "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
     "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
     "Rules:\n"
     "- Use str_replace only.\n"
     "- old_str must match exactly once in the current text.\n"
     "- When choosing old_str, copy the exact substring (including punctuation/case) from the current text.\n"
     "- old_str and new_str must be non-empty strings.\n"
-    "- new_str must be identical to old_str with only <span> and </span> inserted.\n"
-    "- Do not include <span> or </span> inside old_str or new_str.\n"
+    "- new_str must be identical to old_str with only <span> and/or </span> inserted.\n"
+    "- Do not include <span> or </span> inside old_str.\n"
     "- Do not insert nested spans.\n"
     "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
     "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
@@ -49,14 +51,18 @@ DEFAULT_ANNOTATE_SYSTEM_PROMPT = (
     '<span ATTRIBUTE="VALUE">...</span> in-place in the current text.\n'
     "Each span must include exactly one attribute from: {{ allowed_attributes }}.\n\n"
     "Use the str_replace tool to insert span tags and the done tool when finished.\n"
+    "For long spans, insert the opening and closing tags using separate str_replace calls. "
+    "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
     "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
     "Rules:\n"
     "- Use str_replace only.\n"
     "- old_str must match exactly once in the current text.\n"
     "- old_str and new_str must be non-empty strings.\n"
-    "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
-    "- Do not include <span or </span> inside old_str or new_str.\n"
+    "- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
+    "- Do not include <span or </span> inside old_str.\n"
     "- Do not insert nested spans.\n"
+    "- Do not wrap text that is already inside a span; spans must never overlap.\n"
+    "- If a name appears inside an existing span, leave it alone and wrap only bare text.\n"
     "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
     "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
     "- Do not delete, reorder, paraphrase, or label text beyond the span attributes.\n\n"
@@ -78,13 +84,15 @@ DEFAULT_LINK_SYSTEM_PROMPT = (
     "- Do not call done until every repeated name or entity in the text is wrapped.\n"
     "- If a name appears multiple times, there must be one id and refs for every later occurrence.\n\n"
     "Use the str_replace tool to insert span tags and the done tool when finished.\n"
+    "For long spans, insert the opening and closing tags using separate str_replace calls. "
+    "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
     "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
     "Rules:\n"
     "- Use str_replace only.\n"
     "- old_str must match exactly once in the current text.\n"
     "- old_str and new_str must be non-empty strings.\n"
-    "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
-    "- Do not include <span or </span> inside old_str or new_str.\n"
+    "- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
+    "- Do not include <span or </span> inside old_str.\n"
     "- Do not insert nested spans.\n"
     "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
     "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
@@ -98,13 +106,15 @@ DEFAULT_REDACT_SYSTEM_PROMPT = (
     "<span>...</span> in-place in the current text.\n"
     "If redaction types are provided, use a redact attribute with one of: {{ redaction_types }}.\n\n"
     "Use the str_replace tool to insert span tags and the done tool when finished.\n"
+    "For long spans, insert the opening and closing tags using separate str_replace calls. "
+    "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
     "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
     "Rules:\n"
     "- Use str_replace only.\n"
     "- old_str must match exactly once in the current text.\n"
     "- old_str and new_str must be non-empty strings.\n"
-    "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
-    "- Do not include <span or </span> inside old_str or new_str.\n"
+    "- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
+    "- Do not include <span or </span> inside old_str.\n"
     "- Do not insert nested spans.\n"
     "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
     "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"

biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

biblicus 0.16.0-py3-none-any.whl → 1.1.0-py3-none-any.whl