PyPI - biblicus - Versions diffs - 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl - Mend

biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

biblicus/__init__.py +1 -1
biblicus/_vendor/dotyaml/__init__.py +2 -2
biblicus/_vendor/dotyaml/loader.py +40 -1
biblicus/ai/__init__.py +39 -0
biblicus/ai/embeddings.py +114 -0
biblicus/ai/llm.py +138 -0
biblicus/ai/models.py +226 -0
biblicus/analysis/__init__.py +5 -2
biblicus/analysis/markov.py +1624 -0
biblicus/analysis/models.py +754 -1
biblicus/analysis/topic_modeling.py +98 -19
biblicus/backends/hybrid.py +6 -1
biblicus/backends/sqlite_full_text_search.py +4 -2
biblicus/cli.py +118 -23
biblicus/context.py +2 -2
biblicus/recipes.py +136 -0
biblicus/text/__init__.py +43 -0
biblicus/text/annotate.py +222 -0
biblicus/text/extract.py +210 -0
biblicus/text/link.py +519 -0
biblicus/text/markup.py +200 -0
biblicus/text/models.py +319 -0
biblicus/text/prompts.py +113 -0
biblicus/text/redact.py +229 -0
biblicus/text/slice.py +155 -0
biblicus/text/tool_loop.py +334 -0
{biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/METADATA +90 -26
{biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/RECORD +32 -17
biblicus/analysis/llm.py +0 -106
{biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/WHEEL +0 -0
{biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/entry_points.txt +0 -0
{biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/licenses/LICENSE +0 -0
{biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/top_level.txt +0 -0

biblicus/text/annotate.py ADDED Viewed

@@ -0,0 +1,222 @@
+"""
+Agentic text annotation using virtual file edits.
+"""
+from __future__ import annotations
+from typing import Iterable, List, Sequence
+from jinja2 import Environment, StrictUndefined
+from .markup import (
+    TextAnnotatedSpan,
+    build_span_context_section,
+    parse_span_markup,
+    strip_span_tags,
+)
+from .models import TextAnnotateRequest, TextAnnotateResult
+from .tool_loop import request_confirmation, run_tool_loop
+# Attribute names accepted on annotation spans when a request does not supply its own list.
+DEFAULT_ANNOTATION_ATTRIBUTES = ["label", "phase", "role", "evidence", "entity"]
+def apply_text_annotate(request: TextAnnotateRequest) -> TextAnnotateResult:
+    """
+    Apply text annotation using a language model.
+
+    The model edits a copy of ``request.text`` through the tool loop. When
+    ``request.mock_marked_up_text`` is set, that text is validated and returned instead and
+    the tool loop is not run. If the loop leaves the text unchanged, a confirmation exchange
+    of at most two rounds is requested before an empty result is accepted.
+
+    :param request: Text annotate request.
+    :type request: TextAnnotateRequest
+    :return: Text annotate result.
+    :rtype: TextAnnotateResult
+    :raises ValueError: If model output is invalid or text is modified. Empty outputs trigger
+        a confirmation round and return a warning when confirmed.
+    """
+    warnings: List[str] = []
+    allowed_attributes = _resolve_allowed_attributes(request.allowed_attributes)
+    # The system prompt is rendered before the mock short-circuit below, so a template that
+    # fails to render raises for mock requests as well.
+    system_prompt = _render_system_prompt(
+        request.system_prompt,
+        allowed_attributes=allowed_attributes,
+    )
+    if request.mock_marked_up_text is not None:
+        # Mock path: validate the supplied markup and return without calling the client.
+        return _build_mock_result(
+            request,
+            request.mock_marked_up_text,
+            allowed_attributes=allowed_attributes,
+        )
+    result = run_tool_loop(
+        text=request.text,
+        client=request.client,
+        system_prompt=system_prompt,
+        prompt_template=request.prompt_template,
+        max_rounds=request.max_rounds,
+        max_edits_per_round=request.max_edits_per_round,
+        apply_str_replace=_apply_annotate_replace,
+        validate_text=lambda current_text: _validate_annotation_markup(
+            current_text, allowed_attributes
+        ),
+        build_retry_message=lambda errors, current_text: _build_retry_message(
+            errors, current_text, allowed_attributes
+        ),
+    )
+    if not result.done:
+        # An unfinished loop with a recorded error is fatal; without one it is only a warning.
+        if result.last_error:
+            raise ValueError(f"Text annotate failed: {result.last_error}")
+        warnings.append("Text annotate reached max rounds without done=true")
+    if result.text == request.text:
+        # Nothing was inserted: ask the model to confirm the empty result or to add spans.
+        if result.last_error:
+            raise ValueError(result.last_error)
+        confirmation = request_confirmation(
+            result=result,
+            text=result.text,
+            client=request.client,
+            system_prompt=system_prompt,
+            prompt_template=request.prompt_template,
+            max_rounds=2,
+            max_edits_per_round=request.max_edits_per_round,
+            apply_str_replace=_apply_annotate_replace,
+            confirmation_message=_build_empty_confirmation_message(result.text),
+            validate_text=lambda current_text: _validate_annotation_markup(
+                current_text, allowed_attributes
+            ),
+            build_retry_message=lambda errors, current_text: _build_retry_message(
+                errors, current_text, allowed_attributes
+            ),
+        )
+        if not confirmation.done:
+            if confirmation.last_error:
+                raise ValueError(f"Text annotate failed: {confirmation.last_error}")
+            warnings.append("Text annotate confirmation reached max rounds without done=true")
+        # The confirmation exchange may have added spans, so its text is validated in full.
+        _validate_preserved_text(original=request.text, marked_up=confirmation.text)
+        spans = parse_span_markup(confirmation.text)
+        validation_errors = _validate_annotation_spans(spans, allowed_attributes)
+        if validation_errors:
+            raise ValueError("; ".join(validation_errors))
+        if not spans:
+            warnings.append("Text annotate returned no spans; model confirmed empty result")
+        return TextAnnotateResult(
+            marked_up_text=confirmation.text,
+            spans=spans,
+            warnings=warnings,
+        )
+    # Normal path: the loop changed the text, so check that only span tags were added.
+    _validate_preserved_text(original=request.text, marked_up=result.text)
+    spans = parse_span_markup(result.text)
+    validation_errors = _validate_annotation_spans(spans, allowed_attributes)
+    if validation_errors:
+        raise ValueError("; ".join(validation_errors))
+    return TextAnnotateResult(marked_up_text=result.text, spans=spans, warnings=warnings)
+def _build_mock_result(
+    request: TextAnnotateRequest,
+    marked_up_text: str,
+    *,
+    allowed_attributes: Sequence[str],
+) -> TextAnnotateResult:
+    if marked_up_text == request.text:
+        raise ValueError("Text annotate produced no spans")
+    _validate_preserved_text(original=request.text, marked_up=marked_up_text)
+    spans = parse_span_markup(marked_up_text)
+    errors = _validate_annotation_spans(spans, allowed_attributes)
+    if errors:
+        raise ValueError("; ".join(errors))
+    return TextAnnotateResult(marked_up_text=marked_up_text, spans=spans, warnings=[])
+def _resolve_allowed_attributes(allowed: Sequence[str] | None) -> List[str]:
+    if allowed is None:
+        return list(DEFAULT_ANNOTATION_ATTRIBUTES)
+    return [value for value in allowed]
+def _render_system_prompt(template: str, *, allowed_attributes: Sequence[str]) -> str:
+    env = Environment(undefined=StrictUndefined)
+    rendered = env.from_string(template).render(
+        allowed_attributes=list(allowed_attributes),
+    )
+    return rendered
+def _apply_annotate_replace(text: str, old_str: str, new_str: str) -> str:
+    occurrences = text.count(old_str)
+    if occurrences == 0:
+        raise ValueError("Text annotate replacement old_str not found")
+    if occurrences > 1:
+        raise ValueError("Text annotate replacement old_str is not unique")
+    _validate_replace_text(old_str, new_str)
+    return text.replace(old_str, new_str, 1)
+def _validate_replace_text(old_str: str, new_str: str) -> None:
+    if strip_span_tags(old_str) != strip_span_tags(new_str):
+        raise ValueError("Text annotate replacements may only insert span tags")
+def _validate_preserved_text(*, original: str, marked_up: str) -> None:
+    if strip_span_tags(marked_up) != original:
+        raise ValueError("Text annotate edits modified the source text")
+def _validate_annotation_markup(
+    marked_up_text: str, allowed_attributes: Sequence[str]
+) -> List[str]:
+    try:
+        spans = parse_span_markup(marked_up_text)
+    except ValueError as exc:
+        return [str(exc)]
+    return _validate_annotation_spans(spans, allowed_attributes)
+def _validate_annotation_spans(
+    spans: Iterable[TextAnnotatedSpan], allowed_attributes: Sequence[str]
+) -> List[str]:
+    errors: List[str] = []
+    allowed_set = set(allowed_attributes)
+    for span in spans:
+        if not span.attributes:
+            errors.append(
+                f"Span {span.index} is missing an attribute. Allowed attributes: {', '.join(allowed_attributes)}"
+            )
+            continue
+        if len(span.attributes) > 1:
+            errors.append(f"Span {span.index} has multiple attributes; only one is allowed")
+            continue
+        name, value = next(iter(span.attributes.items()))
+        if name not in allowed_set:
+            errors.append(
+                f"Span {span.index} uses attribute '{name}'. Allowed attributes: {', '.join(allowed_attributes)}"
+            )
+        if value.strip() == "":
+            errors.append(f"Span {span.index} has an empty value for attribute '{name}'")
+    return errors
+def _build_retry_message(
+    errors: Sequence[str], current_text: str, allowed_attributes: Sequence[str]
+) -> str:
+    error_lines = "\n".join(f"- {error}" for error in errors)
+    context_section = build_span_context_section(current_text, errors)
+    return (
+        "Your last edit did not validate.\n"
+        "Issues:\n"
+        f"{error_lines}\n\n"
+        f"{context_section}"
+        "Please fix the markup using str_replace. Each span must include exactly one attribute. "
+        "Allowed attributes are: "
+        f"{', '.join(allowed_attributes)}. Try again.\n"
+        "Current text:\n"
+        f"---\n{current_text}\n---"
+    )
+def _build_empty_confirmation_message(text: str) -> str:
+    return (
+        "No annotated spans were inserted. If there are truly no spans to return, "
+        "call done again without changes. Otherwise insert span tags with the correct attributes.\n"
+        "Current text:\n"
+        f"---\n{text}\n---"
+    )

biblicus/text/extract.py ADDED Viewed

@@ -0,0 +1,210 @@
+"""
+Agentic text extraction using virtual file edits.
+"""
+from __future__ import annotations
+import re
+from typing import Any, List, Optional
+from .models import TextExtractRequest, TextExtractResult, TextExtractSpan
+from .tool_loop import request_confirmation, run_tool_loop
+def apply_text_extract(request: TextExtractRequest) -> TextExtractResult:
+    """
+    Apply text extraction using a language model.
+
+    The model edits a copy of ``request.text`` through the tool loop. When
+    ``request.mock_marked_up_text`` is set, that text is validated and returned instead and
+    the tool loop is not run. If the loop leaves the text unchanged, a confirmation exchange
+    of at most two rounds is requested before an empty result is accepted.
+
+    :param request: Text extract request.
+    :type request: TextExtractRequest
+    :return: Text extract result.
+    :rtype: TextExtractResult
+    :raises ValueError: If model output is invalid or text is modified. Empty outputs trigger
+        a confirmation round and return a warning when confirmed.
+    """
+    if request.mock_marked_up_text is not None:
+        # Mock path: validate the supplied markup and return without calling the client.
+        return _build_mock_result(request, request.mock_marked_up_text)
+    warnings: List[str] = []
+    result = run_tool_loop(
+        text=request.text,
+        client=request.client,
+        system_prompt=request.system_prompt,
+        prompt_template=request.prompt_template,
+        max_rounds=request.max_rounds,
+        max_edits_per_round=request.max_edits_per_round,
+        apply_str_replace=_apply_extract_replace,
+        validate_text=_validate_extract_markup,
+        build_retry_message=_build_retry_message,
+    )
+    if not result.done:
+        if result.last_error:
+            # Prefer the first validation issue found in result.messages over last_error.
+            message_error = _extract_validation_error_from_messages(result.messages)
+            if message_error:
+                raise ValueError(f"Text extract failed: {message_error}")
+            raise ValueError(f"Text extract failed: {result.last_error}")
+        # An unfinished loop without a recorded error is only a warning.
+        warnings.append("Text extract reached max rounds without done=true")
+    if result.text == request.text:
+        # Nothing was inserted: ask the model to confirm the empty result or to add spans.
+        if result.last_error:
+            raise ValueError(result.last_error)
+        # NOTE(review): unlike the main loop, no validate_text/build_retry_message is passed
+        # here -- confirm that request_confirmation's defaults are what is intended.
+        confirmation = request_confirmation(
+            result=result,
+            text=result.text,
+            client=request.client,
+            system_prompt=request.system_prompt,
+            prompt_template=request.prompt_template,
+            max_rounds=2,
+            max_edits_per_round=request.max_edits_per_round,
+            apply_str_replace=_apply_extract_replace,
+            confirmation_message=_build_empty_confirmation_message(result.text),
+        )
+        if not confirmation.done:
+            if confirmation.last_error:
+                raise ValueError(f"Text extract failed: {confirmation.last_error}")
+            warnings.append("Text extract confirmation reached max rounds without done=true")
+        # The confirmation exchange may have added spans, so its text is validated in full.
+        _validate_preserved_text(original=request.text, marked_up=confirmation.text)
+        spans = _extract_spans(marked_up_text=confirmation.text)
+        if not spans:
+            warnings.append("Text extract returned no spans; model confirmed empty result")
+        return TextExtractResult(
+            marked_up_text=confirmation.text,
+            spans=spans,
+            warnings=warnings,
+        )
+    # Normal path: the loop changed the text, so check that only span tags were added.
+    _validate_preserved_text(original=request.text, marked_up=result.text)
+    spans = _extract_spans(marked_up_text=result.text)
+    return TextExtractResult(marked_up_text=result.text, spans=spans, warnings=warnings)
+def _build_mock_result(request: TextExtractRequest, marked_up_text: str) -> TextExtractResult:
+    if marked_up_text == request.text:
+        raise ValueError("Text extract produced no spans")
+    _validate_preserved_text(original=request.text, marked_up=marked_up_text)
+    spans = _extract_spans(marked_up_text=marked_up_text)
+    return TextExtractResult(marked_up_text=marked_up_text, spans=spans, warnings=[])
+def _apply_extract_replace(text: str, old_str: str, new_str: str) -> str:
+    occurrences = text.count(old_str)
+    if occurrences == 0:
+        raise ValueError("Text extract replacement old_str not found")
+    if occurrences > 1:
+        raise ValueError("Text extract replacement old_str is not unique")
+    _validate_replace_text(old_str, new_str)
+    return text.replace(old_str, new_str, 1)
+def _validate_replace_text(old_str: str, new_str: str) -> None:
+    if _strip_span_tags(old_str) != _strip_span_tags(new_str):
+        raise ValueError("Text extract replacements may only insert span tags")
+def _validate_preserved_text(*, original: str, marked_up: str) -> None:
+    if _strip_span_tags(marked_up) != original:
+        raise ValueError("Text extract edits modified the source text")
+def _strip_span_tags(text: str) -> str:
+    return text.replace("<span>", "").replace("</span>", "")
+def _extract_spans(*, marked_up_text: str) -> List[TextExtractSpan]:
+    open_tag = "<span>"
+    close_tag = "</span>"
+    tag_pattern = re.compile(re.escape(open_tag) + "|" + re.escape(close_tag))
+    spans: List[TextExtractSpan] = []
+    cursor = 0
+    original_index = 0
+    span_start = None
+    span_text = ""
+    for match in tag_pattern.finditer(marked_up_text):
+        chunk = marked_up_text[cursor : match.start()]
+        if chunk:
+            if span_start is not None:
+                span_text += chunk
+            original_index += len(chunk)
+        tag = match.group(0)
+        if tag == open_tag:
+            if span_start is not None:
+                raise ValueError("Text extract contains nested spans")
+            span_start = original_index
+            span_text = ""
+        else:
+            if span_start is None:
+                raise ValueError("Text extract contains an unmatched closing tag")
+            span_end = original_index
+            spans.append(
+                TextExtractSpan(
+                    index=len(spans) + 1,
+                    start_char=span_start,
+                    end_char=span_end,
+                    text=span_text,
+                )
+            )
+            span_start = None
+            span_text = ""
+        cursor = match.end()
+    tail = marked_up_text[cursor:]
+    if tail:
+        if span_start is not None:
+            span_text += tail
+        original_index += len(tail)
+    if span_start is not None:
+        raise ValueError("Text extract contains an unclosed span")
+    return spans
+def _validate_extract_markup(marked_up_text: str) -> List[str]:
+    try:
+        _extract_spans(marked_up_text=marked_up_text)
+    except ValueError as exc:
+        return [str(exc)]
+    return []
+def _build_retry_message(errors: List[str], current_text: str) -> str:
+    error_lines = "\n".join(f"- {error}" for error in errors)
+    return (
+        "Your last edit did not validate.\n"
+        "Issues:\n"
+        f"{error_lines}\n\n"
+        "Please fix the markup using str_replace. "
+        "Do not nest <span> tags and do not create unmatched tags.\n"
+        "Current text:\n"
+        f"---\n{current_text}\n---"
+    )
+def _extract_validation_error_from_messages(
+    messages: List[dict[str, Any]],
+) -> Optional[str]:
+    for message in messages:
+        if message.get("role") != "user":
+            continue
+        content = str(message.get("content") or "")
+        if "Your last edit did not validate." not in content:
+            continue
+        if "Issues:" not in content:
+            continue
+        lines = content.splitlines()
+        try:
+            issues_index = lines.index("Issues:")
+        except ValueError:
+            continue
+        for line in lines[issues_index + 1 :]:
+            if line.startswith("- "):
+                return line[2:].strip()
+    return None
+def _build_empty_confirmation_message(text: str) -> str:
+    return (
+        "No spans were inserted. If there are truly no spans to return, call done again without changes. "
+        "Otherwise insert <span> tags for the requested text.\n"
+        "Current text:\n"
+        f"---\n{text}\n---"
+    )

biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl