biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +2 -2
- biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus/ai/__init__.py +39 -0
- biblicus/ai/embeddings.py +114 -0
- biblicus/ai/llm.py +138 -0
- biblicus/ai/models.py +226 -0
- biblicus/analysis/__init__.py +5 -2
- biblicus/analysis/markov.py +1624 -0
- biblicus/analysis/models.py +754 -1
- biblicus/analysis/topic_modeling.py +98 -19
- biblicus/backends/hybrid.py +6 -1
- biblicus/backends/sqlite_full_text_search.py +4 -2
- biblicus/cli.py +118 -23
- biblicus/context.py +2 -2
- biblicus/recipes.py +136 -0
- biblicus/text/__init__.py +43 -0
- biblicus/text/annotate.py +222 -0
- biblicus/text/extract.py +210 -0
- biblicus/text/link.py +519 -0
- biblicus/text/markup.py +200 -0
- biblicus/text/models.py +319 -0
- biblicus/text/prompts.py +113 -0
- biblicus/text/redact.py +229 -0
- biblicus/text/slice.py +155 -0
- biblicus/text/tool_loop.py +334 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/METADATA +90 -26
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/RECORD +32 -17
- biblicus/analysis/llm.py +0 -106
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/WHEEL +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/top_level.txt +0 -0
biblicus/text/markup.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared span markup parsing utilities.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import Dict, List, Sequence
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TextAnnotatedSpan(BaseModel):
    """
    Span annotated with arbitrary attributes.

    When built by ``parse_span_markup``, the offsets refer to the text with all span
    tags removed, and ``end_char`` is the offset just past the last character of the
    span. The model itself does not check that ``end_char >= start_char``.

    :param index: One-based index of the span in the output order.
    :type index: int
    :param start_char: Start character offset in the original text.
    :type start_char: int
    :param end_char: End character offset in the original text.
    :type end_char: int
    :param text: Span text.
    :type text: str
    :param attributes: Attribute mapping extracted from the span tag.
    :type attributes: dict[str, str]
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    index: int = Field(ge=1)
    start_char: int = Field(ge=0)
    end_char: int = Field(ge=0)
    text: str
    # A fresh dict per instance; empty when the tag carried no attributes.
    attributes: Dict[str, str] = Field(default_factory=dict)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Matches either an opening span tag (attributes optional) or the literal closing tag.
_TAG_PATTERN = re.compile(r"<span\b[^>]*>|</span>")
# Matches one opening span tag and captures everything between "<span" and ">".
_OPEN_TAG_PATTERN = re.compile(r"<span\b([^>]*)>")
# Matches a single name="value" pair; the value may not contain a double quote.
_ATTRIBUTE_PATTERN = re.compile(r'([A-Za-z_][A-Za-z0-9_-]*)="([^"]*)"')
# Matches "Span <digits>" inside an error message and captures the digits.
_SPAN_INDEX_PATTERN = re.compile(r"Span (\d+)")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def strip_span_tags(text: str) -> str:
    """
    Return ``text`` with every opening and closing span tag deleted.

    Only the tags themselves are dropped; the characters between them are kept.

    :param text: Text that may contain span tags.
    :type text: str
    :return: The same text without span tags.
    :rtype: str
    """
    span_tag = re.compile(r"</?span\b[^>]*>")
    return span_tag.sub("", text)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def parse_span_markup(marked_up_text: str) -> List[TextAnnotatedSpan]:
    """
    Turn span-tagged text into a list of annotated spans.

    Offsets on the returned spans count characters of the text with all span tags
    removed. Spans are numbered from one in the order their closing tags appear.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :return: Parsed spans with attributes.
    :rtype: list[TextAnnotatedSpan]
    :raises ValueError: If tags are malformed or nested.
    """
    parsed: List[TextAnnotatedSpan] = []
    position = 0  # offset in marked_up_text just past the last tag consumed
    plain_offset = 0  # offset in the tag-free text
    open_start = None  # tag-free offset of the currently open span, if any
    open_attributes: Dict[str, str] = {}

    for tag_match in _TAG_PATTERN.finditer(marked_up_text):
        between = marked_up_text[position : tag_match.start()]
        position = tag_match.end()
        plain_offset += len(between)
        token = tag_match.group(0)

        if token.startswith("<span"):
            if open_start is not None:
                raise ValueError("Text markup contains nested spans")
            open_start = plain_offset
            open_attributes = _parse_span_attributes(token)
            continue

        if open_start is None:
            raise ValueError("Text markup contains an unmatched closing tag")
        # Nesting is rejected above, so the text since the opening tag is the whole span.
        parsed.append(
            TextAnnotatedSpan(
                index=len(parsed) + 1,
                start_char=open_start,
                end_char=plain_offset,
                text=between,
                attributes=open_attributes,
            )
        )
        open_start = None
        open_attributes = {}

    if open_start is not None:
        raise ValueError("Text markup contains an unclosed span")
    return parsed
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def extract_span_indices(errors: Sequence[str]) -> List[int]:
    """
    Extract span indices referenced in error messages.

    Every ``Span <number>`` occurrence is collected, so a single message that names
    several spans contributes all of them. Messages with no such reference are
    skipped.

    :param errors: Validation error messages.
    :type errors: Sequence[str]
    :return: Sorted list of referenced span indices, without duplicates.
    :rtype: list[int]
    """
    found = set()
    for error in errors:
        # findall returns the captured digit strings for every match in the message.
        found.update(int(number) for number in _SPAN_INDEX_PATTERN.findall(error))
    return sorted(found)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def summarize_span_context(marked_up_text: str, span_indices: Sequence[int]) -> List[str]:
    """
    Describe the requested spans as one-line summaries.

    Each summary has the form ``Span <index>: <text>``, with runs of whitespace in
    the span text collapsed to single spaces. Indices with no matching span, and
    spans whose text is empty after collapsing, are left out.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param span_indices: Span indices to summarize.
    :type span_indices: Sequence[int]
    :return: Human-readable span summaries.
    :rtype: list[str]
    :raises ValueError: If the markup is invalid.
    """
    lookup = {}
    for parsed in parse_span_markup(marked_up_text):
        lookup[parsed.index] = parsed

    lines: List[str] = []
    for index in span_indices:
        if index not in lookup:
            continue
        collapsed = " ".join(lookup[index].text.split())
        if not collapsed:
            continue
        lines.append(f"Span {index}: {collapsed}")
    return lines
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def build_span_context_section(marked_up_text: str, errors: Sequence[str]) -> str:
    """
    Build the "Relevant spans" block used in retry messages.

    The result is empty when the errors name no span, when the markup cannot be
    parsed, or when none of the named spans yields a summary.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param errors: Validation error messages.
    :type errors: Sequence[str]
    :return: Formatted span context block or empty string.
    :rtype: str
    """
    referenced = extract_span_indices(errors)
    if referenced:
        try:
            described = summarize_span_context(marked_up_text, referenced)
        except ValueError:
            # Invalid markup: there is nothing reliable to show.
            described = []
        if described:
            bullet_list = "\n".join(f"- {entry}" for entry in described)
            return f"Relevant spans:\n{bullet_list}\n\n"
    return ""
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _parse_span_attributes(tag_text: str) -> Dict[str, str]:
    """
    Read the ``name="value"`` pairs out of one opening span tag.

    Backslash-escaped double quotes in the attribute text are unescaped before
    parsing.

    :param tag_text: A complete opening span tag.
    :type tag_text: str
    :return: Attribute names mapped to their values; empty when the tag has none.
    :rtype: dict[str, str]
    :raises ValueError: If the tag is not an opening span tag, repeats an attribute
        name, or contains text that is not a ``name="value"`` pair.
    """
    opened = _OPEN_TAG_PATTERN.fullmatch(tag_text)
    if opened is None:
        raise ValueError("Text markup contains an invalid span tag")

    raw = opened.group(1).strip().replace('\\"', '"')
    parsed: Dict[str, str] = {}
    if raw:
        for name, value in _ATTRIBUTE_PATTERN.findall(raw):
            if name in parsed:
                raise ValueError("Text markup contains duplicate span attributes")
            parsed[name] = value
        # Anything left once the well-formed pairs are removed is unsupported syntax.
        if _ATTRIBUTE_PATTERN.sub("", raw).strip():
            raise ValueError("Text markup contains unsupported span attributes")
    return parsed
|
biblicus/text/models.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pydantic models for agentic text utilities.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from ..ai.models import LlmClientConfig
|
|
12
|
+
from .markup import TextAnnotatedSpan
|
|
13
|
+
from .prompts import (
|
|
14
|
+
DEFAULT_ANNOTATE_SYSTEM_PROMPT,
|
|
15
|
+
DEFAULT_EXTRACT_SYSTEM_PROMPT,
|
|
16
|
+
DEFAULT_LINK_SYSTEM_PROMPT,
|
|
17
|
+
DEFAULT_PROMPT_TEMPLATE,
|
|
18
|
+
DEFAULT_REDACT_SYSTEM_PROMPT,
|
|
19
|
+
DEFAULT_SLICE_SYSTEM_PROMPT,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TextToolLoopRequest(BaseModel):
    """
    Base request for a tool-loop text operation driven by a language model.

    The ``{text}`` placeholder must appear in the system prompt and must not appear
    in the prompt template; both rules are checked after field validation.

    :param text: Input text to process.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param prompt_template: Prompt template describing what to return (must not include ``{text}``).
    :type prompt_template: str
    :param system_prompt: System prompt template containing ``{text}``. Required on this base
        class; the specific utility requests supply built-in defaults.
    :type system_prompt: str
    :param max_rounds: Maximum number of edit rounds.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param mock_marked_up_text: Optional pre-rendered markup for deterministic tests.
    :type mock_marked_up_text: str or None
    """

    model_config = ConfigDict(extra="forbid")

    text: str = Field(min_length=1)
    client: LlmClientConfig
    prompt_template: str = Field(default=DEFAULT_PROMPT_TEMPLATE, min_length=1)
    system_prompt: str = Field(min_length=1)
    max_rounds: int = Field(default=6, ge=1)
    max_edits_per_round: int = Field(default=500, ge=1)
    mock_marked_up_text: Optional[str] = Field(default=None, min_length=1)

    @model_validator(mode="after")
    def _validate_prompts(self) -> "TextToolLoopRequest":
        """Enforce where the ``{text}`` placeholder may and may not appear."""
        placeholder = "{text}"
        system_has_placeholder = placeholder in self.system_prompt
        if not system_has_placeholder:
            raise ValueError("system_prompt must include {text}")
        template_has_placeholder = placeholder in self.prompt_template
        if template_has_placeholder:
            raise ValueError("prompt_template must not include {text}")
        return self
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class TextExtractRequest(TextToolLoopRequest):
    """
    Request to apply text extract using a language model.

    Identical to :class:`TextToolLoopRequest` except that ``system_prompt`` has a
    default, so callers can omit it.

    :param text: Input text to annotate with XML span tags.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param prompt_template: Prompt template describing what to return (must not include ``{text}``).
    :type prompt_template: str
    :param system_prompt: System prompt template containing ``{text}``. Defaults to the built-in
        text extract system prompt.
    :type system_prompt: str
    :param max_rounds: Maximum number of edit rounds.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param mock_marked_up_text: Optional pre-rendered markup for deterministic tests (inherited).
    :type mock_marked_up_text: str or None
    """

    # Overrides the required base field with the extract default.
    system_prompt: str = Field(default=DEFAULT_EXTRACT_SYSTEM_PROMPT, min_length=1)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class TextSliceRequest(TextToolLoopRequest):
    """
    Request to apply text slice using a language model.

    Identical to :class:`TextToolLoopRequest` except that ``system_prompt`` has a
    default, so callers can omit it.

    :param text: Input text to mark with slice markers.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param prompt_template: Prompt template describing what to return (must not include ``{text}``).
    :type prompt_template: str
    :param system_prompt: System prompt template containing ``{text}``. Defaults to the built-in
        text slice system prompt.
    :type system_prompt: str
    :param max_rounds: Maximum number of edit rounds.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param mock_marked_up_text: Optional pre-rendered markup for deterministic tests (inherited).
    :type mock_marked_up_text: str or None
    """

    # Overrides the required base field with the slice default.
    system_prompt: str = Field(default=DEFAULT_SLICE_SYSTEM_PROMPT, min_length=1)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class TextAnnotateRequest(TextToolLoopRequest):
    """
    Request to apply text annotation using span attributes.

    Adds ``allowed_attributes`` to :class:`TextToolLoopRequest` and gives
    ``system_prompt`` a default.

    :param text: Input text to annotate with XML span tags.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param prompt_template: Prompt template describing what to return (must not include ``{text}``).
    :type prompt_template: str
    :param system_prompt: System prompt containing ``{text}``. Defaults to the built-in
        text annotate system prompt.
    :type system_prompt: str
    :param allowed_attributes: Optional list of allowed span attribute names. Defaults to ``None``.
    :type allowed_attributes: list[str] or None
    :param max_rounds: Maximum number of edit rounds.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param mock_marked_up_text: Optional pre-rendered markup for deterministic tests (inherited).
    :type mock_marked_up_text: str or None
    """

    # Overrides the required base field with the annotate default.
    system_prompt: str = Field(default=DEFAULT_ANNOTATE_SYSTEM_PROMPT, min_length=1)
    # NOTE(review): how None is interpreted is decided by the consumer, not by this model.
    allowed_attributes: Optional[List[str]] = None
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class TextRedactRequest(TextToolLoopRequest):
    """
    Request to apply text redaction using span markers.

    Adds ``redaction_types`` to :class:`TextToolLoopRequest` and gives
    ``system_prompt`` a default.

    :param text: Input text to annotate with XML span tags.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param prompt_template: Prompt template describing what to return (must not include ``{text}``).
    :type prompt_template: str
    :param system_prompt: System prompt containing ``{text}``. Defaults to the built-in
        text redact system prompt.
    :type system_prompt: str
    :param redaction_types: Optional list of allowed redaction types. When omitted, no attributes are allowed.
    :type redaction_types: list[str] or None
    :param max_rounds: Maximum number of edit rounds.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param mock_marked_up_text: Optional pre-rendered markup for deterministic tests (inherited).
    :type mock_marked_up_text: str or None
    """

    # Overrides the required base field with the redact default.
    system_prompt: str = Field(default=DEFAULT_REDACT_SYSTEM_PROMPT, min_length=1)
    # The "no attributes allowed" rule for None is enforced by the consumer, not here.
    redaction_types: Optional[List[str]] = None
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class TextLinkRequest(TextToolLoopRequest):
    """
    Request to apply text linking using id/ref span attributes.

    Adds ``id_prefix`` to :class:`TextToolLoopRequest` and gives ``system_prompt``
    a default.

    :param text: Input text to annotate with XML span tags.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param prompt_template: Prompt template describing what to return (must not include ``{text}``).
    :type prompt_template: str
    :param system_prompt: System prompt containing ``{text}``. Defaults to the built-in
        text link system prompt.
    :type system_prompt: str
    :param id_prefix: Prefix required for id attributes. Defaults to ``"link_"``.
    :type id_prefix: str
    :param max_rounds: Maximum number of edit rounds.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param mock_marked_up_text: Optional pre-rendered markup for deterministic tests (inherited).
    :type mock_marked_up_text: str or None
    """

    # Overrides the required base field with the link default.
    system_prompt: str = Field(default=DEFAULT_LINK_SYSTEM_PROMPT, min_length=1)
    # Must be non-empty; the prefix check on id values is done by the consumer.
    id_prefix: str = Field(default="link_", min_length=1)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class TextExtractSpan(BaseModel):
    """
    Extracted span of text.

    NOTE(review): whether ``end_char`` is exclusive is not established by this model;
    confirm against the code that builds these spans.

    :param index: One-based index of the span in the output order.
    :type index: int
    :param start_char: Start character offset in the original text.
    :type start_char: int
    :param end_char: End character offset in the original text.
    :type end_char: int
    :param text: Span text.
    :type text: str
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    index: int = Field(ge=1)
    start_char: int = Field(ge=0)
    end_char: int = Field(ge=0)
    text: str
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
class TextSliceSegment(BaseModel):
    """
    Extracted text slice.

    NOTE(review): whether ``end_char`` is exclusive is not established by this model;
    confirm against the code that builds these segments.

    :param index: One-based index of the slice in the output order.
    :type index: int
    :param start_char: Start character offset in the original text.
    :type start_char: int
    :param end_char: End character offset in the original text.
    :type end_char: int
    :param text: Slice text.
    :type text: str
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    index: int = Field(ge=1)
    start_char: int = Field(ge=0)
    end_char: int = Field(ge=0)
    text: str
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
class TextExtractResult(BaseModel):
    """
    Text extract output bundle.

    :param marked_up_text: Original text with XML span tags inserted.
    :type marked_up_text: str
    :param spans: Extracted spans in document order. Defaults to an empty list.
    :type spans: list[TextExtractSpan]
    :param warnings: Warning messages for the caller. Defaults to an empty list.
    :type warnings: list[str]
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    marked_up_text: str
    spans: List[TextExtractSpan] = Field(default_factory=list)
    warnings: List[str] = Field(default_factory=list)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class TextSliceResult(BaseModel):
    """
    Text slice output bundle.

    :param marked_up_text: Original text with slice markers inserted.
    :type marked_up_text: str
    :param slices: Extracted slices in document order. Defaults to an empty list.
    :type slices: list[TextSliceSegment]
    :param warnings: Warning messages for the caller. Defaults to an empty list.
    :type warnings: list[str]
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    marked_up_text: str
    slices: List[TextSliceSegment] = Field(default_factory=list)
    warnings: List[str] = Field(default_factory=list)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class TextAnnotateResult(BaseModel):
    """
    Text annotation output bundle.

    :param marked_up_text: Original text with XML span tags inserted.
    :type marked_up_text: str
    :param spans: Extracted spans in document order. Defaults to an empty list.
    :type spans: list[TextAnnotatedSpan]
    :param warnings: Warning messages for the caller. Defaults to an empty list.
    :type warnings: list[str]
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    marked_up_text: str
    spans: List[TextAnnotatedSpan] = Field(default_factory=list)
    warnings: List[str] = Field(default_factory=list)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class TextRedactResult(BaseModel):
    """
    Text redaction output bundle.

    :param marked_up_text: Original text with XML span tags inserted.
    :type marked_up_text: str
    :param spans: Redacted spans in document order. Defaults to an empty list.
    :type spans: list[TextAnnotatedSpan]
    :param warnings: Warning messages for the caller. Defaults to an empty list.
    :type warnings: list[str]
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    marked_up_text: str
    spans: List[TextAnnotatedSpan] = Field(default_factory=list)
    warnings: List[str] = Field(default_factory=list)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
class TextLinkResult(BaseModel):
    """
    Text linking output bundle.

    :param marked_up_text: Original text with XML span tags inserted.
    :type marked_up_text: str
    :param spans: Linked spans in document order. Defaults to an empty list.
    :type spans: list[TextAnnotatedSpan]
    :param warnings: Warning messages for the caller. Defaults to an empty list.
    :type warnings: list[str]
    """

    # Unknown fields are rejected instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    marked_up_text: str
    spans: List[TextAnnotatedSpan] = Field(default_factory=list)
    warnings: List[str] = Field(default_factory=list)
|
biblicus/text/prompts.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Default prompts for Biblicus text utilities.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
# Default user-facing prompt; it deliberately contains no {text} placeholder.
DEFAULT_PROMPT_TEMPLATE = "Return the requested text."
|
|
8
|
+
|
|
9
|
+
# Default system prompt for text extract: the model wraps the requested text in
# plain <span>...</span> tags using str_replace edits. {text} marks where the
# current text goes.
# NOTE(review): the rules require new_str to contain the inserted <span> tags and
# also forbid <span> inside new_str; confirm the intended wording.
DEFAULT_EXTRACT_SYSTEM_PROMPT = (
    "You are a virtual file editor. Use the available tools to edit the text.\n"
    "Interpret the word 'return' in the user's request as: wrap the returned text with "
    "<span>...</span> in-place in the current text.\n\n"
    "Use the str_replace tool to insert <span>...</span> tags and the done tool when finished.\n"
    "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
    "Rules:\n"
    "- Use str_replace only.\n"
    "- old_str must match exactly once in the current text.\n"
    "- When choosing old_str, copy the exact substring (including punctuation/case) from the current text.\n"
    "- old_str and new_str must be non-empty strings.\n"
    "- new_str must be identical to old_str with only <span> and </span> inserted.\n"
    "- Do not include <span> or </span> inside old_str or new_str.\n"
    "- Do not insert nested spans.\n"
    "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
    "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
    "- Do not delete, reorder, paraphrase, or label text.\n\n"
    "Current text:\n---\n{text}\n---\n"
)
|
|
28
|
+
|
|
29
|
+
# Default system prompt for text slice: the model inserts <slice/> markers using
# str_replace edits. {text} marks where the current text goes.
# NOTE(review): the rules require new_str to contain the inserted <slice/> marker
# and also forbid <slice/> inside new_str; confirm the intended wording.
DEFAULT_SLICE_SYSTEM_PROMPT = (
    "You are a virtual file editor. Use the available tools to edit the text.\n"
    "Interpret the word 'return' in the user's request as: insert <slice/> markers in-place in the current text.\n\n"
    "Use the str_replace tool to insert <slice/> markers and the done tool when finished.\n"
    "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
    "Rules:\n"
    "- Use str_replace only.\n"
    "- old_str must match exactly once in the current text.\n"
    "- old_str and new_str must be non-empty strings.\n"
    "- new_str must be identical to old_str with only <slice/> inserted.\n"
    "- Do not include <slice/> inside old_str or new_str.\n"
    "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
    "- If a tool call fails, read the error and keep editing. Do not call done until markers are inserted.\n"
    "- Do not delete, reorder, or paraphrase text.\n\n"
    "Current text:\n---\n{text}\n---\n"
)
|
|
45
|
+
|
|
46
|
+
# Default system prompt for text annotate: the model wraps text in
# <span ATTRIBUTE="VALUE"> tags. {text} marks where the current text goes.
# NOTE(review): "{{ allowed_attributes }}" uses a different placeholder style from
# {text}; how and where it is substituted is not visible here, so confirm.
DEFAULT_ANNOTATE_SYSTEM_PROMPT = (
    "You are a virtual file editor. Use the available tools to edit the text.\n"
    "Interpret the user's request as: wrap the requested text with "
    '<span ATTRIBUTE="VALUE">...</span> in-place in the current text.\n'
    "Each span must include exactly one attribute from: {{ allowed_attributes }}.\n\n"
    "Use the str_replace tool to insert span tags and the done tool when finished.\n"
    "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
    "Rules:\n"
    "- Use str_replace only.\n"
    "- old_str must match exactly once in the current text.\n"
    "- old_str and new_str must be non-empty strings.\n"
    "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
    "- Do not include <span or </span> inside old_str or new_str.\n"
    "- Do not insert nested spans.\n"
    "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
    "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
    "- Do not delete, reorder, paraphrase, or label text beyond the span attributes.\n\n"
    "Current text:\n---\n{text}\n---\n"
)
|
|
65
|
+
|
|
66
|
+
# Default system prompt for text link: first mentions get an id attribute and
# repeats get a ref attribute. {text} marks where the current text goes.
# NOTE(review): "{{ id_prefix }}" uses a different placeholder style from {text};
# how and where it is substituted is not visible here, so confirm.
DEFAULT_LINK_SYSTEM_PROMPT = (
    "You are a virtual file editor. Use the available tools to edit the text.\n"
    "Interpret the word 'return' in the user's request as: wrap the returned text with "
    '<span ATTRIBUTE="VALUE">...</span> in-place in the current text.\n'
    "Each span must include exactly one attribute: id for first mentions and ref for repeats.\n"
    "Id values must start with '{{ id_prefix }}'.\n\n"
    "Linking rules:\n"
    "- For each distinct name or entity, assign exactly one id on its first occurrence.\n"
    "- Use ref on every subsequent occurrence of the same name or entity.\n"
    "- Wrap only the repeated name or entity text itself (no extra surrounding words).\n"
    "- Reuse the same id/ref value for identical names; do not create multiple ids for the same name.\n"
    "- Use ids in order of first appearance ({{ id_prefix }}1, {{ id_prefix }}2, ...).\n"
    "- Do not call done until every repeated name or entity in the text is wrapped.\n"
    "- If a name appears multiple times, there must be one id and refs for every later occurrence.\n\n"
    "Use the str_replace tool to insert span tags and the done tool when finished.\n"
    "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
    "Rules:\n"
    "- Use str_replace only.\n"
    "- old_str must match exactly once in the current text.\n"
    "- old_str and new_str must be non-empty strings.\n"
    "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
    "- Do not include <span or </span> inside old_str or new_str.\n"
    "- Do not insert nested spans.\n"
    "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
    "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
    "- Do not delete, reorder, or paraphrase text.\n\n"
    "Current text:\n---\n{text}\n---\n"
)
|
|
94
|
+
|
|
95
|
+
# Default system prompt for text redact: the model wraps text in <span> tags,
# optionally with a redact attribute. {text} marks where the current text goes.
# NOTE(review): "{{ redaction_types }}" uses a different placeholder style from
# {text}; how and where it is substituted is not visible here, so confirm.
DEFAULT_REDACT_SYSTEM_PROMPT = (
    "You are a virtual file editor. Use the available tools to edit the text.\n"
    "Interpret the word 'return' in the user's request as: wrap the returned text with "
    "<span>...</span> in-place in the current text.\n"
    "If redaction types are provided, use a redact attribute with one of: {{ redaction_types }}.\n\n"
    "Use the str_replace tool to insert span tags and the done tool when finished.\n"
    "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
    "Rules:\n"
    "- Use str_replace only.\n"
    "- old_str must match exactly once in the current text.\n"
    "- old_str and new_str must be non-empty strings.\n"
    "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
    "- Do not include <span or </span> inside old_str or new_str.\n"
    "- Do not insert nested spans.\n"
    "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
    "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
    "- Do not delete, reorder, or paraphrase text.\n\n"
    "Current text:\n---\n{text}\n---\n"
)
|