biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +2 -2
- biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus/ai/__init__.py +39 -0
- biblicus/ai/embeddings.py +114 -0
- biblicus/ai/llm.py +138 -0
- biblicus/ai/models.py +226 -0
- biblicus/analysis/__init__.py +5 -2
- biblicus/analysis/markov.py +1624 -0
- biblicus/analysis/models.py +754 -1
- biblicus/analysis/topic_modeling.py +98 -19
- biblicus/backends/hybrid.py +6 -1
- biblicus/backends/sqlite_full_text_search.py +4 -2
- biblicus/cli.py +118 -23
- biblicus/context.py +2 -2
- biblicus/recipes.py +136 -0
- biblicus/text/__init__.py +43 -0
- biblicus/text/annotate.py +222 -0
- biblicus/text/extract.py +210 -0
- biblicus/text/link.py +519 -0
- biblicus/text/markup.py +200 -0
- biblicus/text/models.py +319 -0
- biblicus/text/prompts.py +113 -0
- biblicus/text/redact.py +229 -0
- biblicus/text/slice.py +155 -0
- biblicus/text/tool_loop.py +334 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/METADATA +90 -26
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/RECORD +32 -17
- biblicus/analysis/llm.py +0 -106
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/WHEEL +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/top_level.txt +0 -0
biblicus/text/redact.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agentic text redaction using virtual file edits.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Iterable, List, Sequence
|
|
8
|
+
|
|
9
|
+
from jinja2 import Environment, StrictUndefined
|
|
10
|
+
|
|
11
|
+
from .markup import (
|
|
12
|
+
TextAnnotatedSpan,
|
|
13
|
+
build_span_context_section,
|
|
14
|
+
parse_span_markup,
|
|
15
|
+
strip_span_tags,
|
|
16
|
+
)
|
|
17
|
+
from .models import TextRedactRequest, TextRedactResult
|
|
18
|
+
from .tool_loop import request_confirmation, run_tool_loop
|
|
19
|
+
|
|
20
|
+
# Redaction type labels named as defaults. Nothing in this module reads the list,
# so it is presumably consumed by callers or request models (verify before changing).
DEFAULT_REDACTION_TYPES = ["pii", "pci", "phi"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def apply_text_redact(request: TextRedactRequest) -> TextRedactResult:
    """
    Mark up redaction spans in a text by driving a language-model edit loop.

    The tool loop runs over ``request.text``. If it ends with the text unchanged, a
    confirmation request (capped at two rounds) asks the model whether an empty result is
    intended. The final markup is then checked against the source text and the allowed
    redaction types.

    :param request: Text redact request.
    :type request: TextRedactRequest
    :return: Marked-up text, parsed spans, and the warnings collected along the way.
    :rtype: TextRedactResult
    :raises ValueError: If the tool loop reports an error, the source text was altered, or a
        span fails redaction-type validation.
    """
    warnings: List[str] = []
    redaction_types = _resolve_redaction_types(request.redaction_types)
    system_prompt = _render_system_prompt(
        request.system_prompt,
        redaction_types=redaction_types,
    )

    # A mock markup short-circuits the model entirely.
    mock_text = request.mock_marked_up_text
    if mock_text is not None:
        return _build_mock_result(request, mock_text, redaction_types=redaction_types)

    def check_markup(current_text: str) -> List[str]:
        return _validate_redaction_markup(current_text, redaction_types)

    def retry_message(errors: Sequence[str], current_text: str) -> str:
        return _build_retry_message(errors, current_text, redaction_types)

    result = run_tool_loop(
        text=request.text,
        client=request.client,
        system_prompt=system_prompt,
        prompt_template=request.prompt_template,
        max_rounds=request.max_rounds,
        max_edits_per_round=request.max_edits_per_round,
        apply_str_replace=_apply_redact_replace,
        validate_text=check_markup,
        build_retry_message=retry_message,
    )

    if not result.done:
        if result.last_error:
            raise ValueError(f"Text redact failed: {result.last_error}")
        warnings.append("Text redact reached max rounds without done=true")

    final_text = result.text
    needs_confirmation = result.text == request.text

    if needs_confirmation:
        if result.last_error:
            raise ValueError(result.last_error)
        confirmation = request_confirmation(
            result=result,
            text=result.text,
            client=request.client,
            system_prompt=system_prompt,
            prompt_template=request.prompt_template,
            max_rounds=2,
            max_edits_per_round=request.max_edits_per_round,
            apply_str_replace=_apply_redact_replace,
            confirmation_message=_build_empty_confirmation_message(result.text),
            validate_text=check_markup,
            build_retry_message=retry_message,
        )
        if not confirmation.done:
            if confirmation.last_error:
                raise ValueError(f"Text redact failed: {confirmation.last_error}")
            warnings.append("Text redact confirmation reached max rounds without done=true")
        final_text = confirmation.text

    _validate_preserved_text(original=request.text, marked_up=final_text)
    spans = parse_span_markup(final_text)
    validation_errors = _validate_redaction_spans(spans, redaction_types)
    if validation_errors:
        raise ValueError("; ".join(validation_errors))

    # The empty-result warning is only emitted on the confirmation path.
    if needs_confirmation and not spans:
        warnings.append("Text redact returned no spans; model confirmed empty result")

    return TextRedactResult(marked_up_text=final_text, spans=spans, warnings=warnings)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _build_mock_result(
    request: TextRedactRequest,
    marked_up_text: str,
    *,
    redaction_types: Sequence[str] | None,
) -> TextRedactResult:
    """
    Build a redact result from pre-supplied markup instead of calling a model.

    :param request: Text redact request providing the source text.
    :type request: TextRedactRequest
    :param marked_up_text: Markup to treat as the model output.
    :type marked_up_text: str
    :param redaction_types: Allowed redaction types, or ``None`` when types are disabled.
    :type redaction_types: Sequence[str] | None
    :return: Result carrying the markup, its parsed spans, and no warnings.
    :rtype: TextRedactResult
    :raises ValueError: If the markup equals the source text, alters the source text, or
        contains spans that fail validation.
    """
    source_text = request.text
    if source_text == marked_up_text:
        raise ValueError("Text redact produced no spans")

    _validate_preserved_text(original=source_text, marked_up=marked_up_text)
    spans = parse_span_markup(marked_up_text)
    problems = _validate_redaction_spans(spans, redaction_types)
    if not problems:
        return TextRedactResult(marked_up_text=marked_up_text, spans=spans, warnings=[])
    raise ValueError("; ".join(problems))
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _resolve_redaction_types(redaction_types: Sequence[str] | None) -> List[str] | None:
|
|
131
|
+
if redaction_types is None or len(redaction_types) == 0:
|
|
132
|
+
return None
|
|
133
|
+
return [value for value in redaction_types]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _render_system_prompt(template: str, *, redaction_types: Sequence[str] | None) -> str:
    """
    Render the system prompt template with the redaction types.

    The template is rendered with ``StrictUndefined``, so referencing a variable other than
    ``redaction_types`` fails instead of rendering as empty.

    :param template: Jinja2 template source for the system prompt.
    :type template: str
    :param redaction_types: Allowed redaction types; ``None`` is rendered as an empty list.
    :type redaction_types: Sequence[str] | None
    :return: Rendered system prompt.
    :rtype: str
    """
    compiled = Environment(undefined=StrictUndefined).from_string(template)
    type_list = [] if redaction_types is None else list(redaction_types)
    return compiled.render(redaction_types=type_list)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _apply_redact_replace(text: str, old_str: str, new_str: str) -> str:
|
|
145
|
+
occurrences = text.count(old_str)
|
|
146
|
+
if occurrences == 0:
|
|
147
|
+
raise ValueError("Text redact replacement old_str not found")
|
|
148
|
+
if occurrences > 1:
|
|
149
|
+
raise ValueError("Text redact replacement old_str is not unique")
|
|
150
|
+
_validate_replace_text(old_str, new_str)
|
|
151
|
+
return text.replace(old_str, new_str, 1)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _validate_replace_text(old_str: str, new_str: str) -> None:
    """
    Ensure a replacement differs from the text it replaces only by span tags.

    :param old_str: Text being replaced.
    :type old_str: str
    :param new_str: Replacement text.
    :type new_str: str
    :raises ValueError: If the two strings differ once span tags are stripped.
    """
    before = strip_span_tags(old_str)
    after = strip_span_tags(new_str)
    if before == after:
        return
    raise ValueError("Text redact replacements may only insert span tags")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _validate_preserved_text(*, original: str, marked_up: str) -> None:
    """
    Ensure the marked-up text still contains exactly the original text.

    :param original: Source text before any edits.
    :type original: str
    :param marked_up: Text after edits, including span tags.
    :type marked_up: str
    :raises ValueError: If stripping span tags does not reproduce the original text.
    """
    untagged = strip_span_tags(marked_up)
    if untagged == original:
        return
    raise ValueError("Text redact edits modified the source text")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _validate_redaction_markup(
    marked_up_text: str, redaction_types: Sequence[str] | None
) -> List[str]:
    """
    Validate marked-up text and report problems as messages instead of raising.

    :param marked_up_text: Text containing span markup.
    :type marked_up_text: str
    :param redaction_types: Allowed redaction types, or ``None`` when types are disabled.
    :type redaction_types: Sequence[str] | None
    :return: The parse error as a single message, otherwise the span validation errors.
    :rtype: List[str]
    """
    try:
        parsed_spans = parse_span_markup(marked_up_text)
    except ValueError as parse_error:
        return [str(parse_error)]
    else:
        return _validate_redaction_spans(parsed_spans, redaction_types)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _validate_redaction_spans(
|
|
175
|
+
spans: Iterable[TextAnnotatedSpan], redaction_types: Sequence[str] | None
|
|
176
|
+
) -> List[str]:
|
|
177
|
+
errors: List[str] = []
|
|
178
|
+
if redaction_types is None:
|
|
179
|
+
for span in spans:
|
|
180
|
+
if span.attributes:
|
|
181
|
+
errors.append(
|
|
182
|
+
f"Span {span.index} contains attributes but redaction types are disabled"
|
|
183
|
+
)
|
|
184
|
+
return errors
|
|
185
|
+
|
|
186
|
+
allowed_values = set(redaction_types)
|
|
187
|
+
for span in spans:
|
|
188
|
+
if len(span.attributes) != 1:
|
|
189
|
+
errors.append(f"Span {span.index} must include exactly one redact attribute")
|
|
190
|
+
continue
|
|
191
|
+
name, value = next(iter(span.attributes.items()))
|
|
192
|
+
if name != "redact":
|
|
193
|
+
errors.append(f"Span {span.index} uses attribute '{name}' but only 'redact' is allowed")
|
|
194
|
+
if value not in allowed_values:
|
|
195
|
+
errors.append(
|
|
196
|
+
f"Span {span.index} uses redaction type '{value}'. Allowed types: {', '.join(redaction_types)}"
|
|
197
|
+
)
|
|
198
|
+
return errors
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _build_retry_message(
    errors: Sequence[str], current_text: str, redaction_types: Sequence[str] | None
) -> str:
    """
    Compose the message sent to the model after an edit fails validation.

    :param errors: Validation error messages to list.
    :type errors: Sequence[str]
    :param current_text: Current marked-up text, echoed back to the model.
    :type current_text: str
    :param redaction_types: Allowed redaction types, or ``None`` when types are disabled.
    :type redaction_types: Sequence[str] | None
    :return: Retry instructions including the issues, span context, and current text.
    :rtype: str
    """
    bullet_list = "\n".join(f"- {error}" for error in errors)
    context_section = build_span_context_section(current_text, errors)
    if redaction_types is None:
        type_message = "Do not add attributes."
    else:
        type_message = f"Use a redact attribute with one of: {', '.join(redaction_types)}."

    pieces = [
        "Your last edit did not validate.\n",
        "Issues:\n",
        f"{bullet_list}\n\n",
        f"{context_section}",
        "Please fix the markup using str_replace. ",
        f"{type_message} Try again.\n",
        "Current text:\n",
        f"---\n{current_text}\n---",
    ]
    return "".join(pieces)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _build_empty_confirmation_message(text: str) -> str:
|
|
224
|
+
return (
|
|
225
|
+
"No redaction spans were inserted. If there are truly no spans to return, "
|
|
226
|
+
"call done again without changes. Otherwise insert the appropriate span tags.\n"
|
|
227
|
+
"Current text:\n"
|
|
228
|
+
f"---\n{text}\n---"
|
|
229
|
+
)
|
biblicus/text/slice.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agentic text slicing using virtual file edits.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import List
|
|
9
|
+
|
|
10
|
+
from .models import TextSliceRequest, TextSliceResult, TextSliceSegment
|
|
11
|
+
from .tool_loop import request_confirmation, run_tool_loop
|
|
12
|
+
|
|
13
|
+
# Literal marker inserted at slice boundaries; it is removed again before the
# marked-up text is compared with the source text.
_SLICE_MARKER = "<slice/>"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def apply_text_slice(request: TextSliceRequest) -> TextSliceResult:
    """
    Split a text into slices by driving a language-model edit loop.

    The tool loop runs over ``request.text``. If it ends with the text unchanged, a
    confirmation request (capped at two rounds) asks the model whether a single slice is
    intended. The final markup is then checked against the source text and cut at the
    slice markers.

    :param request: Text slice request.
    :type request: TextSliceRequest
    :return: Marked-up text, extracted slices, and the warnings collected along the way.
    :rtype: TextSliceResult
    :raises ValueError: If the tool loop reports an error, the source text was altered, or
        no slices could be extracted.
    """
    # A mock markup short-circuits the model entirely.
    mock_text = request.mock_marked_up_text
    if mock_text is not None:
        return _build_mock_result(request, mock_text)

    warnings: List[str] = []
    result = run_tool_loop(
        text=request.text,
        client=request.client,
        system_prompt=request.system_prompt,
        prompt_template=request.prompt_template,
        max_rounds=request.max_rounds,
        max_edits_per_round=request.max_edits_per_round,
        apply_str_replace=_apply_slice_replace,
    )
    if not result.done:
        if result.last_error:
            raise ValueError(f"Text slice failed: {result.last_error}")
        warnings.append("Text slice reached max rounds without done=true")

    final_text = result.text
    needs_confirmation = result.text == request.text

    if needs_confirmation:
        if result.last_error:
            raise ValueError(result.last_error)
        confirmation = request_confirmation(
            result=result,
            text=result.text,
            client=request.client,
            system_prompt=request.system_prompt,
            prompt_template=request.prompt_template,
            max_rounds=2,
            max_edits_per_round=request.max_edits_per_round,
            apply_str_replace=_apply_slice_replace,
            confirmation_message=_build_empty_confirmation_message(result.text),
        )
        if not confirmation.done:
            if confirmation.last_error:
                raise ValueError(f"Text slice failed: {confirmation.last_error}")
            warnings.append("Text slice confirmation reached max rounds without done=true")
        final_text = confirmation.text

    _validate_preserved_text(original=request.text, marked_up=final_text)
    slices = _extract_slices(marked_up_text=final_text)

    # The single-slice warning is only emitted when the confirmation left the text untouched.
    if needs_confirmation and final_text == request.text:
        warnings.append("Text slice returned no markers; model confirmed single slice")
    if not slices:
        raise ValueError("Text slice produced no slices")

    return TextSliceResult(marked_up_text=final_text, slices=slices, warnings=warnings)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _build_mock_result(request: TextSliceRequest, marked_up_text: str) -> TextSliceResult:
    """
    Build a slice result from pre-supplied markup instead of calling a model.

    :param request: Text slice request providing the source text.
    :type request: TextSliceRequest
    :param marked_up_text: Markup to treat as the model output.
    :type marked_up_text: str
    :return: Result carrying the markup, its extracted slices, and no warnings.
    :rtype: TextSliceResult
    :raises ValueError: If the markup equals the source text or alters the source text.
    """
    source_text = request.text
    if source_text == marked_up_text:
        raise ValueError("Text slice produced no markers")

    _validate_preserved_text(original=source_text, marked_up=marked_up_text)
    segments = _extract_slices(marked_up_text=marked_up_text)
    return TextSliceResult(marked_up_text=marked_up_text, slices=segments, warnings=[])
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _apply_slice_replace(text: str, old_str: str, new_str: str) -> str:
|
|
89
|
+
occurrences = text.count(old_str)
|
|
90
|
+
if occurrences == 0:
|
|
91
|
+
raise ValueError("Text slice replacement old_str not found")
|
|
92
|
+
if occurrences > 1:
|
|
93
|
+
raise ValueError("Text slice replacement old_str is not unique")
|
|
94
|
+
_validate_replace_text(old_str, new_str)
|
|
95
|
+
return text.replace(old_str, new_str, 1)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _validate_replace_text(old_str: str, new_str: str) -> None:
    """
    Ensure a replacement differs from the text it replaces only by slice markers.

    :param old_str: Text being replaced.
    :type old_str: str
    :param new_str: Replacement text.
    :type new_str: str
    :raises ValueError: If the two strings differ once slice markers are stripped.
    """
    before = _strip_slice_markers(old_str)
    after = _strip_slice_markers(new_str)
    if before == after:
        return
    raise ValueError("Text slice replacements may only insert slice markers")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _validate_preserved_text(*, original: str, marked_up: str) -> None:
    """
    Ensure the marked-up text still contains exactly the original text.

    :param original: Source text before any edits.
    :type original: str
    :param marked_up: Text after edits, including slice markers.
    :type marked_up: str
    :raises ValueError: If stripping slice markers does not reproduce the original text.
    """
    unmarked = _strip_slice_markers(marked_up)
    if unmarked == original:
        return
    raise ValueError("Text slice edits modified the source text")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _strip_slice_markers(text: str) -> str:
    """
    Remove every slice marker from a text.

    :param text: Text that may contain slice markers.
    :type text: str
    :return: The text with all slice markers removed.
    :rtype: str
    """
    pieces = text.split(_SLICE_MARKER)
    return "".join(pieces)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _extract_slices(*, marked_up_text: str) -> List[TextSliceSegment]:
    """
    Cut marked-up text at the slice markers into numbered segments.

    Offsets are measured in the text without markers. Empty pieces (adjacent markers, or a
    marker at either end) produce no segment.

    :param marked_up_text: Text containing slice markers.
    :type marked_up_text: str
    :return: Segments in document order, indexed from 1, each with start and end offsets.
    :rtype: List[TextSliceSegment]
    """
    segments: List[TextSliceSegment] = []
    offset = 0  # position in the marker-free text
    for piece in marked_up_text.split(_SLICE_MARKER):
        if not piece:
            continue
        piece_end = offset + len(piece)
        segments.append(
            TextSliceSegment(
                index=len(segments) + 1,
                start_char=offset,
                end_char=piece_end,
                text=piece,
            )
        )
        offset = piece_end
    return segments
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _build_empty_confirmation_message(text: str) -> str:
|
|
149
|
+
return (
|
|
150
|
+
"No slice markers were inserted. If the text should remain a single slice, "
|
|
151
|
+
"call done again without changes. Otherwise insert <slice/> markers at the "
|
|
152
|
+
"boundaries of the requested slices.\n"
|
|
153
|
+
"Current text:\n"
|
|
154
|
+
f"---\n{text}\n---"
|
|
155
|
+
)
|