biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ """
2
+ Agentic text redaction using virtual file edits.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Iterable, List, Sequence
8
+
9
+ from jinja2 import Environment, StrictUndefined
10
+
11
+ from .markup import (
12
+ TextAnnotatedSpan,
13
+ build_span_context_section,
14
+ parse_span_markup,
15
+ strip_span_tags,
16
+ )
17
+ from .models import TextRedactRequest, TextRedactResult
18
+ from .tool_loop import request_confirmation, run_tool_loop
19
+
20
+ DEFAULT_REDACTION_TYPES = ["pii", "pci", "phi"]
21
+
22
+
23
def apply_text_redact(request: TextRedactRequest) -> TextRedactResult:
    """
    Mark up redaction spans in the request text by driving the model tool loop.

    When the loop leaves the text unchanged, a confirmation exchange is run so the
    model can either add spans or confirm that nothing needs redacting.

    :param request: Text redact request.
    :type request: TextRedactRequest
    :return: Marked-up text, the parsed spans, and any accumulated warnings.
    :rtype: TextRedactResult
    :raises ValueError: If the loop or the confirmation reports an error, if span
        validation fails, or if the marked-up text no longer matches the source text.
    """
    collected_warnings: List[str] = []
    allowed_types = _resolve_redaction_types(request.redaction_types)
    # The prompt is rendered before the mock short-circuit, so a template problem
    # surfaces on the mock path as well.
    rendered_prompt = _render_system_prompt(
        request.system_prompt,
        redaction_types=allowed_types,
    )

    mock_text = request.mock_marked_up_text
    if mock_text is not None:
        return _build_mock_result(request, mock_text, redaction_types=allowed_types)

    def check_markup(current_text: str) -> List[str]:
        return _validate_redaction_markup(current_text, allowed_types)

    def retry_message(errors: Sequence[str], current_text: str) -> str:
        return _build_retry_message(errors, current_text, allowed_types)

    loop_result = run_tool_loop(
        text=request.text,
        client=request.client,
        system_prompt=rendered_prompt,
        prompt_template=request.prompt_template,
        max_rounds=request.max_rounds,
        max_edits_per_round=request.max_edits_per_round,
        apply_str_replace=_apply_redact_replace,
        validate_text=check_markup,
        build_retry_message=retry_message,
    )

    if not loop_result.done:
        if loop_result.last_error:
            raise ValueError(f"Text redact failed: {loop_result.last_error}")
        collected_warnings.append("Text redact reached max rounds without done=true")

    final_text = loop_result.text
    # An untouched text means no spans were inserted; ask the model to confirm.
    needs_confirmation = final_text == request.text
    if needs_confirmation:
        if loop_result.last_error:
            raise ValueError(loop_result.last_error)
        confirmation = request_confirmation(
            result=loop_result,
            text=loop_result.text,
            client=request.client,
            system_prompt=rendered_prompt,
            prompt_template=request.prompt_template,
            max_rounds=2,
            max_edits_per_round=request.max_edits_per_round,
            apply_str_replace=_apply_redact_replace,
            confirmation_message=_build_empty_confirmation_message(loop_result.text),
            validate_text=check_markup,
            build_retry_message=retry_message,
        )
        if not confirmation.done:
            if confirmation.last_error:
                raise ValueError(f"Text redact failed: {confirmation.last_error}")
            collected_warnings.append(
                "Text redact confirmation reached max rounds without done=true"
            )
        final_text = confirmation.text

    # Both paths share the same final checks on whichever text was produced last.
    _validate_preserved_text(original=request.text, marked_up=final_text)
    spans = parse_span_markup(final_text)
    span_errors = _validate_redaction_spans(spans, allowed_types)
    if span_errors:
        raise ValueError("; ".join(span_errors))
    if needs_confirmation and not spans:
        collected_warnings.append(
            "Text redact returned no spans; model confirmed empty result"
        )
    return TextRedactResult(
        marked_up_text=final_text,
        spans=spans,
        warnings=collected_warnings,
    )
112
+
113
+
114
def _build_mock_result(
    request: TextRedactRequest,
    marked_up_text: str,
    *,
    redaction_types: Sequence[str] | None,
) -> TextRedactResult:
    """
    Build a redact result from caller-supplied marked-up text without calling a model.

    :param request: Text redact request whose ``text`` is the unmodified source.
    :type request: TextRedactRequest
    :param marked_up_text: Pre-built marked-up text to validate and parse.
    :type marked_up_text: str
    :param redaction_types: Allowed redaction types, or ``None`` when types are disabled.
    :type redaction_types: Sequence[str] | None
    :return: Result carrying the marked-up text, its spans, and no warnings.
    :rtype: TextRedactResult
    :raises ValueError: If the markup equals the source text, alters the source text,
        or contains spans that fail validation.
    """
    source_text = request.text
    if marked_up_text == source_text:
        raise ValueError("Text redact produced no spans")
    _validate_preserved_text(original=source_text, marked_up=marked_up_text)
    parsed_spans = parse_span_markup(marked_up_text)
    problems = _validate_redaction_spans(parsed_spans, redaction_types)
    if problems:
        raise ValueError("; ".join(problems))
    return TextRedactResult(
        marked_up_text=marked_up_text,
        spans=parsed_spans,
        warnings=[],
    )
128
+
129
+
130
+ def _resolve_redaction_types(redaction_types: Sequence[str] | None) -> List[str] | None:
131
+ if redaction_types is None or len(redaction_types) == 0:
132
+ return None
133
+ return [value for value in redaction_types]
134
+
135
+
136
def _render_system_prompt(template: str, *, redaction_types: Sequence[str] | None) -> str:
    """
    Render the system prompt template with the redaction types exposed to it.

    :param template: Jinja2 template source for the system prompt.
    :type template: str
    :param redaction_types: Allowed redaction types; ``None`` is passed to the
        template as an empty list.
    :type redaction_types: Sequence[str] | None
    :return: Rendered system prompt.
    :rtype: str
    """
    # StrictUndefined is configured on the environment; per the Jinja2 docs this makes
    # use of an undefined template variable an error instead of rendering it empty.
    environment = Environment(undefined=StrictUndefined)
    compiled = environment.from_string(template)
    type_list = [] if redaction_types is None else list(redaction_types)
    return compiled.render(redaction_types=type_list)
142
+
143
+
144
+ def _apply_redact_replace(text: str, old_str: str, new_str: str) -> str:
145
+ occurrences = text.count(old_str)
146
+ if occurrences == 0:
147
+ raise ValueError("Text redact replacement old_str not found")
148
+ if occurrences > 1:
149
+ raise ValueError("Text redact replacement old_str is not unique")
150
+ _validate_replace_text(old_str, new_str)
151
+ return text.replace(old_str, new_str, 1)
152
+
153
+
154
def _validate_replace_text(old_str: str, new_str: str) -> None:
    """
    Reject a replacement whose tag-stripped text differs from the original fragment.

    :param old_str: Fragment being replaced.
    :type old_str: str
    :param new_str: Proposed replacement fragment.
    :type new_str: str
    :raises ValueError: If the two fragments differ after ``strip_span_tags``.
    """
    before = strip_span_tags(old_str)
    after = strip_span_tags(new_str)
    if before == after:
        return
    raise ValueError("Text redact replacements may only insert span tags")
157
+
158
+
159
def _validate_preserved_text(*, original: str, marked_up: str) -> None:
    """
    Ensure the marked-up text reduces to the original once span tags are stripped.

    :param original: Unmodified source text.
    :type original: str
    :param marked_up: Text after the edits were applied.
    :type marked_up: str
    :raises ValueError: If the stripped markup does not equal ``original``.
    """
    recovered = strip_span_tags(marked_up)
    if recovered == original:
        return
    raise ValueError("Text redact edits modified the source text")
162
+
163
+
164
def _validate_redaction_markup(
    marked_up_text: str, redaction_types: Sequence[str] | None
) -> List[str]:
    """
    Parse the markup and collect validation messages instead of raising.

    :param marked_up_text: Text containing span markup.
    :type marked_up_text: str
    :param redaction_types: Allowed redaction types, or ``None`` when types are disabled.
    :type redaction_types: Sequence[str] | None
    :return: Error messages; empty when the markup parses and every span validates.
    :rtype: List[str]
    """
    try:
        parsed = parse_span_markup(marked_up_text)
    except ValueError as parse_error:
        # A parse failure is reported as the single error message.
        return [str(parse_error)]
    else:
        return _validate_redaction_spans(parsed, redaction_types)
172
+
173
+
174
+ def _validate_redaction_spans(
175
+ spans: Iterable[TextAnnotatedSpan], redaction_types: Sequence[str] | None
176
+ ) -> List[str]:
177
+ errors: List[str] = []
178
+ if redaction_types is None:
179
+ for span in spans:
180
+ if span.attributes:
181
+ errors.append(
182
+ f"Span {span.index} contains attributes but redaction types are disabled"
183
+ )
184
+ return errors
185
+
186
+ allowed_values = set(redaction_types)
187
+ for span in spans:
188
+ if len(span.attributes) != 1:
189
+ errors.append(f"Span {span.index} must include exactly one redact attribute")
190
+ continue
191
+ name, value = next(iter(span.attributes.items()))
192
+ if name != "redact":
193
+ errors.append(f"Span {span.index} uses attribute '{name}' but only 'redact' is allowed")
194
+ if value not in allowed_values:
195
+ errors.append(
196
+ f"Span {span.index} uses redaction type '{value}'. Allowed types: {', '.join(redaction_types)}"
197
+ )
198
+ return errors
199
+
200
+
201
def _build_retry_message(
    errors: Sequence[str], current_text: str, redaction_types: Sequence[str] | None
) -> str:
    """
    Compose the message sent back to the model after an edit failed validation.

    :param errors: Validation messages to list as bullet points.
    :type errors: Sequence[str]
    :param current_text: Current marked-up text, echoed at the end of the message.
    :type current_text: str
    :param redaction_types: Allowed redaction types, or ``None`` when types are disabled.
    :type redaction_types: Sequence[str] | None
    :return: Retry prompt listing the issues, a context section, and the current text.
    :rtype: str
    """
    bullet_list = "\n".join(f"- {error}" for error in errors)
    span_context = build_span_context_section(current_text, errors)
    if redaction_types is None:
        guidance = "Do not add attributes."
    else:
        guidance = f"Use a redact attribute with one of: {', '.join(redaction_types)}."
    pieces = [
        "Your last edit did not validate.\n",
        "Issues:\n",
        f"{bullet_list}\n\n",
        # The context section is inserted as-is, with no separator added after it.
        f"{span_context}",
        "Please fix the markup using str_replace. ",
        f"{guidance} Try again.\n",
        "Current text:\n",
        f"---\n{current_text}\n---",
    ]
    return "".join(pieces)
221
+
222
+
223
+ def _build_empty_confirmation_message(text: str) -> str:
224
+ return (
225
+ "No redaction spans were inserted. If there are truly no spans to return, "
226
+ "call done again without changes. Otherwise insert the appropriate span tags.\n"
227
+ "Current text:\n"
228
+ f"---\n{text}\n---"
229
+ )
biblicus/text/slice.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ Agentic text slicing using virtual file edits.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ from typing import List
9
+
10
+ from .models import TextSliceRequest, TextSliceResult, TextSliceSegment
11
+ from .tool_loop import request_confirmation, run_tool_loop
12
+
13
+ _SLICE_MARKER = "<slice/>"
14
+
15
+
16
def apply_text_slice(request: TextSliceRequest) -> TextSliceResult:
    """
    Split the request text into slices by driving the model tool loop.

    When the loop leaves the text unchanged, a confirmation exchange is run so the
    model can either add markers or confirm that the text is a single slice.

    :param request: Text slice request.
    :type request: TextSliceRequest
    :return: Marked-up text, the extracted slices, and any accumulated warnings.
    :rtype: TextSliceResult
    :raises ValueError: If the loop or the confirmation reports an error, if the
        marked-up text no longer matches the source text, or if no slices result.
    """
    mock_text = request.mock_marked_up_text
    if mock_text is not None:
        return _build_mock_result(request, mock_text)

    collected_warnings: List[str] = []
    loop_result = run_tool_loop(
        text=request.text,
        client=request.client,
        system_prompt=request.system_prompt,
        prompt_template=request.prompt_template,
        max_rounds=request.max_rounds,
        max_edits_per_round=request.max_edits_per_round,
        apply_str_replace=_apply_slice_replace,
    )
    if not loop_result.done:
        if loop_result.last_error:
            raise ValueError(f"Text slice failed: {loop_result.last_error}")
        collected_warnings.append("Text slice reached max rounds without done=true")

    final_text = loop_result.text
    # An untouched text means no markers were inserted; ask the model to confirm.
    needs_confirmation = final_text == request.text
    if needs_confirmation:
        if loop_result.last_error:
            raise ValueError(loop_result.last_error)
        confirmation = request_confirmation(
            result=loop_result,
            text=loop_result.text,
            client=request.client,
            system_prompt=request.system_prompt,
            prompt_template=request.prompt_template,
            max_rounds=2,
            max_edits_per_round=request.max_edits_per_round,
            apply_str_replace=_apply_slice_replace,
            confirmation_message=_build_empty_confirmation_message(loop_result.text),
        )
        if not confirmation.done:
            if confirmation.last_error:
                raise ValueError(f"Text slice failed: {confirmation.last_error}")
            collected_warnings.append(
                "Text slice confirmation reached max rounds without done=true"
            )
        final_text = confirmation.text

    # Both paths share the same final checks on whichever text was produced last.
    _validate_preserved_text(original=request.text, marked_up=final_text)
    segments = _extract_slices(marked_up_text=final_text)
    if needs_confirmation and final_text == request.text:
        collected_warnings.append(
            "Text slice returned no markers; model confirmed single slice"
        )
    if not segments:
        raise ValueError("Text slice produced no slices")
    return TextSliceResult(
        marked_up_text=final_text,
        slices=segments,
        warnings=collected_warnings,
    )
78
+
79
+
80
def _build_mock_result(request: TextSliceRequest, marked_up_text: str) -> TextSliceResult:
    """
    Build a slice result from caller-supplied marked-up text without calling a model.

    :param request: Text slice request whose ``text`` is the unmodified source.
    :type request: TextSliceRequest
    :param marked_up_text: Pre-built text containing slice markers.
    :type marked_up_text: str
    :return: Result carrying the marked-up text, its slices, and no warnings.
    :rtype: TextSliceResult
    :raises ValueError: If the markup equals the source text or alters it.
    """
    source_text = request.text
    if marked_up_text == source_text:
        raise ValueError("Text slice produced no markers")
    _validate_preserved_text(original=source_text, marked_up=marked_up_text)
    segments = _extract_slices(marked_up_text=marked_up_text)
    return TextSliceResult(
        marked_up_text=marked_up_text,
        slices=segments,
        warnings=[],
    )
86
+
87
+
88
+ def _apply_slice_replace(text: str, old_str: str, new_str: str) -> str:
89
+ occurrences = text.count(old_str)
90
+ if occurrences == 0:
91
+ raise ValueError("Text slice replacement old_str not found")
92
+ if occurrences > 1:
93
+ raise ValueError("Text slice replacement old_str is not unique")
94
+ _validate_replace_text(old_str, new_str)
95
+ return text.replace(old_str, new_str, 1)
96
+
97
+
98
def _validate_replace_text(old_str: str, new_str: str) -> None:
    """
    Reject a replacement whose marker-stripped text differs from the original fragment.

    :param old_str: Fragment being replaced.
    :type old_str: str
    :param new_str: Proposed replacement fragment.
    :type new_str: str
    :raises ValueError: If the fragments differ once slice markers are removed.
    """
    before = _strip_slice_markers(old_str)
    after = _strip_slice_markers(new_str)
    if before == after:
        return
    raise ValueError("Text slice replacements may only insert slice markers")
101
+
102
+
103
def _validate_preserved_text(*, original: str, marked_up: str) -> None:
    """
    Ensure the marked-up text reduces to the original once slice markers are removed.

    :param original: Unmodified source text.
    :type original: str
    :param marked_up: Text after the edits were applied.
    :type marked_up: str
    :raises ValueError: If the stripped markup does not equal ``original``.
    """
    recovered = _strip_slice_markers(marked_up)
    if recovered == original:
        return
    raise ValueError("Text slice edits modified the source text")
106
+
107
+
108
def _strip_slice_markers(text: str) -> str:
    """
    Return ``text`` with every slice marker removed.

    :param text: Text that may contain slice markers.
    :type text: str
    :return: Text without any slice markers.
    :rtype: str
    """
    pieces = text.split(_SLICE_MARKER)
    return "".join(pieces)
110
+
111
+
112
def _extract_slices(*, marked_up_text: str) -> List[TextSliceSegment]:
    """
    Split marked-up text at slice markers into ordered, non-empty segments.

    Offsets are measured in the marker-free text: each segment starts where the
    previous one ended, so marker characters never count toward ``start_char`` or
    ``end_char``. Empty chunks (leading, trailing, or adjacent markers) are skipped
    and do not consume an index.

    :param marked_up_text: Text containing zero or more slice markers.
    :type marked_up_text: str
    :return: Segments numbered from 1 in order of appearance.
    :rtype: List[TextSliceSegment]
    """
    segments: List[TextSliceSegment] = []
    offset = 0
    # Splitting on the literal marker yields every chunk, including the tail after
    # the last marker, so a single loop handles all of them.
    for chunk in marked_up_text.split(_SLICE_MARKER):
        if not chunk:
            continue
        chunk_end = offset + len(chunk)
        segments.append(
            TextSliceSegment(
                index=len(segments) + 1,
                start_char=offset,
                end_char=chunk_end,
                text=chunk,
            )
        )
        offset = chunk_end
    return segments
146
+
147
+
148
+ def _build_empty_confirmation_message(text: str) -> str:
149
+ return (
150
+ "No slice markers were inserted. If the text should remain a single slice, "
151
+ "call done again without changes. Otherwise insert <slice/> markers at the "
152
+ "boundaries of the requested slices.\n"
153
+ "Current text:\n"
154
+ f"---\n{text}\n---"
155
+ )