biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/text/link.py ADDED
@@ -0,0 +1,519 @@
1
+ """
2
+ Agentic text linking using virtual file edits.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ from collections import Counter
9
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
10
+
11
+ from jinja2 import Environment, StrictUndefined
12
+
13
+ from .markup import (
14
+ TextAnnotatedSpan,
15
+ build_span_context_section,
16
+ parse_span_markup,
17
+ strip_span_tags,
18
+ )
19
+ from .models import TextLinkRequest, TextLinkResult
20
+ from .tool_loop import request_confirmation, run_tool_loop
21
+
22
# Default prefix for link ids. Nothing in this module reads it directly; presumably it is the
# fallback for ``TextLinkRequest.id_prefix`` -- TODO confirm against the models module.
DEFAULT_LINK_ID_PREFIX = "link_"
23
+
24
+
25
def apply_text_link(request: TextLinkRequest) -> TextLinkResult:
    """
    Link repeated mentions in a text by letting a language model insert span tags.

    The model edits the text through the tool loop. The outcome is then validated, and two
    fallbacks exist: a coverage recovery pass when the loop stops on an error, and a
    confirmation round when the model leaves the text untouched.

    :param request: Text link request.
    :type request: TextLinkRequest
    :return: Text link result.
    :rtype: TextLinkResult
    :raises ValueError: If model output is invalid or text is modified. Empty outputs trigger
        a confirmation round and return a warning when confirmed.
    """
    collected_warnings: List[str] = []
    rendered_prompt = _render_system_prompt(
        request.system_prompt,
        id_prefix=request.id_prefix,
    )

    # A mocked markup short-circuits the model entirely.
    if request.mock_marked_up_text is not None:
        return _build_mock_result(request, request.mock_marked_up_text)

    # Both the main loop and the confirmation round use the same validation hooks.
    def check_markup(current_text: str) -> List[str]:
        return _validate_link_markup(current_text, request.id_prefix)

    def build_retry(errors: Sequence[str], current_text: str) -> str:
        return _build_retry_message(errors, current_text, request.id_prefix)

    loop_result = run_tool_loop(
        text=request.text,
        client=request.client,
        system_prompt=rendered_prompt,
        prompt_template=request.prompt_template,
        max_rounds=request.max_rounds,
        max_edits_per_round=request.max_edits_per_round,
        apply_str_replace=_apply_link_replace,
        validate_text=check_markup,
        build_retry_message=build_retry,
    )

    if not loop_result.done:
        if loop_result.last_error:
            # The loop stopped on an error: try to repair missing ref coverage before failing.
            recovered = _attempt_missing_coverage_recovery(
                marked_up_text=loop_result.text,
                id_prefix=request.id_prefix,
                warnings=collected_warnings,
            )
            if recovered is None:
                raise ValueError(f"Text link failed: {loop_result.last_error}")
            return recovered
        collected_warnings.append("Text link reached max rounds without done=true")

    if loop_result.text != request.text:
        # The model changed the text: validate it, autofilling ref spans if validation fails.
        _validate_preserved_text(original=request.text, marked_up=loop_result.text)
        parsed_spans = parse_span_markup(loop_result.text)
        span_errors = _validate_link_spans(parsed_spans, request.id_prefix)
        if not span_errors:
            return TextLinkResult(
                marked_up_text=loop_result.text,
                spans=parsed_spans,
                warnings=collected_warnings,
            )
        autofilled = _autofill_ref_spans(loop_result.text, parsed_spans)
        if autofilled is None:
            raise ValueError("; ".join(span_errors))
        filled_text, filled_spans, autofill_warnings = autofilled
        collected_warnings.extend(autofill_warnings)
        remaining_errors = _validate_link_spans(filled_spans, request.id_prefix)
        if remaining_errors:
            raise ValueError("; ".join(remaining_errors))
        return TextLinkResult(
            marked_up_text=filled_text,
            spans=filled_spans,
            warnings=collected_warnings,
        )

    # The model left the text untouched: ask it to confirm that nothing needs linking.
    if loop_result.last_error:
        raise ValueError(loop_result.last_error)
    confirmation = request_confirmation(
        result=loop_result,
        text=loop_result.text,
        client=request.client,
        system_prompt=rendered_prompt,
        prompt_template=request.prompt_template,
        max_rounds=2,
        max_edits_per_round=request.max_edits_per_round,
        apply_str_replace=_apply_link_replace,
        confirmation_message=_build_empty_confirmation_message(loop_result.text),
        validate_text=check_markup,
        build_retry_message=build_retry,
    )
    if not confirmation.done:
        if confirmation.last_error:
            raise ValueError(f"Text link failed: {confirmation.last_error}")
        collected_warnings.append("Text link confirmation reached max rounds without done=true")
    _validate_preserved_text(original=request.text, marked_up=confirmation.text)
    if confirmation.text == request.text:
        collected_warnings.append("Text link returned no spans; model confirmed empty result")
        return TextLinkResult(
            marked_up_text=confirmation.text,
            spans=[],
            warnings=collected_warnings,
        )
    confirmed_spans = parse_span_markup(confirmation.text)
    confirmed_errors = _validate_link_spans(confirmed_spans, request.id_prefix)
    if confirmed_errors:
        raise ValueError("; ".join(confirmed_errors))
    return TextLinkResult(
        marked_up_text=confirmation.text,
        spans=confirmed_spans,
        warnings=collected_warnings,
    )
127
+
128
+
129
def _build_mock_result(
    request: TextLinkRequest,
    marked_up_text: str,
) -> TextLinkResult:
    """
    Build a result from pre-supplied markup instead of calling the model.

    :param request: Text link request providing the source text and id prefix.
    :type request: TextLinkRequest
    :param marked_up_text: Markup to validate and return.
    :type marked_up_text: str
    :return: Text link result with no warnings.
    :rtype: TextLinkResult
    :raises ValueError: If the markup equals the source text, alters the source text, or
        contains invalid link spans.
    """
    if request.text == marked_up_text:
        raise ValueError("Text link produced no spans")
    _validate_preserved_text(original=request.text, marked_up=marked_up_text)
    parsed_spans = parse_span_markup(marked_up_text)
    problems = _validate_link_spans(parsed_spans, request.id_prefix)
    if not problems:
        return TextLinkResult(marked_up_text=marked_up_text, spans=parsed_spans, warnings=[])
    raise ValueError("; ".join(problems))
141
+
142
+
143
def _render_system_prompt(template: str, *, id_prefix: str) -> str:
    """
    Render the system prompt template with the id prefix.

    The environment uses ``StrictUndefined``, so ``id_prefix`` is the only variable the
    template can reference without raising.

    :param template: Jinja2 template source.
    :type template: str
    :param id_prefix: Prefix exposed to the template as ``id_prefix``.
    :type id_prefix: str
    :return: Rendered system prompt.
    :rtype: str
    """
    compiled = Environment(undefined=StrictUndefined).from_string(template)
    return compiled.render(id_prefix=id_prefix)
149
+
150
+
151
def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
    """
    Apply one ``str_replace`` edit that may only add span tags.

    :param text: Current text.
    :type text: str
    :param old_str: Fragment to replace; must occur exactly once in ``text``.
    :type old_str: str
    :param new_str: Replacement fragment.
    :type new_str: str
    :return: Text with the single occurrence replaced.
    :rtype: str
    :raises ValueError: If ``old_str`` is missing or ambiguous, or the replacement changes
        anything other than span tags.
    """
    match_count = text.count(old_str)
    if not match_count:
        raise ValueError("Text link replacement old_str not found")
    if match_count != 1:
        raise ValueError("Text link replacement old_str is not unique")
    # Uniqueness is checked first, so a malformed replacement is only reported for a valid target.
    _validate_replace_text(old_str, new_str)
    return text.replace(old_str, new_str, 1)
159
+
160
+
161
def _validate_replace_text(old_str: str, new_str: str) -> None:
    """
    Ensure a replacement differs from the original fragment only by span tags.

    :param old_str: Fragment being replaced.
    :type old_str: str
    :param new_str: Replacement fragment.
    :type new_str: str
    :raises ValueError: If the two fragments differ once span tags are stripped.
    """
    plain_before = strip_span_tags(old_str)
    plain_after = strip_span_tags(new_str)
    if plain_before == plain_after:
        return
    raise ValueError("Text link replacements may only insert span tags")
164
+
165
+
166
def _validate_preserved_text(*, original: str, marked_up: str) -> None:
    """
    Ensure the marked-up text still contains exactly the original text.

    :param original: Source text before linking.
    :type original: str
    :param marked_up: Text after span tags were inserted.
    :type marked_up: str
    :raises ValueError: If stripping span tags does not yield the original text.
    """
    recovered_text = strip_span_tags(marked_up)
    if recovered_text == original:
        return
    raise ValueError("Text link edits modified the source text")
169
+
170
+
171
def _validate_link_markup(marked_up_text: str, id_prefix: str) -> List[str]:
    """
    Run every link validation on a marked-up text.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param id_prefix: Required prefix for id values.
    :type id_prefix: str
    :return: Validation errors, or the parse error alone when the markup cannot be parsed.
    :rtype: List[str]
    """
    try:
        parsed_spans = parse_span_markup(marked_up_text)
    except ValueError as parse_error:
        return [str(parse_error)]
    # Order matters for the reported messages: span rules, then coverage, then minimality.
    return [
        *_validate_link_spans(parsed_spans, id_prefix),
        *_validate_link_coverage(marked_up_text, parsed_spans),
        *_validate_link_span_minimality(parsed_spans),
    ]
180
+
181
+
182
def _attempt_missing_coverage_recovery(
    *,
    marked_up_text: str,
    id_prefix: str,
    warnings: List[str],
) -> Optional[TextLinkResult]:
    """
    Try to repair markup whose only problems are missing ref spans.

    Ref spans without an id are first promoted to ids. If the remaining errors are all
    coverage errors, the missing ref spans are autofilled and the result is validated again.

    :param marked_up_text: Marked-up text left by the tool loop.
    :type marked_up_text: str
    :param id_prefix: Required prefix for id values.
    :type id_prefix: str
    :param warnings: Warning list; extended in place with promotion and autofill warnings.
    :type warnings: List[str]
    :return: A repaired result, or ``None`` when recovery does not apply or does not validate.
    :rtype: Optional[TextLinkResult]
    """

    def collect_errors(candidate_text: str, candidate_spans: List[TextAnnotatedSpan]) -> List[str]:
        # Same three checks, in the same order, as the tool loop validation.
        found = _validate_link_spans(candidate_spans, id_prefix)
        found.extend(_validate_link_coverage(candidate_text, candidate_spans))
        found.extend(_validate_link_span_minimality(candidate_spans))
        return found

    try:
        spans = parse_span_markup(marked_up_text)
    except ValueError:
        return None

    plain_text = strip_span_tags(marked_up_text)
    promoted_spans, promotion_warnings = _promote_ref_spans_to_id(
        spans=spans,
        id_prefix=id_prefix,
    )
    if promotion_warnings:
        # Promotion changed attributes, so the markup is re-rendered from the plain text.
        warnings.extend(promotion_warnings)
        spans = promoted_spans
        marked_up_text = _render_span_markup(plain_text, spans)

    initial_errors = collect_errors(marked_up_text, spans)
    if not initial_errors:
        return None
    if not _errors_are_missing_coverage_only(initial_errors):
        return None

    autofilled = _autofill_ref_spans(marked_up_text, spans)
    if autofilled is None:
        return None
    filled_text, filled_spans, autofill_warnings = autofilled
    warnings.extend(autofill_warnings)

    if collect_errors(filled_text, filled_spans):
        return None
    return TextLinkResult(
        marked_up_text=filled_text,
        spans=filled_spans,
        warnings=warnings,
    )
221
+
222
+
223
def _errors_are_missing_coverage_only(errors: Sequence[str]) -> bool:
    """
    Report whether every error is a missing ref coverage error.

    :param errors: Validation error messages.
    :type errors: Sequence[str]
    :return: ``True`` when no message falls outside the coverage category (also for an
        empty sequence).
    :rtype: bool
    """
    for message in errors:
        if not _is_ref_coverage_error(message):
            return False
    return True
225
+
226
+
227
def _is_ref_coverage_error(error: str) -> bool:
    """
    Report whether an error message describes missing ref span coverage.

    :param error: Validation error message.
    :type error: str
    :return: ``True`` for the three message shapes that autofilling ref spans can resolve.
    :rtype: bool
    """
    if error.startswith("Missing linked spans for repeated text"):
        return True
    # The remaining shapes wrap a quoted value between a fixed prefix and a fixed suffix.
    framed_shapes = (
        ("Id '", "must have at least one ref span"),
        ("Repeated text '", "must include ref spans for repeats"),
    )
    return any(
        error.startswith(prefix) and error.endswith(suffix) for prefix, suffix in framed_shapes
    )
235
+
236
+
237
def _promote_ref_spans_to_id(
    *,
    spans: Sequence[TextAnnotatedSpan],
    id_prefix: str,
) -> Tuple[List[TextAnnotatedSpan], List[str]]:
    """
    Turn the earliest ref span of a text into an id span when that text has no id span.

    Spans are grouped by their wrapped text. A group qualifies when none of its spans has an
    ``id`` attribute, at least one has a ``ref`` attribute, and the earliest ref value starts
    with ``id_prefix``.

    :param spans: Parsed spans.
    :type spans: Sequence[TextAnnotatedSpan]
    :param id_prefix: Required prefix for id values.
    :type id_prefix: str
    :return: The (possibly rebuilt) spans and one warning per promotion.
    :rtype: Tuple[List[TextAnnotatedSpan], List[str]]
    """
    grouped: Dict[str, List[TextAnnotatedSpan]] = {}
    for span in spans:
        grouped.setdefault(span.text, []).append(span)

    promotions: Dict[int, str] = {}
    notes: List[str] = []
    for wrapped_text, members in grouped.items():
        has_id = any("id" in member.attributes for member in members)
        ref_members = [member for member in members if "ref" in member.attributes]
        if has_id or not ref_members:
            continue
        # The earliest ref span (ties broken by input order) becomes the id.
        earliest = min(ref_members, key=lambda member: (member.start_char, member.end_char))
        target_id = earliest.attributes.get("ref", "")
        if target_id.startswith(id_prefix):
            promotions[earliest.index] = target_id
            notes.append(f"Promoted ref span for '{wrapped_text}' to id '{target_id}'.")

    if not promotions:
        return list(spans), []

    rebuilt = [
        TextAnnotatedSpan(
            index=span.index,
            start_char=span.start_char,
            end_char=span.end_char,
            text=span.text,
            attributes=(
                {"id": promotions[span.index]} if span.index in promotions else span.attributes
            ),
        )
        for span in spans
    ]
    return rebuilt, notes
276
+
277
+
278
def _validate_link_spans(spans: Iterable[TextAnnotatedSpan], id_prefix: str) -> List[str]:
    """
    Validate the id/ref structure of linked spans.

    Errors are reported in four passes: per-span attribute checks, text consistency per id
    value, rules for texts wrapped by more than one span, and ids lacking any ref span.

    :param spans: Parsed spans, in document order.
    :type spans: Iterable[TextAnnotatedSpan]
    :param id_prefix: Required prefix for id values.
    :type id_prefix: str
    :return: Validation errors; empty when the spans are valid.
    :rtype: List[str]
    """
    problems: List[str] = []
    id_tally: Counter = Counter()
    ref_tally: Counter = Counter()
    # id value -> (texts wrapped by id spans, texts wrapped by ref spans)
    texts_for_id: Dict[str, Tuple[set, set]] = {}
    group_by_text: Dict[str, List[TextAnnotatedSpan]] = {}

    # Pass 1: per-span attribute rules.
    for span in spans:
        attributes = span.attributes
        if len(attributes) != 1:
            problems.append(f"Span {span.index} must include exactly one attribute")
            continue
        ((name, value),) = attributes.items()
        if not value.strip():
            problems.append(f"Span {span.index} has an empty value for attribute '{name}'")
            continue
        if name == "id":
            if not value.startswith(id_prefix):
                problems.append(f"Span {span.index} id '{value}' must start with '{id_prefix}'")
            if value in id_tally:
                problems.append(f"Span {span.index} uses duplicate id '{value}'")
            id_tally[value] += 1
            texts_for_id.setdefault(value, (set(), set()))[0].add(span.text)
        elif name == "ref":
            # A ref is only valid once its id has already been seen.
            if value not in id_tally:
                problems.append(f"Span {span.index} ref '{value}' does not match a previous id")
            ref_tally[value] += 1
            texts_for_id.setdefault(value, (set(), set()))[1].add(span.text)
        else:
            problems.append(
                f"Span {span.index} uses attribute '{name}' but only 'id' or 'ref' are allowed"
            )
        group_by_text.setdefault(span.text, []).append(span)

    # Pass 2: every span sharing an id value must wrap the same text.
    for id_value, (id_texts, ref_texts) in texts_for_id.items():
        if len(id_texts) > 1:
            problems.append(f"Id '{id_value}' spans must wrap the same text")
        if len(ref_texts) > 1:
            problems.append(f"Ref spans for id '{id_value}' must wrap the same text")
        if id_texts and ref_texts and id_texts != ref_texts:
            sample_id_text = next(iter(id_texts))
            sample_ref_text = next(iter(ref_texts))
            problems.append(
                f"Id '{id_value}' span text must match ref span text "
                f"(id: '{sample_id_text}', ref: '{sample_ref_text}')"
            )

    # Pass 3: a text wrapped more than once needs exactly one id and matching refs.
    for span_text, members in group_by_text.items():
        if len(members) < 2:
            continue
        member_ids = [m.attributes.get("id") for m in members if "id" in m.attributes]
        member_refs = [m.attributes.get("ref") for m in members if "ref" in m.attributes]
        if len(member_ids) != 1:
            problems.append(f"Repeated text '{span_text}' must have exactly one id span")
        elif not member_refs:
            problems.append(f"Repeated text '{span_text}' must include ref spans for repeats")
        elif any(ref != member_ids[0] for ref in member_refs):
            problems.append(f"Repeated text '{span_text}' refs must match id '{member_ids[0]}'")

    # Pass 4: a non-duplicated id must be referenced at least once.
    for id_value, occurrences in id_tally.items():
        if occurrences <= 1 and ref_tally[id_value] == 0:
            problems.append(f"Id '{id_value}' must have at least one ref span")
    return problems
345
+
346
+
347
def _validate_link_coverage(marked_up_text: str, spans: Iterable[TextAnnotatedSpan]) -> List[str]:
    """
    Check that every occurrence of a linked text is wrapped in a span.

    For each distinct non-empty span text, the number of spans wrapping it is compared with
    the number of non-overlapping occurrences of that text in the tag-free text.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param spans: Parsed spans.
    :type spans: Iterable[TextAnnotatedSpan]
    :return: One error per text that occurs more often than it is wrapped.
    :rtype: List[str]
    """
    plain_text = strip_span_tags(marked_up_text)
    wrapped_tally = Counter(span.text for span in spans if span.text)
    problems: List[str] = []
    for span_text, wrapped in wrapped_tally.items():
        total = plain_text.count(span_text)
        if total <= wrapped:
            continue
        problems.append(
            f"Missing linked spans for repeated text '{span_text}' " f"({wrapped}/{total})"
        )
    return problems
362
+
363
+
364
# Unicode-aware word tokens. An ASCII-only class would split "Márquez" into "M" and "rquez",
# producing bogus repeated-token reports for names that merely share a leading letter.
_WORD_PATTERN = re.compile(r"\w+")


def _validate_link_span_minimality(spans: Iterable[TextAnnotatedSpan]) -> List[str]:
    """
    Check that no span wraps the same word token more than once.

    A span holding a repeated token most likely covers several mentions that should have
    been tagged separately.

    :param spans: Parsed spans.
    :type spans: Iterable[TextAnnotatedSpan]
    :return: One error per offending span, naming the first repeated token.
    :rtype: List[str]
    """
    errors: List[str] = []
    for span in spans:
        # Counter keeps first-occurrence order, so the reported token is deterministic.
        counts = Counter(_WORD_PATTERN.findall(span.text))
        repeated = next((token for token, count in counts.items() if count > 1), None)
        if repeated is None:
            continue
        errors.append(
            f"Span {span.index} contains repeated token '{repeated}'. "
            "Split repeated mentions into separate spans."
        )
    return errors
381
+
382
+
383
def _autofill_ref_spans(
    marked_up_text: str,
    spans: Iterable[TextAnnotatedSpan],
) -> Optional[Tuple[str, List[TextAnnotatedSpan], List[str]]]:
    """
    Add ref spans for unwrapped occurrences of texts that already have an id span.

    For every id span, each occurrence of its text in the tag-free text that does not
    overlap an existing (or newly added) span becomes a ref span pointing at that id.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param spans: Parsed spans for ``marked_up_text``.
    :type spans: Iterable[TextAnnotatedSpan]
    :return: Re-rendered markup, re-indexed spans and a warning; ``None`` when nothing was
        added.
    :rtype: Optional[Tuple[str, List[TextAnnotatedSpan], List[str]]]
    """
    plain_text = strip_span_tags(marked_up_text)
    originals = list(spans)
    taken_ranges = [(span.start_char, span.end_char) for span in originals]
    additions: List[TextAnnotatedSpan] = []

    def overlaps_taken(start: int, end: int) -> bool:
        for taken_start, taken_end in taken_ranges:
            if start < taken_end and taken_start < end:
                return True
        return False

    for anchor in originals:
        anchor_id = anchor.attributes.get("id")
        anchor_text = anchor.text
        if not anchor_id or not anchor_text:
            continue
        for found in re.finditer(re.escape(anchor_text), plain_text):
            start, end = found.start(), found.end()
            if overlaps_taken(start, end):
                continue
            # The index is a placeholder; every span is renumbered below.
            additions.append(
                TextAnnotatedSpan(
                    index=1,
                    start_char=start,
                    end_char=end,
                    text=anchor_text,
                    attributes={"ref": anchor_id},
                )
            )
            taken_ranges.append((start, end))

    if not additions:
        return None

    ordered = sorted(
        originals + additions,
        key=lambda span: (span.start_char, span.end_char),
    )
    renumbered = [
        TextAnnotatedSpan(
            index=position,
            start_char=span.start_char,
            end_char=span.end_char,
            text=span.text,
            attributes=span.attributes,
        )
        for position, span in enumerate(ordered, start=1)
    ]
    rendered = _render_span_markup(plain_text, renumbered)
    return rendered, renumbered, [f"Autofilled {len(additions)} ref spans for repeated text."]
438
+
439
+
440
def _render_span_markup(text: str, spans: List[TextAnnotatedSpan]) -> str:
    """
    Render plain text with span tags inserted at the given character ranges.

    Spans are processed in the order given, so they must be sorted by position and must
    not overlap.

    :param text: Tag-free text.
    :type text: str
    :param spans: Spans to render, in ascending position order.
    :type spans: List[TextAnnotatedSpan]
    :return: Text with ``<span ...>`` tags inserted.
    :rtype: str
    :raises ValueError: If a span starts before the previous span ended.
    """
    pieces: List[str] = []
    position = 0
    for span in spans:
        begin, finish = span.start_char, span.end_char
        if begin < position:
            raise ValueError("Span overlap detected while rendering markup")
        attribute_text = " ".join(f'{key}="{value}"' for key, value in span.attributes.items())
        opening_tag = f"<span {attribute_text}>" if attribute_text else "<span>"
        pieces.extend((text[position:begin], opening_tag, text[begin:finish], "</span>"))
        position = finish
    pieces.append(text[position:])
    return "".join(pieces)
457
+
458
+
459
def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: str) -> str:
    """
    Build the message sent to the model after an edit failed validation.

    :param errors: Validation error messages.
    :type errors: Sequence[str]
    :param current_text: Marked-up text as it currently stands.
    :type current_text: str
    :param id_prefix: Required prefix for id values.
    :type id_prefix: str
    :return: Retry prompt listing the issues, optional context and fixes, and the current text.
    :rtype: str
    """
    issue_lines = "\n".join(f"- {error}" for error in errors)
    context_section = build_span_context_section(current_text, errors)
    coverage_guidance = _build_coverage_guidance(errors)
    header = f"Your last edit did not validate.\nIssues:\n{issue_lines}\n\n"
    instructions = (
        "Please fix the markup using str_replace. Use id for first mentions and ref for repeats. "
        "Reuse the same id for identical names and do not assign multiple ids to the same name. "
        f"Ids must start with '{id_prefix}'. Try again.\n"
    )
    footer = f"Current text:\n---\n{current_text}\n---"
    return f"{header}{context_section}{coverage_guidance}{instructions}{footer}"
475
+
476
+
477
def _build_coverage_guidance(errors: Sequence[str]) -> str:
    """
    Turn coverage-related validation errors into concrete fix instructions.

    Quoted values are extracted by anchoring on the fixed text around them rather than by
    splitting on apostrophes, so names such as ``O'Brien`` are reproduced in full. The
    missing-coverage pattern also matches span text that contains line breaks.

    :param errors: Validation error messages.
    :type errors: Sequence[str]
    :return: A ``Fixes:`` section ending in a blank line, or an empty string when no error
        has a matching instruction.
    :rtype: str
    """

    def quoted_value(error: str, prefix: str, closing: str, *, last: bool) -> str:
        # Value between ``prefix`` and ``closing``; falls back to the first apostrophe
        # (the legacy behaviour) if ``closing`` is not present as expected.
        body = error[len(prefix) :]
        splitter = body.rpartition if last else body.partition
        pieces = splitter(closing)
        if not pieces[1]:
            return body.split("'")[0]
        return pieces[0]

    instructions: List[str] = []
    for error in errors:
        match = re.match(
            r"Missing linked spans for repeated text '(.+)' \((\d+)/(\d+)\)",
            error,
            flags=re.DOTALL,  # span text may contain newlines
        )
        if match:
            span_text = match.group(1)
            instructions.append(
                f"- Add ref spans for every remaining occurrence of '{span_text}' "
                "using the same id as its first mention."
            )
            continue
        if error.startswith("Id '") and error.endswith("must have at least one ref span"):
            id_value = quoted_value(
                error, "Id '", "' must have at least one ref span", last=True
            )
            instructions.append(f'- Add ref spans with ref="{id_value}" for each later occurrence.')
            continue
        if error.startswith("Repeated text '") and error.endswith(
            "must include ref spans for repeats"
        ):
            span_text = quoted_value(
                error, "Repeated text '", "' must include ref spans for repeats", last=True
            )
            instructions.append(
                f"- Ensure '{span_text}' has one id on the first mention and ref spans on later mentions."
            )
            continue
        if error.startswith("Id '") and "span text must match ref span text" in error:
            # The id comes first in this message, so anchor on the first occurrence.
            id_value = quoted_value(
                error, "Id '", "' span text must match ref span text", last=False
            )
            instructions.append(
                f"- Ensure every span with id/ref '{id_value}' wraps the exact same text."
            )
    if not instructions:
        return ""
    return "Fixes:\n" + "\n".join(instructions) + "\n\n"
511
+
512
+
513
def _build_empty_confirmation_message(text: str) -> str:
    """
    Build the message asking the model to confirm that nothing needs linking.

    :param text: Current (unchanged) text shown back to the model.
    :type text: str
    :return: Confirmation prompt followed by the current text between ``---`` fences.
    :rtype: str
    """
    question = (
        "No linked spans were inserted. If there are truly no repeated names to link, "
        "call done again without changes. Otherwise insert id/ref spans for the repeated names.\n"
    )
    fenced_text = f"---\n{text}\n---"
    return f"{question}Current text:\n{fenced_text}"