biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- Deterministic term-frequency vector retrieval backend.
2
+ Deterministic term-frequency vector retriever.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -8,93 +8,103 @@ import math
8
8
  import re
9
9
  from typing import Dict, Iterable, List, Optional, Tuple
10
10
 
11
- from pydantic import BaseModel, ConfigDict, Field
11
+ from pydantic import BaseModel, ConfigDict
12
12
 
13
13
  from ..corpus import Corpus
14
14
  from ..frontmatter import parse_front_matter
15
15
  from ..models import (
16
16
  Evidence,
17
- ExtractionRunReference,
17
+ ExtractionSnapshotReference,
18
18
  QueryBudget,
19
19
  RetrievalResult,
20
- RetrievalRun,
21
- parse_extraction_run_reference,
20
+ RetrievalSnapshot,
21
+ parse_extraction_snapshot_reference,
22
+ )
23
+ from ..retrieval import (
24
+ apply_budget,
25
+ create_configuration_manifest,
26
+ create_snapshot_manifest,
27
+ hash_text,
22
28
  )
23
- from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
24
29
  from ..time import utc_now_iso
25
30
 
26
31
 
27
- class TfVectorRecipeConfig(BaseModel):
32
+ class TfVectorConfiguration(BaseModel):
28
33
  """
29
- Configuration for the term-frequency vector retrieval backend.
34
+ Configuration for the term-frequency vector retriever.
30
35
 
31
- :ivar snippet_characters: Maximum characters to include in evidence snippets.
32
- :vartype snippet_characters: int
33
- :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
34
- :vartype extraction_run: str or None
36
+ :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
37
+ :vartype extraction_snapshot: str or None
38
+ :ivar snippet_characters: Optional maximum character count for returned evidence text.
39
+ :vartype snippet_characters: int or None
35
40
  """
36
41
 
37
42
  model_config = ConfigDict(extra="forbid")
38
43
 
39
- snippet_characters: int = Field(default=400, ge=1)
40
- extraction_run: Optional[str] = None
44
+ extraction_snapshot: Optional[str] = None
45
+ snippet_characters: Optional[int] = None
41
46
 
42
47
 
43
- class TfVectorBackend:
48
+ class TfVectorRetriever:
44
49
  """
45
- Deterministic vector backend using term-frequency cosine similarity.
50
+ Deterministic vector retriever using term-frequency cosine similarity.
46
51
 
47
- :ivar backend_id: Backend identifier.
48
- :vartype backend_id: str
52
+ :ivar retriever_id: Retriever identifier.
53
+ :vartype retriever_id: str
49
54
  """
50
55
 
51
- backend_id = "tf-vector"
56
+ retriever_id = "tf-vector"
52
57
 
53
- def build_run(
54
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
55
- ) -> RetrievalRun:
58
+ def build_snapshot(
59
+ self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
60
+ ) -> RetrievalSnapshot:
56
61
  """
57
- Register a vector backend run (no materialization).
62
+ Register a vector retriever snapshot (no snapshot artifacts).
58
63
 
59
64
  :param corpus: Corpus to build against.
60
65
  :type corpus: Corpus
61
- :param recipe_name: Human-readable recipe name.
62
- :type recipe_name: str
63
- :param config: Backend-specific configuration values.
64
- :type config: dict[str, object]
65
- :return: Run manifest describing the build.
66
- :rtype: RetrievalRun
66
+ :param configuration_name: Human-readable configuration name.
67
+ :type configuration_name: str
68
+ :param configuration: Retriever-specific configuration values.
69
+ :type configuration: dict[str, object]
70
+ :return: Snapshot manifest describing the build.
71
+ :rtype: RetrievalSnapshot
67
72
  """
68
- recipe_config = TfVectorRecipeConfig.model_validate(config)
73
+ parsed_config = TfVectorConfiguration.model_validate(configuration)
69
74
  catalog = corpus.load_catalog()
70
- recipe = create_recipe_manifest(
71
- backend_id=self.backend_id,
72
- name=recipe_name,
73
- config=recipe_config.model_dump(),
75
+ configuration_manifest = create_configuration_manifest(
76
+ retriever_id=self.retriever_id,
77
+ name=configuration_name,
78
+ configuration=parsed_config.model_dump(),
74
79
  )
75
80
  stats = {
76
81
  "items": len(catalog.items),
77
- "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
82
+ "text_items": _count_text_items(corpus, catalog.items.values(), parsed_config),
78
83
  }
79
- run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
80
- corpus.write_run(run)
81
- return run
84
+ snapshot = create_snapshot_manifest(
85
+ corpus,
86
+ configuration=configuration_manifest,
87
+ stats=stats,
88
+ snapshot_artifacts=[],
89
+ )
90
+ corpus.write_snapshot(snapshot)
91
+ return snapshot
82
92
 
83
93
  def query(
84
94
  self,
85
95
  corpus: Corpus,
86
96
  *,
87
- run: RetrievalRun,
97
+ snapshot: RetrievalSnapshot,
88
98
  query_text: str,
89
99
  budget: QueryBudget,
90
100
  ) -> RetrievalResult:
91
101
  """
92
102
  Query the corpus using term-frequency cosine similarity.
93
103
 
94
- :param corpus: Corpus associated with the run.
104
+ :param corpus: Corpus associated with the snapshot.
95
105
  :type corpus: Corpus
96
- :param run: Run manifest to use for querying.
97
- :type run: RetrievalRun
106
+ :param snapshot: Snapshot manifest to use for querying.
107
+ :type snapshot: RetrievalSnapshot
98
108
  :param query_text: Query text to execute.
99
109
  :type query_text: str
100
110
  :param budget: Evidence selection budget.
@@ -102,15 +112,15 @@ class TfVectorBackend:
102
112
  :return: Retrieval results containing evidence.
103
113
  :rtype: RetrievalResult
104
114
  """
105
- recipe_config = TfVectorRecipeConfig.model_validate(run.recipe.config)
115
+ parsed_config = TfVectorConfiguration.model_validate(snapshot.configuration.configuration)
106
116
  query_tokens = _tokenize_text(query_text)
107
117
  if not query_tokens:
108
118
  return RetrievalResult(
109
119
  query_text=query_text,
110
120
  budget=budget,
111
- run_id=run.run_id,
112
- recipe_id=run.recipe.recipe_id,
113
- backend_id=self.backend_id,
121
+ snapshot_id=snapshot.snapshot_id,
122
+ configuration_id=snapshot.configuration.configuration_id,
123
+ retriever_id=snapshot.configuration.retriever_id,
114
124
  generated_at=utc_now_iso(),
115
125
  evidence=[],
116
126
  stats={"candidates": 0, "returned": 0},
@@ -118,15 +128,15 @@ class TfVectorBackend:
118
128
  query_vector = _term_frequencies(query_tokens)
119
129
  query_norm = _vector_norm(query_vector)
120
130
  catalog = corpus.load_catalog()
121
- extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
131
+ extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
122
132
  scored_candidates = _score_items(
123
133
  corpus,
124
134
  catalog.items.values(),
125
135
  query_tokens=query_tokens,
126
136
  query_vector=query_vector,
127
137
  query_norm=query_norm,
128
- snippet_characters=recipe_config.snippet_characters,
129
138
  extraction_reference=extraction_reference,
139
+ snippet_characters=parsed_config.snippet_characters,
130
140
  )
131
141
  sorted_candidates = sorted(
132
142
  scored_candidates,
@@ -136,8 +146,8 @@ class TfVectorBackend:
136
146
  evidence_item.model_copy(
137
147
  update={
138
148
  "rank": index,
139
- "recipe_id": run.recipe.recipe_id,
140
- "run_id": run.run_id,
149
+ "configuration_id": snapshot.configuration.configuration_id,
150
+ "snapshot_id": snapshot.snapshot_id,
141
151
  }
142
152
  )
143
153
  for index, evidence_item in enumerate(sorted_candidates, start=1)
@@ -147,9 +157,9 @@ class TfVectorBackend:
147
157
  return RetrievalResult(
148
158
  query_text=query_text,
149
159
  budget=budget,
150
- run_id=run.run_id,
151
- recipe_id=run.recipe.recipe_id,
152
- backend_id=self.backend_id,
160
+ snapshot_id=snapshot.snapshot_id,
161
+ configuration_id=snapshot.configuration.configuration_id,
162
+ retriever_id=snapshot.configuration.retriever_id,
153
163
  generated_at=utc_now_iso(),
154
164
  evidence=evidence,
155
165
  stats=stats,
@@ -157,33 +167,33 @@ class TfVectorBackend:
157
167
 
158
168
 
159
169
  def _resolve_extraction_reference(
160
- corpus: Corpus, recipe_config: TfVectorRecipeConfig
161
- ) -> Optional[ExtractionRunReference]:
170
+ corpus: Corpus, configuration: TfVectorConfiguration
171
+ ) -> Optional[ExtractionSnapshotReference]:
162
172
  """
163
- Resolve an extraction run reference from a recipe config.
173
+ Resolve an extraction snapshot reference from a configuration.
164
174
 
165
- :param corpus: Corpus associated with the recipe.
175
+ :param corpus: Corpus associated with the configuration.
166
176
  :type corpus: Corpus
167
- :param recipe_config: Parsed vector recipe configuration.
168
- :type recipe_config: TfVectorRecipeConfig
177
+ :param configuration: Parsed vector configuration.
178
+ :type configuration: TfVectorConfiguration
169
179
  :return: Parsed extraction reference or None.
170
- :rtype: ExtractionRunReference or None
171
- :raises FileNotFoundError: If an extraction run is referenced but not present.
180
+ :rtype: ExtractionSnapshotReference or None
181
+ :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
172
182
  """
173
- if not recipe_config.extraction_run:
183
+ if not configuration.extraction_snapshot:
174
184
  return None
175
- extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
176
- run_dir = corpus.extraction_run_dir(
185
+ extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
186
+ snapshot_dir = corpus.extraction_snapshot_dir(
177
187
  extractor_id=extraction_reference.extractor_id,
178
- run_id=extraction_reference.run_id,
188
+ snapshot_id=extraction_reference.snapshot_id,
179
189
  )
180
- if not run_dir.is_dir():
181
- raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
190
+ if not snapshot_dir.is_dir():
191
+ raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
182
192
  return extraction_reference
183
193
 
184
194
 
185
195
  def _count_text_items(
186
- corpus: Corpus, items: Iterable[object], recipe_config: TfVectorRecipeConfig
196
+ corpus: Corpus, items: Iterable[object], configuration: TfVectorConfiguration
187
197
  ) -> int:
188
198
  """
189
199
  Count catalog items that represent text content.
@@ -192,19 +202,19 @@ def _count_text_items(
192
202
  :type corpus: Corpus
193
203
  :param items: Catalog items to inspect.
194
204
  :type items: Iterable[object]
195
- :param recipe_config: Parsed vector recipe configuration.
196
- :type recipe_config: TfVectorRecipeConfig
205
+ :param configuration: Parsed vector configuration.
206
+ :type configuration: TfVectorConfiguration
197
207
  :return: Number of text items.
198
208
  :rtype: int
199
209
  """
200
210
  text_item_count = 0
201
- extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
211
+ extraction_reference = _resolve_extraction_reference(corpus, configuration)
202
212
  for catalog_item in items:
203
213
  item_id = str(getattr(catalog_item, "id", ""))
204
214
  if extraction_reference and item_id:
205
215
  extracted_text = corpus.read_extracted_text(
206
216
  extractor_id=extraction_reference.extractor_id,
207
- run_id=extraction_reference.run_id,
217
+ snapshot_id=extraction_reference.snapshot_id,
208
218
  item_id=item_id,
209
219
  )
210
220
  if isinstance(extracted_text, str) and extracted_text.strip():
@@ -292,7 +302,7 @@ def _load_text_from_item(
292
302
  item_id: str,
293
303
  relpath: str,
294
304
  media_type: str,
295
- extraction_reference: Optional[ExtractionRunReference],
305
+ extraction_reference: Optional[ExtractionSnapshotReference],
296
306
  ) -> Optional[str]:
297
307
  """
298
308
  Load a text payload from a catalog item.
@@ -305,15 +315,15 @@ def _load_text_from_item(
305
315
  :type relpath: str
306
316
  :param media_type: Media type for the stored content.
307
317
  :type media_type: str
308
- :param extraction_reference: Optional extraction run reference.
309
- :type extraction_reference: ExtractionRunReference or None
318
+ :param extraction_reference: Optional extraction snapshot reference.
319
+ :type extraction_reference: ExtractionSnapshotReference or None
310
320
  :return: Text payload or None if not decodable as text.
311
321
  :rtype: str or None
312
322
  """
313
323
  if extraction_reference:
314
324
  extracted_text = corpus.read_extracted_text(
315
325
  extractor_id=extraction_reference.extractor_id,
316
- run_id=extraction_reference.run_id,
326
+ snapshot_id=extraction_reference.snapshot_id,
317
327
  item_id=item_id,
318
328
  )
319
329
  if isinstance(extracted_text, str) and extracted_text.strip():
@@ -359,21 +369,13 @@ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]
359
369
  return best_start, best_end
360
370
 
361
371
 
362
- def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
363
- """
364
- Build a snippet around a match span, constrained by a character budget.
365
-
366
- :param text: Source text to slice.
367
- :type text: str
368
- :param span: Match span to center on.
369
- :type span: tuple[int, int] or None
370
- :param max_chars: Maximum snippet length.
371
- :type max_chars: int
372
- :return: Snippet text.
373
- :rtype: str
374
- """
372
+ def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: Optional[int]) -> str:
373
+ if max_chars is None:
374
+ return text
375
375
  if not text:
376
376
  return ""
377
+ if max_chars <= 0:
378
+ return ""
377
379
  if span is None:
378
380
  return text[:max_chars]
379
381
  span_start, span_end = span
@@ -390,8 +392,8 @@ def _score_items(
390
392
  query_tokens: List[str],
391
393
  query_vector: Dict[str, float],
392
394
  query_norm: float,
393
- snippet_characters: int,
394
- extraction_reference: Optional[ExtractionRunReference],
395
+ extraction_reference: Optional[ExtractionSnapshotReference],
396
+ snippet_characters: Optional[int],
395
397
  ) -> List[Evidence]:
396
398
  """
397
399
  Score catalog items and return evidence candidates.
@@ -406,10 +408,10 @@ def _score_items(
406
408
  :type query_vector: dict[str, float]
407
409
  :param query_norm: Query vector norm.
408
410
  :type query_norm: float
409
- :param snippet_characters: Snippet length budget.
410
- :type snippet_characters: int
411
- :param extraction_reference: Optional extraction run reference.
412
- :type extraction_reference: ExtractionRunReference or None
411
+ :param extraction_reference: Optional extraction snapshot reference.
412
+ :type extraction_reference: ExtractionSnapshotReference or None
413
+ :param snippet_characters: Optional maximum character count for returned evidence text.
414
+ :type snippet_characters: int or None
413
415
  :return: Evidence candidates with provisional ranks.
414
416
  :rtype: list[Evidence]
415
417
  """
@@ -437,9 +439,9 @@ def _score_items(
437
439
  if similarity <= 0:
438
440
  continue
439
441
  span = _find_first_match(item_text, query_tokens)
440
- snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
441
442
  span_start = span[0] if span else None
442
443
  span_end = span[1] if span else None
444
+ evidence_text = _build_snippet(item_text, span, max_chars=snippet_characters)
443
445
  evidence_items.append(
444
446
  Evidence(
445
447
  item_id=str(getattr(catalog_item, "id")),
@@ -447,14 +449,15 @@ def _score_items(
447
449
  media_type=str(media_type),
448
450
  score=float(similarity),
449
451
  rank=1,
450
- text=snippet,
452
+ text=evidence_text,
451
453
  content_ref=None,
452
454
  span_start=span_start,
453
455
  span_end=span_end,
454
456
  stage="tf-vector",
455
- recipe_id="",
456
- run_id="",
457
- hash=hash_text(snippet),
457
+ configuration_id="",
458
+ snapshot_id="",
459
+ metadata=getattr(catalog_item, "metadata", {}) or {},
460
+ hash=hash_text(evidence_text or ""),
458
461
  )
459
462
  )
460
463
  return evidence_items
biblicus/sources.py CHANGED
@@ -8,7 +8,7 @@ import mimetypes
8
8
  from dataclasses import dataclass
9
9
  from pathlib import Path
10
10
  from typing import Optional
11
- from urllib.parse import unquote, urlparse
11
+ from urllib.parse import quote, unquote, urlparse
12
12
  from urllib.request import Request, urlopen
13
13
 
14
14
 
@@ -37,6 +37,27 @@ def _filename_from_url_path(path: str) -> str:
37
37
  return filename or "download"
38
38
 
39
39
 
40
+ def _sanitize_filename_component(name: str) -> str:
41
+ allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
42
+ sanitized_name = "".join(
43
+ (character if character in allowed_characters else "_") for character in name
44
+ ).strip()
45
+ return sanitized_name or "file"
46
+
47
+
48
+ def _namespaced_filename(
49
+ *, source_uri: Optional[str], fallback_name: Optional[str], media_type: str
50
+ ) -> str:
51
+ base_name = ""
52
+ if source_uri:
53
+ base_name = quote(source_uri, safe="")
54
+ if not base_name and fallback_name:
55
+ base_name = _sanitize_filename_component(fallback_name)
56
+ if not base_name:
57
+ base_name = "file"
58
+ return _ensure_extension_for_media_type(base_name, media_type)
59
+
60
+
40
61
  def _media_type_from_filename(name: str) -> str:
41
62
  """
42
63
  Guess media type from a filename.
@@ -119,8 +140,16 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
119
140
  """
120
141
  if Path(filename).suffix:
121
142
  return filename
122
- if media_type == "audio/ogg":
123
- ext = ".ogg"
143
+ media_type_overrides = {
144
+ "audio/mpeg": ".mp3",
145
+ "audio/ogg": ".ogg",
146
+ "audio/wav": ".wav",
147
+ "audio/x-wav": ".wav",
148
+ "image/jpeg": ".jpg",
149
+ "text/html": ".html",
150
+ }
151
+ if media_type in media_type_overrides:
152
+ ext = media_type_overrides[media_type]
124
153
  else:
125
154
  ext = mimetypes.guess_extension(media_type) or ""
126
155
  return filename + ext if ext else filename
@@ -165,11 +194,12 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
165
194
  media_type = _media_type_from_filename(path.name)
166
195
  if path.suffix.lower() in {".md", ".markdown"}:
167
196
  media_type = "text/markdown"
197
+ resolved_source_uri = source_uri or path.as_uri()
168
198
  return SourcePayload(
169
199
  data=path.read_bytes(),
170
200
  filename=path.name,
171
201
  media_type=media_type,
172
- source_uri=source_uri or path.as_uri(),
202
+ source_uri=resolved_source_uri,
173
203
  )
174
204
 
175
205
  if _looks_like_uri(source):
@@ -187,21 +217,26 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
187
217
  with urlopen(request, timeout=30) as response:
188
218
  response_bytes = response.read()
189
219
  content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
190
- filename = _filename_from_url_path(parsed.path)
191
- media_type = content_type or _media_type_from_filename(filename)
220
+ fallback_filename = _filename_from_url_path(parsed.path)
221
+ media_type = content_type or _media_type_from_filename(fallback_filename)
192
222
  if media_type == "application/octet-stream":
193
223
  sniffed = _sniff_media_type_from_bytes(response_bytes)
194
224
  if sniffed:
195
225
  media_type = sniffed
196
- filename = _ensure_extension_for_media_type(filename, media_type)
197
- media_type = _normalize_media_type(filename=filename, media_type=media_type)
198
- if Path(filename).suffix.lower() in {".md", ".markdown"}:
226
+ fallback_filename = _ensure_extension_for_media_type(
227
+ fallback_filename, media_type
228
+ )
229
+ media_type = _normalize_media_type(
230
+ filename=fallback_filename, media_type=media_type
231
+ )
232
+ if Path(fallback_filename).suffix.lower() in {".md", ".markdown"}:
199
233
  media_type = "text/markdown"
234
+ resolved_source_uri = source_uri or source
200
235
  return SourcePayload(
201
236
  data=response_bytes,
202
- filename=filename,
237
+ filename=fallback_filename,
203
238
  media_type=media_type,
204
- source_uri=source_uri or source,
239
+ source_uri=resolved_source_uri,
205
240
  )
206
241
 
207
242
  raise NotImplementedError(
biblicus/text/link.py CHANGED
@@ -159,6 +159,8 @@ def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
159
159
 
160
160
 
161
161
  def _validate_replace_text(old_str: str, new_str: str) -> None:
162
+ if "<span" in old_str or "</span>" in old_str:
163
+ raise ValueError("Text link replacements must target plain text without span tags")
162
164
  if strip_span_tags(old_str) != strip_span_tags(new_str):
163
165
  raise ValueError("Text link replacements may only insert span tags")
164
166
 
@@ -460,12 +462,16 @@ def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: st
460
462
  error_lines = "\n".join(f"- {error}" for error in errors)
461
463
  context_section = build_span_context_section(current_text, errors)
462
464
  coverage_guidance = _build_coverage_guidance(errors)
465
+ nested_guidance = ""
466
+ if any("nested span" in error for error in errors):
467
+ nested_guidance = "Do not create nested or overlapping spans. Remove nested spans and wrap only bare text.\n"
463
468
  return (
464
469
  "Your last edit did not validate.\n"
465
470
  "Issues:\n"
466
471
  f"{error_lines}\n\n"
467
472
  f"{context_section}"
468
473
  f"{coverage_guidance}"
474
+ f"{nested_guidance}"
469
475
  "Please fix the markup using str_replace. Use id for first mentions and ref for repeats. "
470
476
  "Reuse the same id for identical names and do not assign multiple ids to the same name. "
471
477
  f"Ids must start with '{id_prefix}'. Try again.\n"
biblicus/text/prompts.py CHANGED
@@ -11,14 +11,16 @@ DEFAULT_EXTRACT_SYSTEM_PROMPT = (
11
11
  "Interpret the word 'return' in the user's request as: wrap the returned text with "
12
12
  "<span>...</span> in-place in the current text.\n\n"
13
13
  "Use the str_replace tool to insert <span>...</span> tags and the done tool when finished.\n"
14
+ "For long spans, insert <span> and </span> using separate str_replace calls. "
15
+ "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
14
16
  "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
15
17
  "Rules:\n"
16
18
  "- Use str_replace only.\n"
17
19
  "- old_str must match exactly once in the current text.\n"
18
20
  "- When choosing old_str, copy the exact substring (including punctuation/case) from the current text.\n"
19
21
  "- old_str and new_str must be non-empty strings.\n"
20
- "- new_str must be identical to old_str with only <span> and </span> inserted.\n"
21
- "- Do not include <span> or </span> inside old_str or new_str.\n"
22
+ "- new_str must be identical to old_str with only <span> and/or </span> inserted.\n"
23
+ "- Do not include <span> or </span> inside old_str.\n"
22
24
  "- Do not insert nested spans.\n"
23
25
  "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
24
26
  "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
@@ -49,14 +51,18 @@ DEFAULT_ANNOTATE_SYSTEM_PROMPT = (
49
51
  '<span ATTRIBUTE="VALUE">...</span> in-place in the current text.\n'
50
52
  "Each span must include exactly one attribute from: {{ allowed_attributes }}.\n\n"
51
53
  "Use the str_replace tool to insert span tags and the done tool when finished.\n"
54
+ "For long spans, insert the opening and closing tags using separate str_replace calls. "
55
+ "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
52
56
  "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
53
57
  "Rules:\n"
54
58
  "- Use str_replace only.\n"
55
59
  "- old_str must match exactly once in the current text.\n"
56
60
  "- old_str and new_str must be non-empty strings.\n"
57
- "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
58
- "- Do not include <span or </span> inside old_str or new_str.\n"
61
+ "- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
62
+ "- Do not include <span or </span> inside old_str.\n"
59
63
  "- Do not insert nested spans.\n"
64
+ "- Do not wrap text that is already inside a span; spans must never overlap.\n"
65
+ "- If a name appears inside an existing span, leave it alone and wrap only bare text.\n"
60
66
  "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
61
67
  "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
62
68
  "- Do not delete, reorder, paraphrase, or label text beyond the span attributes.\n\n"
@@ -78,13 +84,15 @@ DEFAULT_LINK_SYSTEM_PROMPT = (
78
84
  "- Do not call done until every repeated name or entity in the text is wrapped.\n"
79
85
  "- If a name appears multiple times, there must be one id and refs for every later occurrence.\n\n"
80
86
  "Use the str_replace tool to insert span tags and the done tool when finished.\n"
87
+ "For long spans, insert the opening and closing tags using separate str_replace calls. "
88
+ "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
81
89
  "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
82
90
  "Rules:\n"
83
91
  "- Use str_replace only.\n"
84
92
  "- old_str must match exactly once in the current text.\n"
85
93
  "- old_str and new_str must be non-empty strings.\n"
86
- "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
87
- "- Do not include <span or </span> inside old_str or new_str.\n"
94
+ "- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
95
+ "- Do not include <span or </span> inside old_str.\n"
88
96
  "- Do not insert nested spans.\n"
89
97
  "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
90
98
  "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
@@ -98,13 +106,15 @@ DEFAULT_REDACT_SYSTEM_PROMPT = (
98
106
  "<span>...</span> in-place in the current text.\n"
99
107
  "If redaction types are provided, use a redact attribute with one of: {{ redaction_types }}.\n\n"
100
108
  "Use the str_replace tool to insert span tags and the done tool when finished.\n"
109
+ "For long spans, insert the opening and closing tags using separate str_replace calls. "
110
+ "For short spans (a few words), it is acceptable to insert both tags in one call.\n"
101
111
  "When finished, call done. Do NOT return JSON in the assistant message.\n\n"
102
112
  "Rules:\n"
103
113
  "- Use str_replace only.\n"
104
114
  "- old_str must match exactly once in the current text.\n"
105
115
  "- old_str and new_str must be non-empty strings.\n"
106
- "- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
107
- "- Do not include <span or </span> inside old_str or new_str.\n"
116
+ "- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
117
+ "- Do not include <span or </span> inside old_str.\n"
108
118
  "- Do not insert nested spans.\n"
109
119
  "- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
110
120
  "- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"