biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Deterministic term-frequency vector
|
|
2
|
+
Deterministic term-frequency vector retriever.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -8,93 +8,103 @@ import math
|
|
|
8
8
|
import re
|
|
9
9
|
from typing import Dict, Iterable, List, Optional, Tuple
|
|
10
10
|
|
|
11
|
-
from pydantic import BaseModel, ConfigDict
|
|
11
|
+
from pydantic import BaseModel, ConfigDict
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
14
|
from ..frontmatter import parse_front_matter
|
|
15
15
|
from ..models import (
|
|
16
16
|
Evidence,
|
|
17
|
-
|
|
17
|
+
ExtractionSnapshotReference,
|
|
18
18
|
QueryBudget,
|
|
19
19
|
RetrievalResult,
|
|
20
|
-
|
|
21
|
-
|
|
20
|
+
RetrievalSnapshot,
|
|
21
|
+
parse_extraction_snapshot_reference,
|
|
22
|
+
)
|
|
23
|
+
from ..retrieval import (
|
|
24
|
+
apply_budget,
|
|
25
|
+
create_configuration_manifest,
|
|
26
|
+
create_snapshot_manifest,
|
|
27
|
+
hash_text,
|
|
22
28
|
)
|
|
23
|
-
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
24
29
|
from ..time import utc_now_iso
|
|
25
30
|
|
|
26
31
|
|
|
27
|
-
class
|
|
32
|
+
class TfVectorConfiguration(BaseModel):
|
|
28
33
|
"""
|
|
29
|
-
Configuration for the term-frequency vector
|
|
34
|
+
Configuration for the term-frequency vector retriever.
|
|
30
35
|
|
|
31
|
-
:ivar
|
|
32
|
-
:vartype
|
|
33
|
-
:ivar
|
|
34
|
-
:vartype
|
|
36
|
+
:ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
|
|
37
|
+
:vartype extraction_snapshot: str or None
|
|
38
|
+
:ivar snippet_characters: Optional maximum character count for returned evidence text.
|
|
39
|
+
:vartype snippet_characters: int or None
|
|
35
40
|
"""
|
|
36
41
|
|
|
37
42
|
model_config = ConfigDict(extra="forbid")
|
|
38
43
|
|
|
39
|
-
|
|
40
|
-
|
|
44
|
+
extraction_snapshot: Optional[str] = None
|
|
45
|
+
snippet_characters: Optional[int] = None
|
|
41
46
|
|
|
42
47
|
|
|
43
|
-
class
|
|
48
|
+
class TfVectorRetriever:
|
|
44
49
|
"""
|
|
45
|
-
Deterministic vector
|
|
50
|
+
Deterministic vector retriever using term-frequency cosine similarity.
|
|
46
51
|
|
|
47
|
-
:ivar
|
|
48
|
-
:vartype
|
|
52
|
+
:ivar retriever_id: Retriever identifier.
|
|
53
|
+
:vartype retriever_id: str
|
|
49
54
|
"""
|
|
50
55
|
|
|
51
|
-
|
|
56
|
+
retriever_id = "tf-vector"
|
|
52
57
|
|
|
53
|
-
def
|
|
54
|
-
self, corpus: Corpus, *,
|
|
55
|
-
) ->
|
|
58
|
+
def build_snapshot(
|
|
59
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
60
|
+
) -> RetrievalSnapshot:
|
|
56
61
|
"""
|
|
57
|
-
Register a vector
|
|
62
|
+
Register a vector retriever snapshot (no snapshot artifacts).
|
|
58
63
|
|
|
59
64
|
:param corpus: Corpus to build against.
|
|
60
65
|
:type corpus: Corpus
|
|
61
|
-
:param
|
|
62
|
-
:type
|
|
63
|
-
:param
|
|
64
|
-
:type
|
|
65
|
-
:return:
|
|
66
|
-
:rtype:
|
|
66
|
+
:param configuration_name: Human-readable configuration name.
|
|
67
|
+
:type configuration_name: str
|
|
68
|
+
:param configuration: Retriever-specific configuration values.
|
|
69
|
+
:type configuration: dict[str, object]
|
|
70
|
+
:return: Snapshot manifest describing the build.
|
|
71
|
+
:rtype: RetrievalSnapshot
|
|
67
72
|
"""
|
|
68
|
-
|
|
73
|
+
parsed_config = TfVectorConfiguration.model_validate(configuration)
|
|
69
74
|
catalog = corpus.load_catalog()
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
name=
|
|
73
|
-
|
|
75
|
+
configuration_manifest = create_configuration_manifest(
|
|
76
|
+
retriever_id=self.retriever_id,
|
|
77
|
+
name=configuration_name,
|
|
78
|
+
configuration=parsed_config.model_dump(),
|
|
74
79
|
)
|
|
75
80
|
stats = {
|
|
76
81
|
"items": len(catalog.items),
|
|
77
|
-
"text_items": _count_text_items(corpus, catalog.items.values(),
|
|
82
|
+
"text_items": _count_text_items(corpus, catalog.items.values(), parsed_config),
|
|
78
83
|
}
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
84
|
+
snapshot = create_snapshot_manifest(
|
|
85
|
+
corpus,
|
|
86
|
+
configuration=configuration_manifest,
|
|
87
|
+
stats=stats,
|
|
88
|
+
snapshot_artifacts=[],
|
|
89
|
+
)
|
|
90
|
+
corpus.write_snapshot(snapshot)
|
|
91
|
+
return snapshot
|
|
82
92
|
|
|
83
93
|
def query(
|
|
84
94
|
self,
|
|
85
95
|
corpus: Corpus,
|
|
86
96
|
*,
|
|
87
|
-
|
|
97
|
+
snapshot: RetrievalSnapshot,
|
|
88
98
|
query_text: str,
|
|
89
99
|
budget: QueryBudget,
|
|
90
100
|
) -> RetrievalResult:
|
|
91
101
|
"""
|
|
92
102
|
Query the corpus using term-frequency cosine similarity.
|
|
93
103
|
|
|
94
|
-
:param corpus: Corpus associated with the
|
|
104
|
+
:param corpus: Corpus associated with the snapshot.
|
|
95
105
|
:type corpus: Corpus
|
|
96
|
-
:param
|
|
97
|
-
:type
|
|
106
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
107
|
+
:type snapshot: RetrievalSnapshot
|
|
98
108
|
:param query_text: Query text to execute.
|
|
99
109
|
:type query_text: str
|
|
100
110
|
:param budget: Evidence selection budget.
|
|
@@ -102,15 +112,15 @@ class TfVectorBackend:
|
|
|
102
112
|
:return: Retrieval results containing evidence.
|
|
103
113
|
:rtype: RetrievalResult
|
|
104
114
|
"""
|
|
105
|
-
|
|
115
|
+
parsed_config = TfVectorConfiguration.model_validate(snapshot.configuration.configuration)
|
|
106
116
|
query_tokens = _tokenize_text(query_text)
|
|
107
117
|
if not query_tokens:
|
|
108
118
|
return RetrievalResult(
|
|
109
119
|
query_text=query_text,
|
|
110
120
|
budget=budget,
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
121
|
+
snapshot_id=snapshot.snapshot_id,
|
|
122
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
123
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
114
124
|
generated_at=utc_now_iso(),
|
|
115
125
|
evidence=[],
|
|
116
126
|
stats={"candidates": 0, "returned": 0},
|
|
@@ -118,15 +128,15 @@ class TfVectorBackend:
|
|
|
118
128
|
query_vector = _term_frequencies(query_tokens)
|
|
119
129
|
query_norm = _vector_norm(query_vector)
|
|
120
130
|
catalog = corpus.load_catalog()
|
|
121
|
-
extraction_reference = _resolve_extraction_reference(corpus,
|
|
131
|
+
extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
|
|
122
132
|
scored_candidates = _score_items(
|
|
123
133
|
corpus,
|
|
124
134
|
catalog.items.values(),
|
|
125
135
|
query_tokens=query_tokens,
|
|
126
136
|
query_vector=query_vector,
|
|
127
137
|
query_norm=query_norm,
|
|
128
|
-
snippet_characters=recipe_config.snippet_characters,
|
|
129
138
|
extraction_reference=extraction_reference,
|
|
139
|
+
snippet_characters=parsed_config.snippet_characters,
|
|
130
140
|
)
|
|
131
141
|
sorted_candidates = sorted(
|
|
132
142
|
scored_candidates,
|
|
@@ -136,8 +146,8 @@ class TfVectorBackend:
|
|
|
136
146
|
evidence_item.model_copy(
|
|
137
147
|
update={
|
|
138
148
|
"rank": index,
|
|
139
|
-
"
|
|
140
|
-
"
|
|
149
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
150
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
141
151
|
}
|
|
142
152
|
)
|
|
143
153
|
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
@@ -147,9 +157,9 @@ class TfVectorBackend:
|
|
|
147
157
|
return RetrievalResult(
|
|
148
158
|
query_text=query_text,
|
|
149
159
|
budget=budget,
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
160
|
+
snapshot_id=snapshot.snapshot_id,
|
|
161
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
162
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
153
163
|
generated_at=utc_now_iso(),
|
|
154
164
|
evidence=evidence,
|
|
155
165
|
stats=stats,
|
|
@@ -157,33 +167,33 @@ class TfVectorBackend:
|
|
|
157
167
|
|
|
158
168
|
|
|
159
169
|
def _resolve_extraction_reference(
|
|
160
|
-
corpus: Corpus,
|
|
161
|
-
) -> Optional[
|
|
170
|
+
corpus: Corpus, configuration: TfVectorConfiguration
|
|
171
|
+
) -> Optional[ExtractionSnapshotReference]:
|
|
162
172
|
"""
|
|
163
|
-
Resolve an extraction
|
|
173
|
+
Resolve an extraction snapshot reference from a configuration.
|
|
164
174
|
|
|
165
|
-
:param corpus: Corpus associated with the
|
|
175
|
+
:param corpus: Corpus associated with the configuration.
|
|
166
176
|
:type corpus: Corpus
|
|
167
|
-
:param
|
|
168
|
-
:type
|
|
177
|
+
:param configuration: Parsed vector configuration.
|
|
178
|
+
:type configuration: TfVectorConfiguration
|
|
169
179
|
:return: Parsed extraction reference or None.
|
|
170
|
-
:rtype:
|
|
171
|
-
:raises FileNotFoundError: If an extraction
|
|
180
|
+
:rtype: ExtractionSnapshotReference or None
|
|
181
|
+
:raises FileNotFoundError: If an extraction snapshot is referenced but not present.
|
|
172
182
|
"""
|
|
173
|
-
if not
|
|
183
|
+
if not configuration.extraction_snapshot:
|
|
174
184
|
return None
|
|
175
|
-
extraction_reference =
|
|
176
|
-
|
|
185
|
+
extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
|
|
186
|
+
snapshot_dir = corpus.extraction_snapshot_dir(
|
|
177
187
|
extractor_id=extraction_reference.extractor_id,
|
|
178
|
-
|
|
188
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
179
189
|
)
|
|
180
|
-
if not
|
|
181
|
-
raise FileNotFoundError(f"Missing extraction
|
|
190
|
+
if not snapshot_dir.is_dir():
|
|
191
|
+
raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
|
|
182
192
|
return extraction_reference
|
|
183
193
|
|
|
184
194
|
|
|
185
195
|
def _count_text_items(
|
|
186
|
-
corpus: Corpus, items: Iterable[object],
|
|
196
|
+
corpus: Corpus, items: Iterable[object], configuration: TfVectorConfiguration
|
|
187
197
|
) -> int:
|
|
188
198
|
"""
|
|
189
199
|
Count catalog items that represent text content.
|
|
@@ -192,19 +202,19 @@ def _count_text_items(
|
|
|
192
202
|
:type corpus: Corpus
|
|
193
203
|
:param items: Catalog items to inspect.
|
|
194
204
|
:type items: Iterable[object]
|
|
195
|
-
:param
|
|
196
|
-
:type
|
|
205
|
+
:param configuration: Parsed vector configuration.
|
|
206
|
+
:type configuration: TfVectorConfiguration
|
|
197
207
|
:return: Number of text items.
|
|
198
208
|
:rtype: int
|
|
199
209
|
"""
|
|
200
210
|
text_item_count = 0
|
|
201
|
-
extraction_reference = _resolve_extraction_reference(corpus,
|
|
211
|
+
extraction_reference = _resolve_extraction_reference(corpus, configuration)
|
|
202
212
|
for catalog_item in items:
|
|
203
213
|
item_id = str(getattr(catalog_item, "id", ""))
|
|
204
214
|
if extraction_reference and item_id:
|
|
205
215
|
extracted_text = corpus.read_extracted_text(
|
|
206
216
|
extractor_id=extraction_reference.extractor_id,
|
|
207
|
-
|
|
217
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
208
218
|
item_id=item_id,
|
|
209
219
|
)
|
|
210
220
|
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
@@ -292,7 +302,7 @@ def _load_text_from_item(
|
|
|
292
302
|
item_id: str,
|
|
293
303
|
relpath: str,
|
|
294
304
|
media_type: str,
|
|
295
|
-
extraction_reference: Optional[
|
|
305
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
296
306
|
) -> Optional[str]:
|
|
297
307
|
"""
|
|
298
308
|
Load a text payload from a catalog item.
|
|
@@ -305,15 +315,15 @@ def _load_text_from_item(
|
|
|
305
315
|
:type relpath: str
|
|
306
316
|
:param media_type: Media type for the stored content.
|
|
307
317
|
:type media_type: str
|
|
308
|
-
:param extraction_reference: Optional extraction
|
|
309
|
-
:type extraction_reference:
|
|
318
|
+
:param extraction_reference: Optional extraction snapshot reference.
|
|
319
|
+
:type extraction_reference: ExtractionSnapshotReference or None
|
|
310
320
|
:return: Text payload or None if not decodable as text.
|
|
311
321
|
:rtype: str or None
|
|
312
322
|
"""
|
|
313
323
|
if extraction_reference:
|
|
314
324
|
extracted_text = corpus.read_extracted_text(
|
|
315
325
|
extractor_id=extraction_reference.extractor_id,
|
|
316
|
-
|
|
326
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
317
327
|
item_id=item_id,
|
|
318
328
|
)
|
|
319
329
|
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
@@ -359,21 +369,13 @@ def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]
|
|
|
359
369
|
return best_start, best_end
|
|
360
370
|
|
|
361
371
|
|
|
362
|
-
def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
:param text: Source text to slice.
|
|
367
|
-
:type text: str
|
|
368
|
-
:param span: Match span to center on.
|
|
369
|
-
:type span: tuple[int, int] or None
|
|
370
|
-
:param max_chars: Maximum snippet length.
|
|
371
|
-
:type max_chars: int
|
|
372
|
-
:return: Snippet text.
|
|
373
|
-
:rtype: str
|
|
374
|
-
"""
|
|
372
|
+
def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: Optional[int]) -> str:
|
|
373
|
+
if max_chars is None:
|
|
374
|
+
return text
|
|
375
375
|
if not text:
|
|
376
376
|
return ""
|
|
377
|
+
if max_chars <= 0:
|
|
378
|
+
return ""
|
|
377
379
|
if span is None:
|
|
378
380
|
return text[:max_chars]
|
|
379
381
|
span_start, span_end = span
|
|
@@ -390,8 +392,8 @@ def _score_items(
|
|
|
390
392
|
query_tokens: List[str],
|
|
391
393
|
query_vector: Dict[str, float],
|
|
392
394
|
query_norm: float,
|
|
393
|
-
|
|
394
|
-
|
|
395
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
396
|
+
snippet_characters: Optional[int],
|
|
395
397
|
) -> List[Evidence]:
|
|
396
398
|
"""
|
|
397
399
|
Score catalog items and return evidence candidates.
|
|
@@ -406,10 +408,10 @@ def _score_items(
|
|
|
406
408
|
:type query_vector: dict[str, float]
|
|
407
409
|
:param query_norm: Query vector norm.
|
|
408
410
|
:type query_norm: float
|
|
409
|
-
:param
|
|
410
|
-
:type
|
|
411
|
-
:param
|
|
412
|
-
:type
|
|
411
|
+
:param extraction_reference: Optional extraction snapshot reference.
|
|
412
|
+
:type extraction_reference: ExtractionSnapshotReference or None
|
|
413
|
+
:param snippet_characters: Optional maximum character count for returned evidence text.
|
|
414
|
+
:type snippet_characters: int or None
|
|
413
415
|
:return: Evidence candidates with provisional ranks.
|
|
414
416
|
:rtype: list[Evidence]
|
|
415
417
|
"""
|
|
@@ -437,9 +439,9 @@ def _score_items(
|
|
|
437
439
|
if similarity <= 0:
|
|
438
440
|
continue
|
|
439
441
|
span = _find_first_match(item_text, query_tokens)
|
|
440
|
-
snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
|
|
441
442
|
span_start = span[0] if span else None
|
|
442
443
|
span_end = span[1] if span else None
|
|
444
|
+
evidence_text = _build_snippet(item_text, span, max_chars=snippet_characters)
|
|
443
445
|
evidence_items.append(
|
|
444
446
|
Evidence(
|
|
445
447
|
item_id=str(getattr(catalog_item, "id")),
|
|
@@ -447,14 +449,15 @@ def _score_items(
|
|
|
447
449
|
media_type=str(media_type),
|
|
448
450
|
score=float(similarity),
|
|
449
451
|
rank=1,
|
|
450
|
-
text=
|
|
452
|
+
text=evidence_text,
|
|
451
453
|
content_ref=None,
|
|
452
454
|
span_start=span_start,
|
|
453
455
|
span_end=span_end,
|
|
454
456
|
stage="tf-vector",
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
457
|
+
configuration_id="",
|
|
458
|
+
snapshot_id="",
|
|
459
|
+
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
460
|
+
hash=hash_text(evidence_text or ""),
|
|
458
461
|
)
|
|
459
462
|
)
|
|
460
463
|
return evidence_items
|
biblicus/sources.py
CHANGED
|
@@ -8,7 +8,7 @@ import mimetypes
|
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Optional
|
|
11
|
-
from urllib.parse import unquote, urlparse
|
|
11
|
+
from urllib.parse import quote, unquote, urlparse
|
|
12
12
|
from urllib.request import Request, urlopen
|
|
13
13
|
|
|
14
14
|
|
|
@@ -37,6 +37,27 @@ def _filename_from_url_path(path: str) -> str:
|
|
|
37
37
|
return filename or "download"
|
|
38
38
|
|
|
39
39
|
|
|
40
|
+
def _sanitize_filename_component(name: str) -> str:
|
|
41
|
+
allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
|
|
42
|
+
sanitized_name = "".join(
|
|
43
|
+
(character if character in allowed_characters else "_") for character in name
|
|
44
|
+
).strip()
|
|
45
|
+
return sanitized_name or "file"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _namespaced_filename(
|
|
49
|
+
*, source_uri: Optional[str], fallback_name: Optional[str], media_type: str
|
|
50
|
+
) -> str:
|
|
51
|
+
base_name = ""
|
|
52
|
+
if source_uri:
|
|
53
|
+
base_name = quote(source_uri, safe="")
|
|
54
|
+
if not base_name and fallback_name:
|
|
55
|
+
base_name = _sanitize_filename_component(fallback_name)
|
|
56
|
+
if not base_name:
|
|
57
|
+
base_name = "file"
|
|
58
|
+
return _ensure_extension_for_media_type(base_name, media_type)
|
|
59
|
+
|
|
60
|
+
|
|
40
61
|
def _media_type_from_filename(name: str) -> str:
|
|
41
62
|
"""
|
|
42
63
|
Guess media type from a filename.
|
|
@@ -119,8 +140,16 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
|
119
140
|
"""
|
|
120
141
|
if Path(filename).suffix:
|
|
121
142
|
return filename
|
|
122
|
-
|
|
123
|
-
|
|
143
|
+
media_type_overrides = {
|
|
144
|
+
"audio/mpeg": ".mp3",
|
|
145
|
+
"audio/ogg": ".ogg",
|
|
146
|
+
"audio/wav": ".wav",
|
|
147
|
+
"audio/x-wav": ".wav",
|
|
148
|
+
"image/jpeg": ".jpg",
|
|
149
|
+
"text/html": ".html",
|
|
150
|
+
}
|
|
151
|
+
if media_type in media_type_overrides:
|
|
152
|
+
ext = media_type_overrides[media_type]
|
|
124
153
|
else:
|
|
125
154
|
ext = mimetypes.guess_extension(media_type) or ""
|
|
126
155
|
return filename + ext if ext else filename
|
|
@@ -165,11 +194,12 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
165
194
|
media_type = _media_type_from_filename(path.name)
|
|
166
195
|
if path.suffix.lower() in {".md", ".markdown"}:
|
|
167
196
|
media_type = "text/markdown"
|
|
197
|
+
resolved_source_uri = source_uri or path.as_uri()
|
|
168
198
|
return SourcePayload(
|
|
169
199
|
data=path.read_bytes(),
|
|
170
200
|
filename=path.name,
|
|
171
201
|
media_type=media_type,
|
|
172
|
-
source_uri=
|
|
202
|
+
source_uri=resolved_source_uri,
|
|
173
203
|
)
|
|
174
204
|
|
|
175
205
|
if _looks_like_uri(source):
|
|
@@ -187,21 +217,26 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
187
217
|
with urlopen(request, timeout=30) as response:
|
|
188
218
|
response_bytes = response.read()
|
|
189
219
|
content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
|
|
190
|
-
|
|
191
|
-
media_type = content_type or _media_type_from_filename(
|
|
220
|
+
fallback_filename = _filename_from_url_path(parsed.path)
|
|
221
|
+
media_type = content_type or _media_type_from_filename(fallback_filename)
|
|
192
222
|
if media_type == "application/octet-stream":
|
|
193
223
|
sniffed = _sniff_media_type_from_bytes(response_bytes)
|
|
194
224
|
if sniffed:
|
|
195
225
|
media_type = sniffed
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
226
|
+
fallback_filename = _ensure_extension_for_media_type(
|
|
227
|
+
fallback_filename, media_type
|
|
228
|
+
)
|
|
229
|
+
media_type = _normalize_media_type(
|
|
230
|
+
filename=fallback_filename, media_type=media_type
|
|
231
|
+
)
|
|
232
|
+
if Path(fallback_filename).suffix.lower() in {".md", ".markdown"}:
|
|
199
233
|
media_type = "text/markdown"
|
|
234
|
+
resolved_source_uri = source_uri or source
|
|
200
235
|
return SourcePayload(
|
|
201
236
|
data=response_bytes,
|
|
202
|
-
filename=
|
|
237
|
+
filename=fallback_filename,
|
|
203
238
|
media_type=media_type,
|
|
204
|
-
source_uri=
|
|
239
|
+
source_uri=resolved_source_uri,
|
|
205
240
|
)
|
|
206
241
|
|
|
207
242
|
raise NotImplementedError(
|
biblicus/text/link.py
CHANGED
|
@@ -159,6 +159,8 @@ def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
|
|
|
159
159
|
|
|
160
160
|
|
|
161
161
|
def _validate_replace_text(old_str: str, new_str: str) -> None:
|
|
162
|
+
if "<span" in old_str or "</span>" in old_str:
|
|
163
|
+
raise ValueError("Text link replacements must target plain text without span tags")
|
|
162
164
|
if strip_span_tags(old_str) != strip_span_tags(new_str):
|
|
163
165
|
raise ValueError("Text link replacements may only insert span tags")
|
|
164
166
|
|
|
@@ -460,12 +462,16 @@ def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: st
|
|
|
460
462
|
error_lines = "\n".join(f"- {error}" for error in errors)
|
|
461
463
|
context_section = build_span_context_section(current_text, errors)
|
|
462
464
|
coverage_guidance = _build_coverage_guidance(errors)
|
|
465
|
+
nested_guidance = ""
|
|
466
|
+
if any("nested span" in error for error in errors):
|
|
467
|
+
nested_guidance = "Do not create nested or overlapping spans. Remove nested spans and wrap only bare text.\n"
|
|
463
468
|
return (
|
|
464
469
|
"Your last edit did not validate.\n"
|
|
465
470
|
"Issues:\n"
|
|
466
471
|
f"{error_lines}\n\n"
|
|
467
472
|
f"{context_section}"
|
|
468
473
|
f"{coverage_guidance}"
|
|
474
|
+
f"{nested_guidance}"
|
|
469
475
|
"Please fix the markup using str_replace. Use id for first mentions and ref for repeats. "
|
|
470
476
|
"Reuse the same id for identical names and do not assign multiple ids to the same name. "
|
|
471
477
|
f"Ids must start with '{id_prefix}'. Try again.\n"
|
biblicus/text/prompts.py
CHANGED
|
@@ -11,14 +11,16 @@ DEFAULT_EXTRACT_SYSTEM_PROMPT = (
|
|
|
11
11
|
"Interpret the word 'return' in the user's request as: wrap the returned text with "
|
|
12
12
|
"<span>...</span> in-place in the current text.\n\n"
|
|
13
13
|
"Use the str_replace tool to insert <span>...</span> tags and the done tool when finished.\n"
|
|
14
|
+
"For long spans, insert <span> and </span> using separate str_replace calls. "
|
|
15
|
+
"For short spans (a few words), it is acceptable to insert both tags in one call.\n"
|
|
14
16
|
"When finished, call done. Do NOT return JSON in the assistant message.\n\n"
|
|
15
17
|
"Rules:\n"
|
|
16
18
|
"- Use str_replace only.\n"
|
|
17
19
|
"- old_str must match exactly once in the current text.\n"
|
|
18
20
|
"- When choosing old_str, copy the exact substring (including punctuation/case) from the current text.\n"
|
|
19
21
|
"- old_str and new_str must be non-empty strings.\n"
|
|
20
|
-
"- new_str must be identical to old_str with only <span> and </span> inserted.\n"
|
|
21
|
-
"- Do not include <span> or </span> inside old_str
|
|
22
|
+
"- new_str must be identical to old_str with only <span> and/or </span> inserted.\n"
|
|
23
|
+
"- Do not include <span> or </span> inside old_str.\n"
|
|
22
24
|
"- Do not insert nested spans.\n"
|
|
23
25
|
"- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
|
|
24
26
|
"- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
|
|
@@ -49,14 +51,18 @@ DEFAULT_ANNOTATE_SYSTEM_PROMPT = (
|
|
|
49
51
|
'<span ATTRIBUTE="VALUE">...</span> in-place in the current text.\n'
|
|
50
52
|
"Each span must include exactly one attribute from: {{ allowed_attributes }}.\n\n"
|
|
51
53
|
"Use the str_replace tool to insert span tags and the done tool when finished.\n"
|
|
54
|
+
"For long spans, insert the opening and closing tags using separate str_replace calls. "
|
|
55
|
+
"For short spans (a few words), it is acceptable to insert both tags in one call.\n"
|
|
52
56
|
"When finished, call done. Do NOT return JSON in the assistant message.\n\n"
|
|
53
57
|
"Rules:\n"
|
|
54
58
|
"- Use str_replace only.\n"
|
|
55
59
|
"- old_str must match exactly once in the current text.\n"
|
|
56
60
|
"- old_str and new_str must be non-empty strings.\n"
|
|
57
|
-
"- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
|
|
58
|
-
"- Do not include <span or </span> inside old_str
|
|
61
|
+
"- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
|
|
62
|
+
"- Do not include <span or </span> inside old_str.\n"
|
|
59
63
|
"- Do not insert nested spans.\n"
|
|
64
|
+
"- Do not wrap text that is already inside a span; spans must never overlap.\n"
|
|
65
|
+
"- If a name appears inside an existing span, leave it alone and wrap only bare text.\n"
|
|
60
66
|
"- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
|
|
61
67
|
"- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
|
|
62
68
|
"- Do not delete, reorder, paraphrase, or label text beyond the span attributes.\n\n"
|
|
@@ -78,13 +84,15 @@ DEFAULT_LINK_SYSTEM_PROMPT = (
|
|
|
78
84
|
"- Do not call done until every repeated name or entity in the text is wrapped.\n"
|
|
79
85
|
"- If a name appears multiple times, there must be one id and refs for every later occurrence.\n\n"
|
|
80
86
|
"Use the str_replace tool to insert span tags and the done tool when finished.\n"
|
|
87
|
+
"For long spans, insert the opening and closing tags using separate str_replace calls. "
|
|
88
|
+
"For short spans (a few words), it is acceptable to insert both tags in one call.\n"
|
|
81
89
|
"When finished, call done. Do NOT return JSON in the assistant message.\n\n"
|
|
82
90
|
"Rules:\n"
|
|
83
91
|
"- Use str_replace only.\n"
|
|
84
92
|
"- old_str must match exactly once in the current text.\n"
|
|
85
93
|
"- old_str and new_str must be non-empty strings.\n"
|
|
86
|
-
"- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
|
|
87
|
-
"- Do not include <span or </span> inside old_str
|
|
94
|
+
"- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
|
|
95
|
+
"- Do not include <span or </span> inside old_str.\n"
|
|
88
96
|
"- Do not insert nested spans.\n"
|
|
89
97
|
"- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
|
|
90
98
|
"- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
|
|
@@ -98,13 +106,15 @@ DEFAULT_REDACT_SYSTEM_PROMPT = (
|
|
|
98
106
|
"<span>...</span> in-place in the current text.\n"
|
|
99
107
|
"If redaction types are provided, use a redact attribute with one of: {{ redaction_types }}.\n\n"
|
|
100
108
|
"Use the str_replace tool to insert span tags and the done tool when finished.\n"
|
|
109
|
+
"For long spans, insert the opening and closing tags using separate str_replace calls. "
|
|
110
|
+
"For short spans (a few words), it is acceptable to insert both tags in one call.\n"
|
|
101
111
|
"When finished, call done. Do NOT return JSON in the assistant message.\n\n"
|
|
102
112
|
"Rules:\n"
|
|
103
113
|
"- Use str_replace only.\n"
|
|
104
114
|
"- old_str must match exactly once in the current text.\n"
|
|
105
115
|
"- old_str and new_str must be non-empty strings.\n"
|
|
106
|
-
"- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
|
|
107
|
-
"- Do not include <span or </span> inside old_str
|
|
116
|
+
"- new_str must be identical to old_str with only <span ...> and/or </span> inserted.\n"
|
|
117
|
+
"- Do not include <span or </span> inside old_str.\n"
|
|
108
118
|
"- Do not insert nested spans.\n"
|
|
109
119
|
"- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
|
|
110
120
|
"- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
|