biblicus 0.16.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +21 -1
- biblicus/backends/embedding_index_common.py +36 -3
- biblicus/backends/embedding_index_file.py +11 -5
- biblicus/backends/embedding_index_inmemory.py +14 -12
- biblicus/backends/hybrid.py +4 -3
- biblicus/backends/scan.py +1 -0
- biblicus/backends/tf_vector.py +17 -24
- biblicus/cli.py +25 -15
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1060 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +129 -0
- biblicus/corpus.py +117 -16
- biblicus/errors.py +24 -0
- biblicus/knowledge_base.py +1 -1
- biblicus/models.py +6 -3
- biblicus/retrieval.py +2 -2
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +2 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/METADATA +3 -3
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/RECORD +28 -23
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
|
@@ -11,6 +11,7 @@ import shutil
|
|
|
11
11
|
import uuid
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from typing import Any, Dict, List, Optional, Sequence
|
|
14
|
+
from urllib.parse import quote, unquote, urlparse
|
|
14
15
|
|
|
15
16
|
import yaml
|
|
16
17
|
from pydantic import ValidationError
|
|
@@ -24,6 +25,7 @@ from .constants import (
|
|
|
24
25
|
SCHEMA_VERSION,
|
|
25
26
|
SIDECAR_SUFFIX,
|
|
26
27
|
)
|
|
28
|
+
from .errors import IngestCollisionError
|
|
27
29
|
from .frontmatter import parse_front_matter, render_front_matter
|
|
28
30
|
from .hook_manager import HookManager
|
|
29
31
|
from .hooks import HookPoint
|
|
@@ -110,7 +112,10 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
|
|
|
110
112
|
"""
|
|
111
113
|
media_type_overrides = {
|
|
112
114
|
"image/jpeg": ".jpg",
|
|
115
|
+
"audio/mpeg": ".mp3",
|
|
113
116
|
"audio/ogg": ".ogg",
|
|
117
|
+
"audio/wav": ".wav",
|
|
118
|
+
"audio/x-wav": ".wav",
|
|
114
119
|
}
|
|
115
120
|
if media_type in media_type_overrides:
|
|
116
121
|
return media_type_overrides[media_type]
|
|
@@ -136,7 +141,16 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
136
141
|
return raw_name + ".md"
|
|
137
142
|
|
|
138
143
|
if Path(raw_name).suffix:
|
|
139
|
-
|
|
144
|
+
if "%2F" in raw_name or "%3A" in raw_name:
|
|
145
|
+
decoded = unquote(raw_name)
|
|
146
|
+
parsed = urlparse(decoded)
|
|
147
|
+
decoded_path = parsed.path if parsed.scheme else decoded
|
|
148
|
+
if not Path(decoded_path).suffix:
|
|
149
|
+
pass
|
|
150
|
+
else:
|
|
151
|
+
return raw_name
|
|
152
|
+
else:
|
|
153
|
+
return raw_name
|
|
140
154
|
|
|
141
155
|
ext = _preferred_extension_for_media_type(media_type)
|
|
142
156
|
if not ext:
|
|
@@ -144,6 +158,55 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
144
158
|
return raw_name + ext
|
|
145
159
|
|
|
146
160
|
|
|
161
|
+
def _encode_source_uri_for_filename(source_uri: str) -> str:
|
|
162
|
+
"""
|
|
163
|
+
Percent-encode a source uniform resource identifier for filename use.
|
|
164
|
+
|
|
165
|
+
:param source_uri: Source uniform resource identifier to encode.
|
|
166
|
+
:type source_uri: str
|
|
167
|
+
:return: Percent-encoded uniform resource identifier safe for filenames.
|
|
168
|
+
:rtype: str
|
|
169
|
+
"""
|
|
170
|
+
return quote(source_uri, safe="")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _storage_filename_for_ingest(
|
|
174
|
+
*, filename: Optional[str], media_type: str, source_uri: Optional[str]
|
|
175
|
+
) -> str:
|
|
176
|
+
"""
|
|
177
|
+
Derive a collision-safe filename for corpus storage.
|
|
178
|
+
|
|
179
|
+
If a source uniform resource identifier is provided, the full uniform resource identifier is
|
|
180
|
+
percent-encoded to namespace the stored file, preventing collisions between identical basenames
|
|
181
|
+
from different sources. When no uniform resource identifier is available, fall back to a
|
|
182
|
+
sanitized filename.
|
|
183
|
+
|
|
184
|
+
:param filename: Optional filename hint from the caller.
|
|
185
|
+
:type filename: str or None
|
|
186
|
+
:param media_type: Media type of the payload.
|
|
187
|
+
:type media_type: str
|
|
188
|
+
:param source_uri: Optional source uniform resource identifier for provenance.
|
|
189
|
+
:type source_uri: str or None
|
|
190
|
+
:return: Storage filename with an appropriate extension, or an empty string when no hint exists.
|
|
191
|
+
:rtype: str
|
|
192
|
+
"""
|
|
193
|
+
base_name = ""
|
|
194
|
+
if source_uri:
|
|
195
|
+
base_name = _encode_source_uri_for_filename(source_uri)
|
|
196
|
+
if filename and not source_uri.startswith("file:"):
|
|
197
|
+
sanitized = _sanitize_filename(filename)
|
|
198
|
+
if sanitized:
|
|
199
|
+
base_name = f"{base_name}--{sanitized}"
|
|
200
|
+
if not base_name and filename:
|
|
201
|
+
base_name = _sanitize_filename(filename)
|
|
202
|
+
if not base_name:
|
|
203
|
+
return ""
|
|
204
|
+
if len(base_name) > 180:
|
|
205
|
+
digest = hashlib.sha256(base_name.encode("utf-8")).hexdigest()
|
|
206
|
+
base_name = f"hash-{digest}"
|
|
207
|
+
return _ensure_filename_extension(base_name, media_type=media_type)
|
|
208
|
+
|
|
209
|
+
|
|
147
210
|
def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
|
|
148
211
|
"""
|
|
149
212
|
Merge tags from explicit input and front matter values.
|
|
@@ -520,6 +583,24 @@ class Corpus:
|
|
|
520
583
|
temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
521
584
|
temp_path.replace(self.catalog_path)
|
|
522
585
|
|
|
586
|
+
def _find_item_by_source_uri(self, source_uri: str) -> Optional[CatalogItem]:
|
|
587
|
+
"""
|
|
588
|
+
Locate an existing catalog item by source uniform resource identifier.
|
|
589
|
+
|
|
590
|
+
:param source_uri: Source uniform resource identifier to search for.
|
|
591
|
+
:type source_uri: str
|
|
592
|
+
:return: Matching catalog item or None.
|
|
593
|
+
:rtype: CatalogItem or None
|
|
594
|
+
"""
|
|
595
|
+
if not source_uri:
|
|
596
|
+
return None
|
|
597
|
+
self._init_catalog()
|
|
598
|
+
catalog = self._load_catalog()
|
|
599
|
+
for item in catalog.items.values():
|
|
600
|
+
if item.source_uri == source_uri:
|
|
601
|
+
return item
|
|
602
|
+
return None
|
|
603
|
+
|
|
523
604
|
@property
|
|
524
605
|
def runs_dir(self) -> Path:
|
|
525
606
|
"""
|
|
@@ -817,18 +898,26 @@ class Corpus:
|
|
|
817
898
|
:return: Ingestion result summary.
|
|
818
899
|
:rtype: IngestResult
|
|
819
900
|
:raises ValueError: If markdown is not Unicode Transformation Format 8.
|
|
901
|
+
:raises IngestCollisionError: If a source uniform resource identifier is already ingested.
|
|
820
902
|
"""
|
|
821
|
-
|
|
822
|
-
|
|
903
|
+
existing_item = self._find_item_by_source_uri(source_uri)
|
|
904
|
+
if existing_item is not None:
|
|
905
|
+
raise IngestCollisionError(
|
|
906
|
+
source_uri=source_uri,
|
|
907
|
+
existing_item_id=existing_item.id,
|
|
908
|
+
existing_relpath=existing_item.relpath,
|
|
909
|
+
)
|
|
823
910
|
|
|
824
|
-
|
|
825
|
-
|
|
911
|
+
item_id = str(uuid.uuid4())
|
|
912
|
+
storage_filename = _storage_filename_for_ingest(
|
|
913
|
+
filename=filename, media_type=media_type, source_uri=source_uri
|
|
914
|
+
)
|
|
826
915
|
|
|
827
916
|
if media_type == "text/markdown":
|
|
828
|
-
output_name = f"{item_id}--{
|
|
917
|
+
output_name = f"{item_id}--{storage_filename}" if storage_filename else f"{item_id}.md"
|
|
829
918
|
else:
|
|
830
|
-
if
|
|
831
|
-
output_name = f"{item_id}--{
|
|
919
|
+
if storage_filename:
|
|
920
|
+
output_name = f"{item_id}--{storage_filename}"
|
|
832
921
|
else:
|
|
833
922
|
extension = _preferred_extension_for_media_type(media_type) or ""
|
|
834
923
|
output_name = f"{item_id}{extension}" if extension else f"{item_id}"
|
|
@@ -991,13 +1080,21 @@ class Corpus:
|
|
|
991
1080
|
if media_type == "text/markdown":
|
|
992
1081
|
raise ValueError("Stream ingestion is not supported for Markdown")
|
|
993
1082
|
|
|
1083
|
+
existing_item = self._find_item_by_source_uri(source_uri)
|
|
1084
|
+
if existing_item is not None:
|
|
1085
|
+
raise IngestCollisionError(
|
|
1086
|
+
source_uri=source_uri,
|
|
1087
|
+
existing_item_id=existing_item.id,
|
|
1088
|
+
existing_relpath=existing_item.relpath,
|
|
1089
|
+
)
|
|
1090
|
+
|
|
994
1091
|
item_id = str(uuid.uuid4())
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
1092
|
+
storage_filename = _storage_filename_for_ingest(
|
|
1093
|
+
filename=filename, media_type=media_type, source_uri=source_uri
|
|
1094
|
+
)
|
|
998
1095
|
|
|
999
|
-
if
|
|
1000
|
-
output_name = f"{item_id}--{
|
|
1096
|
+
if storage_filename:
|
|
1097
|
+
output_name = f"{item_id}--{storage_filename}"
|
|
1001
1098
|
else:
|
|
1002
1099
|
extension = _preferred_extension_for_media_type(media_type) or ""
|
|
1003
1100
|
output_name = f"{item_id}{extension}" if extension else f"{item_id}"
|
|
@@ -1085,7 +1182,7 @@ class Corpus:
|
|
|
1085
1182
|
*,
|
|
1086
1183
|
title: Optional[str] = None,
|
|
1087
1184
|
tags: Sequence[str] = (),
|
|
1088
|
-
source_uri: str =
|
|
1185
|
+
source_uri: Optional[str] = None,
|
|
1089
1186
|
) -> IngestResult:
|
|
1090
1187
|
"""
|
|
1091
1188
|
Ingest a text note as Markdown.
|
|
@@ -1096,11 +1193,15 @@ class Corpus:
|
|
|
1096
1193
|
:type title: str or None
|
|
1097
1194
|
:param tags: Tags to associate with the note.
|
|
1098
1195
|
:type tags: Sequence[str]
|
|
1099
|
-
:param source_uri:
|
|
1100
|
-
:type source_uri: str
|
|
1196
|
+
:param source_uri: Optional source uniform resource identifier for provenance.
|
|
1197
|
+
:type source_uri: str or None
|
|
1101
1198
|
:return: Ingestion result summary.
|
|
1102
1199
|
:rtype: IngestResult
|
|
1103
1200
|
"""
|
|
1201
|
+
if source_uri is None:
|
|
1202
|
+
digest_source = (title or "") + "\n" + text
|
|
1203
|
+
digest = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()
|
|
1204
|
+
source_uri = f"text:{digest}"
|
|
1104
1205
|
data = text.encode("utf-8")
|
|
1105
1206
|
return self.ingest_item(
|
|
1106
1207
|
data,
|
biblicus/errors.py
CHANGED
|
@@ -13,3 +13,27 @@ class ExtractionRunFatalError(RuntimeError):
|
|
|
13
13
|
rather than a per-item extraction failure. For example, a selection extractor that depends
|
|
14
14
|
on referenced extraction run manifests treats missing manifests as fatal.
|
|
15
15
|
"""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class IngestCollisionError(RuntimeError):
|
|
19
|
+
"""
|
|
20
|
+
Ingest collision for an already ingested source.
|
|
21
|
+
|
|
22
|
+
:param source_uri: Source uniform resource identifier that caused the collision.
|
|
23
|
+
:type source_uri: str
|
|
24
|
+
:param existing_item_id: Identifier of the existing catalog item.
|
|
25
|
+
:type existing_item_id: str
|
|
26
|
+
:param existing_relpath: Raw storage relpath of the existing item.
|
|
27
|
+
:type existing_relpath: str
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def __init__(self, *, source_uri: str, existing_item_id: str, existing_relpath: str) -> None:
|
|
31
|
+
self.source_uri = source_uri
|
|
32
|
+
self.existing_item_id = existing_item_id
|
|
33
|
+
self.existing_relpath = existing_relpath
|
|
34
|
+
message = (
|
|
35
|
+
"Source already ingested"
|
|
36
|
+
f": source_uri={source_uri} existing_item_id={existing_item_id}"
|
|
37
|
+
f" existing_relpath={existing_relpath}"
|
|
38
|
+
)
|
|
39
|
+
super().__init__(message)
|
biblicus/knowledge_base.py
CHANGED
biblicus/models.py
CHANGED
|
@@ -234,8 +234,8 @@ class QueryBudget(BaseModel):
|
|
|
234
234
|
This enables simple pagination by re-running the same query with a
|
|
235
235
|
higher offset.
|
|
236
236
|
:vartype offset: int
|
|
237
|
-
:ivar
|
|
238
|
-
:vartype
|
|
237
|
+
:ivar maximum_total_characters: Optional maximum total characters across evidence text.
|
|
238
|
+
:vartype maximum_total_characters: int or None
|
|
239
239
|
:ivar max_items_per_source: Optional cap per source uniform resource identifier.
|
|
240
240
|
:vartype max_items_per_source: int or None
|
|
241
241
|
"""
|
|
@@ -244,7 +244,7 @@ class QueryBudget(BaseModel):
|
|
|
244
244
|
|
|
245
245
|
max_total_items: int = Field(ge=1)
|
|
246
246
|
offset: int = Field(default=0, ge=0)
|
|
247
|
-
|
|
247
|
+
maximum_total_characters: Optional[int] = Field(default=None, ge=1)
|
|
248
248
|
max_items_per_source: Optional[int] = Field(default=None, ge=1)
|
|
249
249
|
|
|
250
250
|
|
|
@@ -278,6 +278,8 @@ class Evidence(BaseModel):
|
|
|
278
278
|
:vartype recipe_id: str
|
|
279
279
|
:ivar run_id: Retrieval run identifier.
|
|
280
280
|
:vartype run_id: str
|
|
281
|
+
:ivar metadata: Optional metadata payload from the catalog item.
|
|
282
|
+
:vartype metadata: dict[str, Any]
|
|
281
283
|
:ivar hash: Optional content hash for provenance.
|
|
282
284
|
:vartype hash: str or None
|
|
283
285
|
"""
|
|
@@ -297,6 +299,7 @@ class Evidence(BaseModel):
|
|
|
297
299
|
stage_scores: Optional[Dict[str, float]] = None
|
|
298
300
|
recipe_id: str
|
|
299
301
|
run_id: str
|
|
302
|
+
metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
300
303
|
hash: Optional[str] = None
|
|
301
304
|
|
|
302
305
|
@model_validator(mode="after")
|
biblicus/retrieval.py
CHANGED
|
@@ -124,8 +124,8 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
|
|
|
124
124
|
continue
|
|
125
125
|
|
|
126
126
|
text_character_count = len(candidate_evidence.text or "")
|
|
127
|
-
if budget.
|
|
128
|
-
if total_characters + text_character_count > budget.
|
|
127
|
+
if budget.maximum_total_characters is not None:
|
|
128
|
+
if total_characters + text_character_count > budget.maximum_total_characters:
|
|
129
129
|
continue
|
|
130
130
|
|
|
131
131
|
selected_evidence.append(candidate_evidence)
|
biblicus/sources.py
CHANGED
|
@@ -8,7 +8,7 @@ import mimetypes
|
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Optional
|
|
11
|
-
from urllib.parse import unquote, urlparse
|
|
11
|
+
from urllib.parse import quote, unquote, urlparse
|
|
12
12
|
from urllib.request import Request, urlopen
|
|
13
13
|
|
|
14
14
|
|
|
@@ -37,6 +37,27 @@ def _filename_from_url_path(path: str) -> str:
|
|
|
37
37
|
return filename or "download"
|
|
38
38
|
|
|
39
39
|
|
|
40
|
+
def _sanitize_filename_component(name: str) -> str:
|
|
41
|
+
allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
|
|
42
|
+
sanitized_name = "".join(
|
|
43
|
+
(character if character in allowed_characters else "_") for character in name
|
|
44
|
+
).strip()
|
|
45
|
+
return sanitized_name or "file"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _namespaced_filename(
|
|
49
|
+
*, source_uri: Optional[str], fallback_name: Optional[str], media_type: str
|
|
50
|
+
) -> str:
|
|
51
|
+
base_name = ""
|
|
52
|
+
if source_uri:
|
|
53
|
+
base_name = quote(source_uri, safe="")
|
|
54
|
+
if not base_name and fallback_name:
|
|
55
|
+
base_name = _sanitize_filename_component(fallback_name)
|
|
56
|
+
if not base_name:
|
|
57
|
+
base_name = "file"
|
|
58
|
+
return _ensure_extension_for_media_type(base_name, media_type)
|
|
59
|
+
|
|
60
|
+
|
|
40
61
|
def _media_type_from_filename(name: str) -> str:
|
|
41
62
|
"""
|
|
42
63
|
Guess media type from a filename.
|
|
@@ -119,8 +140,16 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
|
119
140
|
"""
|
|
120
141
|
if Path(filename).suffix:
|
|
121
142
|
return filename
|
|
122
|
-
|
|
123
|
-
|
|
143
|
+
media_type_overrides = {
|
|
144
|
+
"audio/mpeg": ".mp3",
|
|
145
|
+
"audio/ogg": ".ogg",
|
|
146
|
+
"audio/wav": ".wav",
|
|
147
|
+
"audio/x-wav": ".wav",
|
|
148
|
+
"image/jpeg": ".jpg",
|
|
149
|
+
"text/html": ".html",
|
|
150
|
+
}
|
|
151
|
+
if media_type in media_type_overrides:
|
|
152
|
+
ext = media_type_overrides[media_type]
|
|
124
153
|
else:
|
|
125
154
|
ext = mimetypes.guess_extension(media_type) or ""
|
|
126
155
|
return filename + ext if ext else filename
|
|
@@ -165,11 +194,12 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
165
194
|
media_type = _media_type_from_filename(path.name)
|
|
166
195
|
if path.suffix.lower() in {".md", ".markdown"}:
|
|
167
196
|
media_type = "text/markdown"
|
|
197
|
+
resolved_source_uri = source_uri or path.as_uri()
|
|
168
198
|
return SourcePayload(
|
|
169
199
|
data=path.read_bytes(),
|
|
170
200
|
filename=path.name,
|
|
171
201
|
media_type=media_type,
|
|
172
|
-
source_uri=
|
|
202
|
+
source_uri=resolved_source_uri,
|
|
173
203
|
)
|
|
174
204
|
|
|
175
205
|
if _looks_like_uri(source):
|
|
@@ -187,21 +217,26 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
187
217
|
with urlopen(request, timeout=30) as response:
|
|
188
218
|
response_bytes = response.read()
|
|
189
219
|
content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
|
|
190
|
-
|
|
191
|
-
media_type = content_type or _media_type_from_filename(
|
|
220
|
+
fallback_filename = _filename_from_url_path(parsed.path)
|
|
221
|
+
media_type = content_type or _media_type_from_filename(fallback_filename)
|
|
192
222
|
if media_type == "application/octet-stream":
|
|
193
223
|
sniffed = _sniff_media_type_from_bytes(response_bytes)
|
|
194
224
|
if sniffed:
|
|
195
225
|
media_type = sniffed
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
226
|
+
fallback_filename = _ensure_extension_for_media_type(
|
|
227
|
+
fallback_filename, media_type
|
|
228
|
+
)
|
|
229
|
+
media_type = _normalize_media_type(
|
|
230
|
+
filename=fallback_filename, media_type=media_type
|
|
231
|
+
)
|
|
232
|
+
if Path(fallback_filename).suffix.lower() in {".md", ".markdown"}:
|
|
199
233
|
media_type = "text/markdown"
|
|
234
|
+
resolved_source_uri = source_uri or source
|
|
200
235
|
return SourcePayload(
|
|
201
236
|
data=response_bytes,
|
|
202
|
-
filename=
|
|
237
|
+
filename=fallback_filename,
|
|
203
238
|
media_type=media_type,
|
|
204
|
-
source_uri=
|
|
239
|
+
source_uri=resolved_source_uri,
|
|
205
240
|
)
|
|
206
241
|
|
|
207
242
|
raise NotImplementedError(
|
biblicus/text/link.py
CHANGED
|
@@ -159,6 +159,8 @@ def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
|
|
|
159
159
|
|
|
160
160
|
|
|
161
161
|
def _validate_replace_text(old_str: str, new_str: str) -> None:
|
|
162
|
+
if "<span" in old_str or "</span>" in old_str:
|
|
163
|
+
raise ValueError("Text link replacements must target plain text without span tags")
|
|
162
164
|
if strip_span_tags(old_str) != strip_span_tags(new_str):
|
|
163
165
|
raise ValueError("Text link replacements may only insert span tags")
|
|
164
166
|
|
|
@@ -460,12 +462,16 @@ def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: st
|
|
|
460
462
|
error_lines = "\n".join(f"- {error}" for error in errors)
|
|
461
463
|
context_section = build_span_context_section(current_text, errors)
|
|
462
464
|
coverage_guidance = _build_coverage_guidance(errors)
|
|
465
|
+
nested_guidance = ""
|
|
466
|
+
if any("nested span" in error for error in errors):
|
|
467
|
+
nested_guidance = "Do not create nested or overlapping spans. Remove nested spans and wrap only bare text.\n"
|
|
463
468
|
return (
|
|
464
469
|
"Your last edit did not validate.\n"
|
|
465
470
|
"Issues:\n"
|
|
466
471
|
f"{error_lines}\n\n"
|
|
467
472
|
f"{context_section}"
|
|
468
473
|
f"{coverage_guidance}"
|
|
474
|
+
f"{nested_guidance}"
|
|
469
475
|
"Please fix the markup using str_replace. Use id for first mentions and ref for repeats. "
|
|
470
476
|
"Reuse the same id for identical names and do not assign multiple ids to the same name. "
|
|
471
477
|
f"Ids must start with '{id_prefix}'. Try again.\n"
|
biblicus/text/prompts.py
CHANGED
|
@@ -57,6 +57,8 @@ DEFAULT_ANNOTATE_SYSTEM_PROMPT = (
|
|
|
57
57
|
"- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
|
|
58
58
|
"- Do not include <span or </span> inside old_str or new_str.\n"
|
|
59
59
|
"- Do not insert nested spans.\n"
|
|
60
|
+
"- Do not wrap text that is already inside a span; spans must never overlap.\n"
|
|
61
|
+
"- If a name appears inside an existing span, leave it alone and wrap only bare text.\n"
|
|
60
62
|
"- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
|
|
61
63
|
"- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
|
|
62
64
|
"- Do not delete, reorder, paraphrase, or label text beyond the span attributes.\n\n"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -293,7 +293,7 @@ for note_title, note_text in notes:
|
|
|
293
293
|
|
|
294
294
|
backend = get_backend("scan")
|
|
295
295
|
run = backend.build_run(corpus, recipe_name="Story demo", config={})
|
|
296
|
-
budget = QueryBudget(max_total_items=5,
|
|
296
|
+
budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
|
|
297
297
|
result = backend.query(
|
|
298
298
|
corpus,
|
|
299
299
|
run=run,
|
|
@@ -333,7 +333,7 @@ Example output:
|
|
|
333
333
|
"query_text": "Primary button style preference",
|
|
334
334
|
"budget": {
|
|
335
335
|
"max_total_items": 5,
|
|
336
|
-
"
|
|
336
|
+
"maximum_total_characters": 2000,
|
|
337
337
|
"max_items_per_source": null
|
|
338
338
|
},
|
|
339
339
|
"run_id": "RUN_ID",
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
biblicus/__init__.py,sha256=
|
|
1
|
+
biblicus/__init__.py,sha256=z9Wif5-ZzIrptsUS8OELW5zG5_R3-4ZcSuVUkfqKbaA,989
|
|
2
2
|
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
3
|
biblicus/chunking.py,sha256=GdJr0skAAI0Su99mr7dXqCgR7eJ0sJu8n2XesVGyddY,13206
|
|
4
|
-
biblicus/cli.py,sha256=
|
|
4
|
+
biblicus/cli.py,sha256=DdEL8Uvl38Zn2w4egCxQ4zWNelrI3QDs4qh4tGWGuAI,43793
|
|
5
5
|
biblicus/constants.py,sha256=gAlEVJhxdFj-eWWJrlYbP7H1X3c5gwhrIBq9NQ1Vq_E,371
|
|
6
|
-
biblicus/context.py,sha256=
|
|
7
|
-
biblicus/corpus.py,sha256=
|
|
6
|
+
biblicus/context.py,sha256=I7L86ag2AbNr_QgiP5YSt1uwwULGx1cH73eR2nE9T3g,10842
|
|
7
|
+
biblicus/corpus.py,sha256=LySjqBpTF_B19nMyGBoeB8AMDlqohcgsBfmJILm3P5c,59546
|
|
8
8
|
biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
|
|
9
9
|
biblicus/embedding_providers.py,sha256=phWEsq1vryyTFRRs6uZ0sx9FhrqWIkDsS3I52I64zqM,3839
|
|
10
|
-
biblicus/errors.py,sha256=
|
|
10
|
+
biblicus/errors.py,sha256=g5TRPdO2XGi-7Wi1C4CXMJ6dTQKYAyP--EWKCv6FGKs,1362
|
|
11
11
|
biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
|
|
12
12
|
biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
|
|
13
13
|
biblicus/extraction.py,sha256=qvrsq6zSz2Kg-cap-18HPHC9pQlqEGo7pyID2uKCyBo,19760
|
|
@@ -18,11 +18,11 @@ biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
|
|
|
18
18
|
biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
|
|
19
19
|
biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
|
|
20
20
|
biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
|
|
21
|
-
biblicus/knowledge_base.py,sha256=
|
|
22
|
-
biblicus/models.py,sha256=
|
|
21
|
+
biblicus/knowledge_base.py,sha256=jpFEvo8gbEuwRUVYRRgQFvRTJZQml0WCHWSeY-CS4ag,6658
|
|
22
|
+
biblicus/models.py,sha256=nvuq5Y96hHvuhMCuHff38wNITyQJam6zFrgFxH5Kh7g,16475
|
|
23
23
|
biblicus/recipes.py,sha256=rqU66QnjOup6O8Y9Yq7XszmpoM0Pyrjw3RrfdnlVqgE,4210
|
|
24
|
-
biblicus/retrieval.py,sha256=
|
|
25
|
-
biblicus/sources.py,sha256=
|
|
24
|
+
biblicus/retrieval.py,sha256=qAauHbnQcxtWZzonyOuwgSsffPyZ--0Z8wW-dEYk0z4,4287
|
|
25
|
+
biblicus/sources.py,sha256=FNwW1FWts0jxWIL3AHon7D6c5ZatyG9AGFqzn1Id5mE,8504
|
|
26
26
|
biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
|
|
27
27
|
biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
|
|
28
28
|
biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
|
|
@@ -43,13 +43,18 @@ biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,4
|
|
|
43
43
|
biblicus/analysis/topic_modeling.py,sha256=mNBiRMpY5Jtyz8Aj-WXYY8guEghx9jozTfgveinJLoc,22135
|
|
44
44
|
biblicus/backends/__init__.py,sha256=WJSvXc6boEj8PeFr__AC6l_0lfBPJpaVgMbVq30vtZU,1669
|
|
45
45
|
biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
|
|
46
|
-
biblicus/backends/embedding_index_common.py,sha256=
|
|
47
|
-
biblicus/backends/embedding_index_file.py,sha256=
|
|
48
|
-
biblicus/backends/embedding_index_inmemory.py,sha256=
|
|
49
|
-
biblicus/backends/hybrid.py,sha256=
|
|
50
|
-
biblicus/backends/scan.py,sha256=
|
|
46
|
+
biblicus/backends/embedding_index_common.py,sha256=wwvp6DjcaAwq-cp2jaO9TvnxDM7JDi-kpgT9uQG9Cxs,11552
|
|
47
|
+
biblicus/backends/embedding_index_file.py,sha256=vibYEWa12Gx-Pm8WnuBnMfBaKiwlAvVW1dEzWJc6JO4,9856
|
|
48
|
+
biblicus/backends/embedding_index_inmemory.py,sha256=LYiNBRmnh4DB8hmlBxMrm_uNmWi46Jt2EvjCuJGm2DI,9711
|
|
49
|
+
biblicus/backends/hybrid.py,sha256=vlsN9N6FZ5A3dQtGXy0W89L4qNQX5EYJNvUuj2-Uqaw,10897
|
|
50
|
+
biblicus/backends/scan.py,sha256=NBlfFHkDS3vdv70bgggK-jHykQC3W_i-RDaa97LEwKE,12548
|
|
51
51
|
biblicus/backends/sqlite_full_text_search.py,sha256=tkFYdKwH6WvAF3En1fvGN_03Ud0_Z1igGxhUW4meCbA,24496
|
|
52
|
-
biblicus/backends/tf_vector.py,sha256=
|
|
52
|
+
biblicus/backends/tf_vector.py,sha256=Z5MiEpbZ7A4UtRLYPEU1g8ubjWV5vuyPG40FpElEVzA,15119
|
|
53
|
+
biblicus/context_engine/__init__.py,sha256=cIJWTUwOewW1x13a2n0YKfr4-XU0IwlVdAH_0pckfKk,1337
|
|
54
|
+
biblicus/context_engine/assembler.py,sha256=ot5mdGJTA1nO8uUP_J_yGXgfVqQhFuEQJ3BH-HF4ZaY,42336
|
|
55
|
+
biblicus/context_engine/compaction.py,sha256=2bLaCpT48d1TL7vt9rrcRCgfdHeWWp9LX85Cgij12o0,2921
|
|
56
|
+
biblicus/context_engine/models.py,sha256=jesVd83ZQcatO-7yNlzwKkactSQ-e1znYuWof4rxVFg,12762
|
|
57
|
+
biblicus/context_engine/retrieval.py,sha256=au_mN8VYc_MhIlbMGHfDf2IK0UWAigj7R5NFXFZ0Kz8,4143
|
|
53
58
|
biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
|
|
54
59
|
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
55
60
|
biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
|
|
@@ -71,16 +76,16 @@ biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w
|
|
|
71
76
|
biblicus/text/__init__.py,sha256=MiaGAY7xWlUCeBzDzNz6pJnSMiU_Ge5EmlSiEzhqTRo,947
|
|
72
77
|
biblicus/text/annotate.py,sha256=asmpj3_s_t8hl6stEg99apmqxAhDTkoPzHhZNggYE3Y,8355
|
|
73
78
|
biblicus/text/extract.py,sha256=pdnUiZWtfCUj7kZK5zhd-tjqokgmhYYheWhyN3iShRU,7669
|
|
74
|
-
biblicus/text/link.py,sha256=
|
|
79
|
+
biblicus/text/link.py,sha256=2IdOi3WgyBKPFau0bpS1eToV1q2v_6wq5RK5_P_qUDg,20448
|
|
75
80
|
biblicus/text/markup.py,sha256=8jj9aX03HiZTOWdPs_VC4JLpQ7TlPHgGuXj_QUQIHVw,6265
|
|
76
81
|
biblicus/text/models.py,sha256=REp6RowUWFdV-6y437JENP7XtGKt57BOvVtF91KmUqI,10853
|
|
77
|
-
biblicus/text/prompts.py,sha256
|
|
82
|
+
biblicus/text/prompts.py,sha256=-M-8sQ7Dfm1k4j6Kn4ekAuiYe_TkIwLu2VSgpas9rUU,6881
|
|
78
83
|
biblicus/text/redact.py,sha256=tkDRmA0VvOZwMryEmBPLEHf3Z6VHJkkaWjBaNIMyGZ0,8415
|
|
79
84
|
biblicus/text/slice.py,sha256=dlHxGO8c5P8BszXGwlNQoQ-cyWjJf6PfS1LUBJXXGEE,5762
|
|
80
85
|
biblicus/text/tool_loop.py,sha256=w1PGLBvIemOdi6l0ArdYDVL7zgx-RC76bBOO0PKqpt0,11831
|
|
81
|
-
biblicus-0.
|
|
82
|
-
biblicus-0.
|
|
83
|
-
biblicus-0.
|
|
84
|
-
biblicus-0.
|
|
85
|
-
biblicus-0.
|
|
86
|
-
biblicus-0.
|
|
86
|
+
biblicus-1.0.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
87
|
+
biblicus-1.0.0.dist-info/METADATA,sha256=oyWd6igX6I3o46-VjOAUVskj1pLzZ8DovsTV1mqpPoY,30940
|
|
88
|
+
biblicus-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
89
|
+
biblicus-1.0.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
90
|
+
biblicus-1.0.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
91
|
+
biblicus-1.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|