biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +21 -1
- biblicus/analysis/markov.py +35 -3
- biblicus/backends/__init__.py +6 -2
- biblicus/backends/embedding_index_common.py +334 -0
- biblicus/backends/embedding_index_file.py +272 -0
- biblicus/backends/embedding_index_inmemory.py +270 -0
- biblicus/backends/hybrid.py +8 -5
- biblicus/backends/scan.py +1 -0
- biblicus/backends/sqlite_full_text_search.py +1 -1
- biblicus/backends/{vector.py → tf_vector.py} +28 -35
- biblicus/chunking.py +396 -0
- biblicus/cli.py +75 -25
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1060 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +129 -0
- biblicus/corpus.py +117 -16
- biblicus/embedding_providers.py +122 -0
- biblicus/errors.py +24 -0
- biblicus/frontmatter.py +2 -0
- biblicus/knowledge_base.py +1 -1
- biblicus/models.py +15 -3
- biblicus/retrieval.py +7 -2
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +2 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
biblicus/corpus.py
CHANGED
|
@@ -11,6 +11,7 @@ import shutil
|
|
|
11
11
|
import uuid
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from typing import Any, Dict, List, Optional, Sequence
|
|
14
|
+
from urllib.parse import quote, unquote, urlparse
|
|
14
15
|
|
|
15
16
|
import yaml
|
|
16
17
|
from pydantic import ValidationError
|
|
@@ -24,6 +25,7 @@ from .constants import (
|
|
|
24
25
|
SCHEMA_VERSION,
|
|
25
26
|
SIDECAR_SUFFIX,
|
|
26
27
|
)
|
|
28
|
+
from .errors import IngestCollisionError
|
|
27
29
|
from .frontmatter import parse_front_matter, render_front_matter
|
|
28
30
|
from .hook_manager import HookManager
|
|
29
31
|
from .hooks import HookPoint
|
|
@@ -110,7 +112,10 @@ def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
|
|
|
110
112
|
"""
|
|
111
113
|
media_type_overrides = {
|
|
112
114
|
"image/jpeg": ".jpg",
|
|
115
|
+
"audio/mpeg": ".mp3",
|
|
113
116
|
"audio/ogg": ".ogg",
|
|
117
|
+
"audio/wav": ".wav",
|
|
118
|
+
"audio/x-wav": ".wav",
|
|
114
119
|
}
|
|
115
120
|
if media_type in media_type_overrides:
|
|
116
121
|
return media_type_overrides[media_type]
|
|
@@ -136,7 +141,16 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
136
141
|
return raw_name + ".md"
|
|
137
142
|
|
|
138
143
|
if Path(raw_name).suffix:
|
|
139
|
-
|
|
144
|
+
if "%2F" in raw_name or "%3A" in raw_name:
|
|
145
|
+
decoded = unquote(raw_name)
|
|
146
|
+
parsed = urlparse(decoded)
|
|
147
|
+
decoded_path = parsed.path if parsed.scheme else decoded
|
|
148
|
+
if not Path(decoded_path).suffix:
|
|
149
|
+
pass
|
|
150
|
+
else:
|
|
151
|
+
return raw_name
|
|
152
|
+
else:
|
|
153
|
+
return raw_name
|
|
140
154
|
|
|
141
155
|
ext = _preferred_extension_for_media_type(media_type)
|
|
142
156
|
if not ext:
|
|
@@ -144,6 +158,55 @@ def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
|
|
|
144
158
|
return raw_name + ext
|
|
145
159
|
|
|
146
160
|
|
|
161
|
+
def _encode_source_uri_for_filename(source_uri: str) -> str:
|
|
162
|
+
"""
|
|
163
|
+
Percent-encode a source uniform resource identifier for filename use.
|
|
164
|
+
|
|
165
|
+
:param source_uri: Source uniform resource identifier to encode.
|
|
166
|
+
:type source_uri: str
|
|
167
|
+
:return: Percent-encoded uniform resource identifier safe for filenames.
|
|
168
|
+
:rtype: str
|
|
169
|
+
"""
|
|
170
|
+
return quote(source_uri, safe="")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _storage_filename_for_ingest(
    *, filename: Optional[str], media_type: str, source_uri: Optional[str]
) -> str:
    """
    Build the filename under which an ingested payload is stored.

    With a source uniform resource identifier, the stem is the percent-encoded identifier; for
    identifiers that do not start with ``file:`` a sanitized filename hint is appended after ``--``.
    Without an identifier, the sanitized filename hint alone is used. Stems longer than 180
    characters are replaced by ``hash-`` followed by their SHA-256 hex digest.

    :param filename: Optional filename hint from the caller.
    :type filename: str or None
    :param media_type: Media type of the payload, used to pick an extension.
    :type media_type: str
    :param source_uri: Optional source uniform resource identifier for provenance.
    :type source_uri: str or None
    :return: Storage filename with an extension applied, or an empty string when no stem exists.
    :rtype: str
    """
    stem = ""
    if source_uri:
        stem = _encode_source_uri_for_filename(source_uri)
        # The filename hint is only appended for identifiers outside the ``file:`` scheme.
        wants_hint = bool(filename) and not source_uri.startswith("file:")
        hint = _sanitize_filename(filename) if wants_hint else ""
        if hint:
            stem = f"{stem}--{hint}"
    if not stem and filename:
        stem = _sanitize_filename(filename)
    if not stem:
        return ""
    if len(stem) > 180:
        # Over-long stems are swapped for a fixed-length digest of the stem.
        stem = "hash-" + hashlib.sha256(stem.encode("utf-8")).hexdigest()
    return _ensure_filename_extension(stem, media_type=media_type)
|
|
208
|
+
|
|
209
|
+
|
|
147
210
|
def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
|
|
148
211
|
"""
|
|
149
212
|
Merge tags from explicit input and front matter values.
|
|
@@ -520,6 +583,24 @@ class Corpus:
|
|
|
520
583
|
temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
|
|
521
584
|
temp_path.replace(self.catalog_path)
|
|
522
585
|
|
|
586
|
+
def _find_item_by_source_uri(self, source_uri: str) -> Optional[CatalogItem]:
|
|
587
|
+
"""
|
|
588
|
+
Locate an existing catalog item by source uniform resource identifier.
|
|
589
|
+
|
|
590
|
+
:param source_uri: Source uniform resource identifier to search for.
|
|
591
|
+
:type source_uri: str
|
|
592
|
+
:return: Matching catalog item or None.
|
|
593
|
+
:rtype: CatalogItem or None
|
|
594
|
+
"""
|
|
595
|
+
if not source_uri:
|
|
596
|
+
return None
|
|
597
|
+
self._init_catalog()
|
|
598
|
+
catalog = self._load_catalog()
|
|
599
|
+
for item in catalog.items.values():
|
|
600
|
+
if item.source_uri == source_uri:
|
|
601
|
+
return item
|
|
602
|
+
return None
|
|
603
|
+
|
|
523
604
|
@property
|
|
524
605
|
def runs_dir(self) -> Path:
|
|
525
606
|
"""
|
|
@@ -817,18 +898,26 @@ class Corpus:
|
|
|
817
898
|
:return: Ingestion result summary.
|
|
818
899
|
:rtype: IngestResult
|
|
819
900
|
:raises ValueError: If markdown is not Unicode Transformation Format 8.
|
|
901
|
+
:raises IngestCollisionError: If a source uniform resource identifier is already ingested.
|
|
820
902
|
"""
|
|
821
|
-
|
|
822
|
-
|
|
903
|
+
existing_item = self._find_item_by_source_uri(source_uri)
|
|
904
|
+
if existing_item is not None:
|
|
905
|
+
raise IngestCollisionError(
|
|
906
|
+
source_uri=source_uri,
|
|
907
|
+
existing_item_id=existing_item.id,
|
|
908
|
+
existing_relpath=existing_item.relpath,
|
|
909
|
+
)
|
|
823
910
|
|
|
824
|
-
|
|
825
|
-
|
|
911
|
+
item_id = str(uuid.uuid4())
|
|
912
|
+
storage_filename = _storage_filename_for_ingest(
|
|
913
|
+
filename=filename, media_type=media_type, source_uri=source_uri
|
|
914
|
+
)
|
|
826
915
|
|
|
827
916
|
if media_type == "text/markdown":
|
|
828
|
-
output_name = f"{item_id}--{
|
|
917
|
+
output_name = f"{item_id}--{storage_filename}" if storage_filename else f"{item_id}.md"
|
|
829
918
|
else:
|
|
830
|
-
if
|
|
831
|
-
output_name = f"{item_id}--{
|
|
919
|
+
if storage_filename:
|
|
920
|
+
output_name = f"{item_id}--{storage_filename}"
|
|
832
921
|
else:
|
|
833
922
|
extension = _preferred_extension_for_media_type(media_type) or ""
|
|
834
923
|
output_name = f"{item_id}{extension}" if extension else f"{item_id}"
|
|
@@ -991,13 +1080,21 @@ class Corpus:
|
|
|
991
1080
|
if media_type == "text/markdown":
|
|
992
1081
|
raise ValueError("Stream ingestion is not supported for Markdown")
|
|
993
1082
|
|
|
1083
|
+
existing_item = self._find_item_by_source_uri(source_uri)
|
|
1084
|
+
if existing_item is not None:
|
|
1085
|
+
raise IngestCollisionError(
|
|
1086
|
+
source_uri=source_uri,
|
|
1087
|
+
existing_item_id=existing_item.id,
|
|
1088
|
+
existing_relpath=existing_item.relpath,
|
|
1089
|
+
)
|
|
1090
|
+
|
|
994
1091
|
item_id = str(uuid.uuid4())
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
1092
|
+
storage_filename = _storage_filename_for_ingest(
|
|
1093
|
+
filename=filename, media_type=media_type, source_uri=source_uri
|
|
1094
|
+
)
|
|
998
1095
|
|
|
999
|
-
if
|
|
1000
|
-
output_name = f"{item_id}--{
|
|
1096
|
+
if storage_filename:
|
|
1097
|
+
output_name = f"{item_id}--{storage_filename}"
|
|
1001
1098
|
else:
|
|
1002
1099
|
extension = _preferred_extension_for_media_type(media_type) or ""
|
|
1003
1100
|
output_name = f"{item_id}{extension}" if extension else f"{item_id}"
|
|
@@ -1085,7 +1182,7 @@ class Corpus:
|
|
|
1085
1182
|
*,
|
|
1086
1183
|
title: Optional[str] = None,
|
|
1087
1184
|
tags: Sequence[str] = (),
|
|
1088
|
-
source_uri: str =
|
|
1185
|
+
source_uri: Optional[str] = None,
|
|
1089
1186
|
) -> IngestResult:
|
|
1090
1187
|
"""
|
|
1091
1188
|
Ingest a text note as Markdown.
|
|
@@ -1096,11 +1193,15 @@ class Corpus:
|
|
|
1096
1193
|
:type title: str or None
|
|
1097
1194
|
:param tags: Tags to associate with the note.
|
|
1098
1195
|
:type tags: Sequence[str]
|
|
1099
|
-
:param source_uri:
|
|
1100
|
-
:type source_uri: str
|
|
1196
|
+
:param source_uri: Optional source uniform resource identifier for provenance.
|
|
1197
|
+
:type source_uri: str or None
|
|
1101
1198
|
:return: Ingestion result summary.
|
|
1102
1199
|
:rtype: IngestResult
|
|
1103
1200
|
"""
|
|
1201
|
+
if source_uri is None:
|
|
1202
|
+
digest_source = (title or "") + "\n" + text
|
|
1203
|
+
digest = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()
|
|
1204
|
+
source_uri = f"text:{digest}"
|
|
1104
1205
|
data = text.encode("utf-8")
|
|
1105
1206
|
return self.ingest_item(
|
|
1106
1207
|
data,
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding provider interfaces for retrieval backends.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Optional, Sequence
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EmbeddingProvider(ABC):
    """
    Abstract base for objects that turn text into dense embedding vectors.

    Concrete providers must implement :meth:`embed_texts`.

    :ivar provider_id: Provider identifier.
    :vartype provider_id: str
    """

    provider_id: str

    @abstractmethod
    def embed_texts(self, texts: Sequence[str]) -> np.ndarray:
        """
        Produce one embedding row per input text.

        :param texts: Text inputs.
        :type texts: Sequence[str]
        :return: 2D float array with shape (len(texts), dimensions).
        :rtype: numpy.ndarray
        """
        raise NotImplementedError()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _l2_normalize_rows(matrix: np.ndarray) -> np.ndarray:
|
|
39
|
+
norms = np.linalg.norm(matrix, axis=1, keepdims=True)
|
|
40
|
+
norms = np.where(norms == 0, 1.0, norms)
|
|
41
|
+
return matrix / norms
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class HashEmbeddingProvider(EmbeddingProvider):
    """
    Embedding provider that derives vectors from SHA-256 digests of the input text.

    Intended for tests and demos: the same seed and text always yield the same vector, and no
    external service is involved.
    """

    provider_id = "hash-embedding"

    def __init__(self, *, dimensions: int, seed: str = "biblicus") -> None:
        """
        Store the vector size and the seed mixed into every digest.

        :param dimensions: Number of components per embedding vector.
        :type dimensions: int
        :param seed: Seed string prefixed to the hashed payload.
        :type seed: str
        :raises ValueError: If dimensions is not greater than 0.
        """
        self._dimensions = int(dimensions)
        self._seed = str(seed)
        if self._dimensions < 1:
            raise ValueError("dimensions must be greater than 0")

    def embed_texts(self, texts: Sequence[str]) -> np.ndarray:
        """
        Embed a batch of texts deterministically.

        :param texts: Text inputs.
        :type texts: Sequence[str]
        :return: Row-normalized float32 matrix with one row per text.
        :rtype: numpy.ndarray
        """
        items = list(texts)
        if not items:
            return np.zeros((0, self._dimensions), dtype=np.float32)
        stacked = np.stack([self._hash_to_vector(item) for item in items])
        return _l2_normalize_rows(stacked)

    def _hash_to_vector(self, text: str) -> np.ndarray:
        """
        Expand one text into a float32 vector with components in [-1, 1].

        Digests of ``seed:counter:text`` are concatenated for counter = 0, 1, ... until enough
        bytes exist; each byte is then mapped linearly from [0, 255] to [-1, 1].

        :param text: Text to hash.
        :type text: str
        :return: Unnormalized vector of length ``dimensions``.
        :rtype: numpy.ndarray
        """
        digest_size = hashlib.sha256().digest_size
        # Ceiling division: number of digests needed to cover every dimension.
        digest_count = -(-self._dimensions // digest_size)
        payload = b"".join(
            hashlib.sha256(f"{self._seed}:{counter}:{text}".encode("utf-8")).digest()
            for counter in range(digest_count)
        )
        raw = np.frombuffer(payload, dtype=np.uint8)[: self._dimensions].astype(np.float32)
        return (raw / 255.0) * 2.0 - 1.0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class EmbeddingProviderConfig(BaseModel):
    """
    Declarative selection of an embedding provider.

    Unknown fields are rejected by the model configuration.

    :ivar provider_id: Provider identifier.
    :vartype provider_id: str
    :ivar dimensions: Dimensionality of produced vectors.
    :vartype dimensions: int
    :ivar seed: Optional deterministic seed for test providers.
    :vartype seed: str or None
    """

    model_config = ConfigDict(extra="forbid")

    provider_id: str = Field(min_length=1)
    dimensions: int = Field(ge=1)
    seed: Optional[str] = None

    def build_provider(self) -> EmbeddingProvider:
        """
        Instantiate the provider named by ``provider_id``.

        A missing or empty seed falls back to ``"biblicus"``.

        :return: Embedding provider instance.
        :rtype: EmbeddingProvider
        :raises ValueError: If the provider identifier is unknown.
        """
        if self.provider_id != HashEmbeddingProvider.provider_id:
            raise ValueError(f"Unknown embedding provider_id: {self.provider_id!r}")
        effective_seed = self.seed or "biblicus"
        return HashEmbeddingProvider(dimensions=self.dimensions, seed=effective_seed)
|
biblicus/errors.py
CHANGED
|
@@ -13,3 +13,27 @@ class ExtractionRunFatalError(RuntimeError):
|
|
|
13
13
|
rather than a per-item extraction failure. For example, a selection extractor that depends
|
|
14
14
|
on referenced extraction run manifests treats missing manifests as fatal.
|
|
15
15
|
"""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class IngestCollisionError(RuntimeError):
    """
    Raised when a source has already been ingested.

    The colliding identifier and the existing item's id and relpath are kept as attributes and
    are also rendered into the exception message.

    :param source_uri: Source uniform resource identifier that caused the collision.
    :type source_uri: str
    :param existing_item_id: Identifier of the existing catalog item.
    :type existing_item_id: str
    :param existing_relpath: Raw storage relpath of the existing item.
    :type existing_relpath: str
    """

    def __init__(self, *, source_uri: str, existing_item_id: str, existing_relpath: str) -> None:
        details = " ".join(
            (
                f"source_uri={source_uri}",
                f"existing_item_id={existing_item_id}",
                f"existing_relpath={existing_relpath}",
            )
        )
        super().__init__("Source already ingested: " + details)
        self.source_uri = source_uri
        self.existing_item_id = existing_item_id
        self.existing_relpath = existing_relpath
|
biblicus/frontmatter.py
CHANGED
|
@@ -44,6 +44,8 @@ def parse_front_matter(text: str) -> FrontMatterDocument:
|
|
|
44
44
|
|
|
45
45
|
raw_yaml = text[4:front_matter_end]
|
|
46
46
|
body = text[front_matter_end + len("\n---\n") :]
|
|
47
|
+
if body.startswith("\n"):
|
|
48
|
+
body = body[1:]
|
|
47
49
|
|
|
48
50
|
metadata = yaml.safe_load(raw_yaml) or {}
|
|
49
51
|
if not isinstance(metadata, dict):
|
biblicus/knowledge_base.py
CHANGED
biblicus/models.py
CHANGED
|
@@ -224,10 +224,18 @@ class QueryBudget(BaseModel):
|
|
|
224
224
|
"""
|
|
225
225
|
Evidence selection budget for retrieval.
|
|
226
226
|
|
|
227
|
+
The budget constrains the *returned* evidence. It intentionally does not
|
|
228
|
+
change how a backend scores candidates, only how many evidence items are
|
|
229
|
+
selected and how much text is allowed through.
|
|
230
|
+
|
|
227
231
|
:ivar max_total_items: Maximum number of evidence items to return.
|
|
228
232
|
:vartype max_total_items: int
|
|
229
|
-
:ivar
|
|
230
|
-
|
|
233
|
+
:ivar offset: Number of ranked candidates to skip before selecting evidence.
|
|
234
|
+
This enables simple pagination by re-running the same query with a
|
|
235
|
+
higher offset.
|
|
236
|
+
:vartype offset: int
|
|
237
|
+
:ivar maximum_total_characters: Optional maximum total characters across evidence text.
|
|
238
|
+
:vartype maximum_total_characters: int or None
|
|
231
239
|
:ivar max_items_per_source: Optional cap per source uniform resource identifier.
|
|
232
240
|
:vartype max_items_per_source: int or None
|
|
233
241
|
"""
|
|
@@ -235,7 +243,8 @@ class QueryBudget(BaseModel):
|
|
|
235
243
|
model_config = ConfigDict(extra="forbid")
|
|
236
244
|
|
|
237
245
|
max_total_items: int = Field(ge=1)
|
|
238
|
-
|
|
246
|
+
offset: int = Field(default=0, ge=0)
|
|
247
|
+
maximum_total_characters: Optional[int] = Field(default=None, ge=1)
|
|
239
248
|
max_items_per_source: Optional[int] = Field(default=None, ge=1)
|
|
240
249
|
|
|
241
250
|
|
|
@@ -269,6 +278,8 @@ class Evidence(BaseModel):
|
|
|
269
278
|
:vartype recipe_id: str
|
|
270
279
|
:ivar run_id: Retrieval run identifier.
|
|
271
280
|
:vartype run_id: str
|
|
281
|
+
:ivar metadata: Optional metadata payload from the catalog item.
|
|
282
|
+
:vartype metadata: dict[str, Any]
|
|
272
283
|
:ivar hash: Optional content hash for provenance.
|
|
273
284
|
:vartype hash: str or None
|
|
274
285
|
"""
|
|
@@ -288,6 +299,7 @@ class Evidence(BaseModel):
|
|
|
288
299
|
stage_scores: Optional[Dict[str, float]] = None
|
|
289
300
|
recipe_id: str
|
|
290
301
|
run_id: str
|
|
302
|
+
metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
291
303
|
hash: Optional[str] = None
|
|
292
304
|
|
|
293
305
|
@model_validator(mode="after")
|
biblicus/retrieval.py
CHANGED
|
@@ -108,8 +108,13 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
|
|
|
108
108
|
selected_evidence: List[Evidence] = []
|
|
109
109
|
source_counts: Dict[str, int] = {}
|
|
110
110
|
total_characters = 0
|
|
111
|
+
skipped = 0
|
|
111
112
|
|
|
112
113
|
for candidate_evidence in evidence:
|
|
114
|
+
if skipped < budget.offset:
|
|
115
|
+
skipped += 1
|
|
116
|
+
continue
|
|
117
|
+
|
|
113
118
|
if len(selected_evidence) >= budget.max_total_items:
|
|
114
119
|
break
|
|
115
120
|
|
|
@@ -119,8 +124,8 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
|
|
|
119
124
|
continue
|
|
120
125
|
|
|
121
126
|
text_character_count = len(candidate_evidence.text or "")
|
|
122
|
-
if budget.
|
|
123
|
-
if total_characters + text_character_count > budget.
|
|
127
|
+
if budget.maximum_total_characters is not None:
|
|
128
|
+
if total_characters + text_character_count > budget.maximum_total_characters:
|
|
124
129
|
continue
|
|
125
130
|
|
|
126
131
|
selected_evidence.append(candidate_evidence)
|
biblicus/sources.py
CHANGED
|
@@ -8,7 +8,7 @@ import mimetypes
|
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Optional
|
|
11
|
-
from urllib.parse import unquote, urlparse
|
|
11
|
+
from urllib.parse import quote, unquote, urlparse
|
|
12
12
|
from urllib.request import Request, urlopen
|
|
13
13
|
|
|
14
14
|
|
|
@@ -37,6 +37,27 @@ def _filename_from_url_path(path: str) -> str:
|
|
|
37
37
|
return filename or "download"
|
|
38
38
|
|
|
39
39
|
|
|
40
|
+
def _sanitize_filename_component(name: str) -> str:
|
|
41
|
+
allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
|
|
42
|
+
sanitized_name = "".join(
|
|
43
|
+
(character if character in allowed_characters else "_") for character in name
|
|
44
|
+
).strip()
|
|
45
|
+
return sanitized_name or "file"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _namespaced_filename(
    *, source_uri: Optional[str], fallback_name: Optional[str], media_type: str
) -> str:
    """
    Build a filename namespaced by the source uniform resource identifier.

    The stem is the fully percent-encoded identifier when one is given, otherwise the sanitized
    fallback name, otherwise ``"file"``. The stem is then passed through the media-type extension
    helper.

    :param source_uri: Optional source uniform resource identifier.
    :type source_uri: str or None
    :param fallback_name: Optional filename used when no identifier is available.
    :type fallback_name: str or None
    :param media_type: Media type used to choose an extension.
    :type media_type: str
    :return: Filename produced by the extension helper for the chosen stem.
    :rtype: str
    """
    stem = quote(source_uri, safe="") if source_uri else ""
    if not stem and fallback_name:
        stem = _sanitize_filename_component(fallback_name)
    return _ensure_extension_for_media_type(stem or "file", media_type)
|
|
59
|
+
|
|
60
|
+
|
|
40
61
|
def _media_type_from_filename(name: str) -> str:
|
|
41
62
|
"""
|
|
42
63
|
Guess media type from a filename.
|
|
@@ -119,8 +140,16 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
|
119
140
|
"""
|
|
120
141
|
if Path(filename).suffix:
|
|
121
142
|
return filename
|
|
122
|
-
|
|
123
|
-
|
|
143
|
+
media_type_overrides = {
|
|
144
|
+
"audio/mpeg": ".mp3",
|
|
145
|
+
"audio/ogg": ".ogg",
|
|
146
|
+
"audio/wav": ".wav",
|
|
147
|
+
"audio/x-wav": ".wav",
|
|
148
|
+
"image/jpeg": ".jpg",
|
|
149
|
+
"text/html": ".html",
|
|
150
|
+
}
|
|
151
|
+
if media_type in media_type_overrides:
|
|
152
|
+
ext = media_type_overrides[media_type]
|
|
124
153
|
else:
|
|
125
154
|
ext = mimetypes.guess_extension(media_type) or ""
|
|
126
155
|
return filename + ext if ext else filename
|
|
@@ -165,11 +194,12 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
165
194
|
media_type = _media_type_from_filename(path.name)
|
|
166
195
|
if path.suffix.lower() in {".md", ".markdown"}:
|
|
167
196
|
media_type = "text/markdown"
|
|
197
|
+
resolved_source_uri = source_uri or path.as_uri()
|
|
168
198
|
return SourcePayload(
|
|
169
199
|
data=path.read_bytes(),
|
|
170
200
|
filename=path.name,
|
|
171
201
|
media_type=media_type,
|
|
172
|
-
source_uri=
|
|
202
|
+
source_uri=resolved_source_uri,
|
|
173
203
|
)
|
|
174
204
|
|
|
175
205
|
if _looks_like_uri(source):
|
|
@@ -187,21 +217,26 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
187
217
|
with urlopen(request, timeout=30) as response:
|
|
188
218
|
response_bytes = response.read()
|
|
189
219
|
content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
|
|
190
|
-
|
|
191
|
-
media_type = content_type or _media_type_from_filename(
|
|
220
|
+
fallback_filename = _filename_from_url_path(parsed.path)
|
|
221
|
+
media_type = content_type or _media_type_from_filename(fallback_filename)
|
|
192
222
|
if media_type == "application/octet-stream":
|
|
193
223
|
sniffed = _sniff_media_type_from_bytes(response_bytes)
|
|
194
224
|
if sniffed:
|
|
195
225
|
media_type = sniffed
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
226
|
+
fallback_filename = _ensure_extension_for_media_type(
|
|
227
|
+
fallback_filename, media_type
|
|
228
|
+
)
|
|
229
|
+
media_type = _normalize_media_type(
|
|
230
|
+
filename=fallback_filename, media_type=media_type
|
|
231
|
+
)
|
|
232
|
+
if Path(fallback_filename).suffix.lower() in {".md", ".markdown"}:
|
|
199
233
|
media_type = "text/markdown"
|
|
234
|
+
resolved_source_uri = source_uri or source
|
|
200
235
|
return SourcePayload(
|
|
201
236
|
data=response_bytes,
|
|
202
|
-
filename=
|
|
237
|
+
filename=fallback_filename,
|
|
203
238
|
media_type=media_type,
|
|
204
|
-
source_uri=
|
|
239
|
+
source_uri=resolved_source_uri,
|
|
205
240
|
)
|
|
206
241
|
|
|
207
242
|
raise NotImplementedError(
|
biblicus/text/link.py
CHANGED
|
@@ -159,6 +159,8 @@ def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
|
|
|
159
159
|
|
|
160
160
|
|
|
161
161
|
def _validate_replace_text(old_str: str, new_str: str) -> None:
|
|
162
|
+
if "<span" in old_str or "</span>" in old_str:
|
|
163
|
+
raise ValueError("Text link replacements must target plain text without span tags")
|
|
162
164
|
if strip_span_tags(old_str) != strip_span_tags(new_str):
|
|
163
165
|
raise ValueError("Text link replacements may only insert span tags")
|
|
164
166
|
|
|
@@ -460,12 +462,16 @@ def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: st
|
|
|
460
462
|
error_lines = "\n".join(f"- {error}" for error in errors)
|
|
461
463
|
context_section = build_span_context_section(current_text, errors)
|
|
462
464
|
coverage_guidance = _build_coverage_guidance(errors)
|
|
465
|
+
nested_guidance = ""
|
|
466
|
+
if any("nested span" in error for error in errors):
|
|
467
|
+
nested_guidance = "Do not create nested or overlapping spans. Remove nested spans and wrap only bare text.\n"
|
|
463
468
|
return (
|
|
464
469
|
"Your last edit did not validate.\n"
|
|
465
470
|
"Issues:\n"
|
|
466
471
|
f"{error_lines}\n\n"
|
|
467
472
|
f"{context_section}"
|
|
468
473
|
f"{coverage_guidance}"
|
|
474
|
+
f"{nested_guidance}"
|
|
469
475
|
"Please fix the markup using str_replace. Use id for first mentions and ref for repeats. "
|
|
470
476
|
"Reuse the same id for identical names and do not assign multiple ids to the same name. "
|
|
471
477
|
f"Ids must start with '{id_prefix}'. Try again.\n"
|
biblicus/text/prompts.py
CHANGED
|
@@ -57,6 +57,8 @@ DEFAULT_ANNOTATE_SYSTEM_PROMPT = (
|
|
|
57
57
|
"- new_str must be identical to old_str with only <span ...> and </span> inserted.\n"
|
|
58
58
|
"- Do not include <span or </span> inside old_str or new_str.\n"
|
|
59
59
|
"- Do not insert nested spans.\n"
|
|
60
|
+
"- Do not wrap text that is already inside a span; spans must never overlap.\n"
|
|
61
|
+
"- If a name appears inside an existing span, leave it alone and wrap only bare text.\n"
|
|
60
62
|
"- If a tool call fails due to non-unique old_str, retry with a longer unique old_str.\n"
|
|
61
63
|
"- If a tool call fails, read the error and keep editing. Do not call done until spans are inserted.\n"
|
|
62
64
|
"- Do not delete, reorder, paraphrase, or label text beyond the span attributes.\n\n"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -11,6 +11,7 @@ Requires-Dist: PyYAML>=6.0
|
|
|
11
11
|
Requires-Dist: pypdf>=4.0
|
|
12
12
|
Requires-Dist: Jinja2>=3.1
|
|
13
13
|
Requires-Dist: dotyaml>=0.1.3
|
|
14
|
+
Requires-Dist: numpy>=1.24
|
|
14
15
|
Provides-Extra: dev
|
|
15
16
|
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
16
17
|
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
@@ -292,7 +293,7 @@ for note_title, note_text in notes:
|
|
|
292
293
|
|
|
293
294
|
backend = get_backend("scan")
|
|
294
295
|
run = backend.build_run(corpus, recipe_name="Story demo", config={})
|
|
295
|
-
budget = QueryBudget(max_total_items=5,
|
|
296
|
+
budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
|
|
296
297
|
result = backend.query(
|
|
297
298
|
corpus,
|
|
298
299
|
run=run,
|
|
@@ -332,7 +333,7 @@ Example output:
|
|
|
332
333
|
"query_text": "Primary button style preference",
|
|
333
334
|
"budget": {
|
|
334
335
|
"max_total_items": 5,
|
|
335
|
-
"
|
|
336
|
+
"maximum_total_characters": 2000,
|
|
336
337
|
"max_items_per_source": null
|
|
337
338
|
},
|
|
338
339
|
"run_id": "RUN_ID",
|