biblicus 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/markov.py +35 -3
- biblicus/backends/__init__.py +6 -2
- biblicus/backends/embedding_index_common.py +301 -0
- biblicus/backends/embedding_index_file.py +266 -0
- biblicus/backends/embedding_index_inmemory.py +268 -0
- biblicus/backends/hybrid.py +4 -2
- biblicus/backends/sqlite_full_text_search.py +1 -1
- biblicus/backends/{vector.py → tf_vector.py} +11 -11
- biblicus/chunking.py +396 -0
- biblicus/cli.py +50 -10
- biblicus/embedding_providers.py +122 -0
- biblicus/frontmatter.py +2 -0
- biblicus/models.py +9 -0
- biblicus/retrieval.py +5 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/METADATA +12 -4
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/RECORD +21 -16
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/WHEEL +0 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/top_level.txt +0 -0
biblicus/chunking.py
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chunking primitives for text retrieval backends.
|
|
3
|
+
|
|
4
|
+
Chunking converts a document-sized text string into smaller spans that can be embedded or indexed.
|
|
5
|
+
The chunk span offsets are expressed as character positions into the original text string.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TokenSpan(BaseModel):
    """
    A single token together with the character range it occupies in a source string.

    :ivar token: The token's text.
    :vartype token: str
    :ivar span_start: Inclusive start offset, in characters.
    :vartype span_start: int
    :ivar span_end: Exclusive end offset, in characters.
    :vartype span_end: int
    """

    model_config = ConfigDict(extra="forbid")

    token: str = Field(min_length=1)
    span_start: int = Field(ge=0)
    span_end: int = Field(ge=0)

    @model_validator(mode="after")
    def _validate_span(self) -> "TokenSpan":
        # A token span must cover at least one character.
        if self.span_start >= self.span_end:
            raise ValueError("token span_end must be greater than span_start")
        return self
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class Tokenizer(ABC):
    """
    Abstract interface for splitting text into token spans.

    Token-window chunking strategies rely on a tokenizer to map token
    indices back to stable character offsets in the original string.

    :ivar tokenizer_id: Tokenizer identifier.
    :vartype tokenizer_id: str
    """

    tokenizer_id: str

    @abstractmethod
    def tokenize(self, text: str) -> List[TokenSpan]:
        """
        Produce the token spans for a string.

        :param text: Input text.
        :type text: str
        :return: Token spans.
        :rtype: list[TokenSpan]
        """
        raise NotImplementedError
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class WhitespaceTokenizer(Tokenizer):
    """
    Tokenizer whose tokens are maximal runs of non-whitespace characters.
    """

    tokenizer_id = "whitespace"

    def tokenize(self, text: str) -> List[TokenSpan]:
        """
        Split a string on whitespace and report each token's character span.

        :param text: Input text.
        :type text: str
        :return: One span per non-whitespace run, in order of appearance.
        :rtype: list[TokenSpan]
        """
        import re

        return [
            TokenSpan(
                token=match.group(0),
                span_start=int(match.start()),
                span_end=int(match.end()),
            )
            for match in re.finditer(r"\S+", text)
        ]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class TextChunk(BaseModel):
    """
    A contiguous piece extracted from a larger text string.

    :ivar chunk_id: Stable chunk identifier within a build.
    :vartype chunk_id: int
    :ivar item_id: Identifier of the source item.
    :vartype item_id: str
    :ivar span_start: Inclusive start character offset.
    :vartype span_start: int
    :ivar span_end: Exclusive end character offset.
    :vartype span_end: int
    :ivar text: The chunk's text content.
    :vartype text: str
    """

    model_config = ConfigDict(extra="forbid")

    chunk_id: int = Field(ge=0)
    item_id: str = Field(min_length=1)
    span_start: int = Field(ge=0)
    span_end: int = Field(ge=0)
    text: str

    @model_validator(mode="after")
    def _validate_span(self) -> "TextChunk":
        # The span must cover a non-empty character range.
        if self.span_start >= self.span_end:
            raise ValueError("chunk span_end must be greater than span_start")
        # Reject empty chunk text explicitly.
        if not isinstance(self.text, str) or not self.text:
            raise ValueError("chunk text must be non-empty")
        return self
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Chunker(ABC):
    """
    Abstract interface for converting text into chunks.

    :ivar chunker_id: Chunker identifier.
    :vartype chunker_id: str
    """

    chunker_id: str

    @abstractmethod
    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
        """
        Split a text string into chunks.

        :param item_id: Identifier of the item the text came from.
        :type item_id: str
        :param text: Full text to chunk.
        :type text: str
        :param starting_chunk_id: Identifier assigned to the first produced chunk.
        :type starting_chunk_id: int
        :return: Chunk list.
        :rtype: list[TextChunk]
        """
        raise NotImplementedError
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class FixedCharWindowChunker(Chunker):
    """
    Chunker that slides a fixed-size character window, with overlap, over the text.
    """

    chunker_id = "fixed-char-window"

    def __init__(self, *, window_characters: int, overlap_characters: int) -> None:
        self._window_characters = int(window_characters)
        self._overlap_characters = int(overlap_characters)
        if self._window_characters <= 0:
            raise ValueError("window_characters must be greater than 0")
        if self._overlap_characters < 0:
            raise ValueError("overlap_characters must be greater than or equal to 0")
        if self._overlap_characters >= self._window_characters:
            raise ValueError("overlap_characters must be less than window_characters")

    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
        """
        Chunk a text string into overlapping fixed-size character windows.

        :param item_id: Item identifier.
        :type item_id: str
        :param text: Input text.
        :type text: str
        :param starting_chunk_id: Starting chunk identifier.
        :type starting_chunk_id: int
        :return: Chunk list.
        :rtype: list[TextChunk]
        """
        # Overlap < window guarantees a positive stride, so the loop terminates.
        stride = self._window_characters - self._overlap_characters
        text_length = len(text)
        chunks: List[TextChunk] = []
        next_chunk_id = int(starting_chunk_id)
        cursor = 0
        while cursor < text_length:
            window_start = cursor
            window_end = min(cursor + self._window_characters, text_length)
            window_text = text[window_start:window_end]
            # Windows containing only whitespace carry no content; skip them
            # without consuming a chunk identifier.
            if window_text.strip():
                chunks.append(
                    TextChunk(
                        chunk_id=next_chunk_id,
                        item_id=item_id,
                        span_start=window_start,
                        span_end=window_end,
                        text=window_text,
                    )
                )
                next_chunk_id += 1
            # Stop once a window has reached the end of the text.
            if window_end >= text_length:
                break
            cursor += stride
        return chunks
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class ParagraphChunker(Chunker):
    """
    Chunker that emits one chunk per paragraph, where paragraphs are separated
    by blank lines.
    """

    chunker_id = "paragraph"

    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
        """
        Chunk a text string at paragraph boundaries.

        :param item_id: Item identifier.
        :type item_id: str
        :param text: Input text.
        :type text: str
        :param starting_chunk_id: Starting chunk identifier.
        :type starting_chunk_id: int
        :return: Chunk list.
        :rtype: list[TextChunk]
        """
        import re

        chunks: List[TextChunk] = []
        next_chunk_id = int(starting_chunk_id)
        # A paragraph is a maximal run of characters that never contains a
        # blank line (i.e. two consecutive newlines).
        for match in re.finditer(r"(?:[^\n]|\n(?!\n))+", text):
            paragraph = match.group(0)
            # Whitespace-only runs are not paragraphs.
            if not paragraph.strip():
                continue
            chunks.append(
                TextChunk(
                    chunk_id=next_chunk_id,
                    item_id=item_id,
                    span_start=int(match.start()),
                    span_end=int(match.end()),
                    text=paragraph,
                )
            )
            next_chunk_id += 1
        return chunks
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
class FixedTokenWindowChunker(Chunker):
    """
    Chunker that slides a fixed-size token window, with overlap, over the text.
    """

    chunker_id = "fixed-token-window"

    def __init__(self, *, window_tokens: int, overlap_tokens: int, tokenizer: Tokenizer) -> None:
        self._window_tokens = int(window_tokens)
        self._overlap_tokens = int(overlap_tokens)
        self._tokenizer = tokenizer
        if self._window_tokens <= 0:
            raise ValueError("window_tokens must be greater than 0")
        if self._overlap_tokens < 0:
            raise ValueError("overlap_tokens must be greater than or equal to 0")
        if self._overlap_tokens >= self._window_tokens:
            raise ValueError("overlap_tokens must be less than window_tokens")

    def chunk_text(self, *, item_id: str, text: str, starting_chunk_id: int) -> List[TextChunk]:
        """
        Chunk a text string into overlapping fixed-size token windows.

        :param item_id: Item identifier.
        :type item_id: str
        :param text: Input text.
        :type text: str
        :param starting_chunk_id: Starting chunk identifier.
        :type starting_chunk_id: int
        :return: Chunk list.
        :rtype: list[TextChunk]
        """
        spans = self._tokenizer.tokenize(text)
        if not spans:
            return []
        # Overlap < window guarantees a positive stride, so the loop terminates.
        stride = self._window_tokens - self._overlap_tokens
        token_count = len(spans)
        chunks: List[TextChunk] = []
        next_chunk_id = int(starting_chunk_id)
        start_token = 0
        while start_token < token_count:
            end_token = min(start_token + self._window_tokens, token_count)
            # The character span runs from the first token's start to the
            # last token's end, so inter-token whitespace is preserved.
            span_start = spans[start_token].span_start
            span_end = spans[end_token - 1].span_end
            chunks.append(
                TextChunk(
                    chunk_id=next_chunk_id,
                    item_id=item_id,
                    span_start=span_start,
                    span_end=span_end,
                    text=text[span_start:span_end],
                )
            )
            next_chunk_id += 1
            # Stop once a window has consumed the final token.
            if end_token >= token_count:
                break
            start_token += stride
        return chunks
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
class TokenizerConfig(BaseModel):
    """
    Configuration describing which tokenizer to build.

    :ivar tokenizer_id: Tokenizer identifier.
    :vartype tokenizer_id: str
    """

    model_config = ConfigDict(extra="forbid")

    tokenizer_id: str = Field(min_length=1)

    def build_tokenizer(self) -> Tokenizer:
        """
        Instantiate the tokenizer named by this configuration.

        :return: Tokenizer instance.
        :rtype: Tokenizer
        :raises ValueError: If the tokenizer identifier is unknown.
        """
        if self.tokenizer_id != WhitespaceTokenizer.tokenizer_id:
            raise ValueError(f"Unknown tokenizer_id: {self.tokenizer_id!r}")
        return WhitespaceTokenizer()
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class ChunkerConfig(BaseModel):
    """
    Configuration describing which chunker to build.

    :ivar chunker_id: Chunker identifier.
    :vartype chunker_id: str
    :ivar window_characters: Window size for fixed character chunking.
    :vartype window_characters: int or None
    :ivar overlap_characters: Overlap size for fixed character chunking.
    :vartype overlap_characters: int or None
    :ivar window_tokens: Window size for fixed token chunking.
    :vartype window_tokens: int or None
    :ivar overlap_tokens: Overlap size for fixed token chunking.
    :vartype overlap_tokens: int or None
    """

    model_config = ConfigDict(extra="forbid")

    chunker_id: str = Field(min_length=1)
    window_characters: Optional[int] = Field(default=None, ge=1)
    overlap_characters: Optional[int] = Field(default=None, ge=0)
    window_tokens: Optional[int] = Field(default=None, ge=1)
    overlap_tokens: Optional[int] = Field(default=None, ge=0)

    def build_chunker(self, *, tokenizer: Optional[Tokenizer]) -> Chunker:
        """
        Instantiate the chunker named by this configuration.

        :param tokenizer: Tokenizer used by token-window chunking strategies.
        :type tokenizer: Tokenizer or None
        :return: Chunker instance.
        :rtype: Chunker
        :raises ValueError: If required configuration is missing or unknown.
        """
        # Identifiers are mutually exclusive, so branch order is immaterial.
        if self.chunker_id == ParagraphChunker.chunker_id:
            return ParagraphChunker()
        if self.chunker_id == FixedCharWindowChunker.chunker_id:
            if self.window_characters is None or self.overlap_characters is None:
                raise ValueError(
                    "fixed-char-window requires window_characters and overlap_characters"
                )
            return FixedCharWindowChunker(
                window_characters=self.window_characters,
                overlap_characters=self.overlap_characters,
            )
        if self.chunker_id == FixedTokenWindowChunker.chunker_id:
            if self.window_tokens is None or self.overlap_tokens is None:
                raise ValueError("fixed-token-window requires window_tokens and overlap_tokens")
            if tokenizer is None:
                raise ValueError("tokenizer configuration is required for fixed-token-window")
            return FixedTokenWindowChunker(
                window_tokens=self.window_tokens,
                overlap_tokens=self.overlap_tokens,
                tokenizer=tokenizer,
            )
        raise ValueError(f"Unknown chunker_id: {self.chunker_id!r}")
|
biblicus/cli.py
CHANGED
|
@@ -8,7 +8,7 @@ import argparse
|
|
|
8
8
|
import json
|
|
9
9
|
import sys
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Dict, List, Optional
|
|
11
|
+
from typing import Dict, Iterable, List, Optional
|
|
12
12
|
|
|
13
13
|
from pydantic import ValidationError
|
|
14
14
|
|
|
@@ -239,15 +239,23 @@ def cmd_purge(arguments: argparse.Namespace) -> int:
|
|
|
239
239
|
return 0
|
|
240
240
|
|
|
241
241
|
|
|
242
|
-
def _parse_config_pairs(pairs: Optional[
|
|
242
|
+
def _parse_config_pairs(pairs: Optional[Iterable[str]]) -> Dict[str, object]:
|
|
243
243
|
"""
|
|
244
|
-
Parse
|
|
244
|
+
Parse key=value pairs into a configuration mapping.
|
|
245
245
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
246
|
+
This is used by a few command-line options that accept repeated key=value items.
|
|
247
|
+
Values are coerced to useful types in a predictable way:
|
|
248
|
+
|
|
249
|
+
- JSON objects/arrays (leading ``{`` or ``[``) are parsed as JSON.
|
|
250
|
+
- Whole numbers are parsed as integers.
|
|
251
|
+
- Other numeric forms are parsed as floats.
|
|
252
|
+
- Everything else remains a string.
|
|
253
|
+
|
|
254
|
+
:param pairs: Iterable of key=value strings.
|
|
255
|
+
:type pairs: Iterable[str] or None
|
|
256
|
+
:return: Parsed configuration mapping.
|
|
249
257
|
:rtype: dict[str, object]
|
|
250
|
-
:raises ValueError: If any entry is not key=value.
|
|
258
|
+
:raises ValueError: If any entry is not a key=value pair or values are invalid.
|
|
251
259
|
"""
|
|
252
260
|
config: Dict[str, object] = {}
|
|
253
261
|
for item in pairs or []:
|
|
@@ -257,8 +265,14 @@ def _parse_config_pairs(pairs: Optional[List[str]]) -> Dict[str, object]:
|
|
|
257
265
|
key = key.strip()
|
|
258
266
|
if not key:
|
|
259
267
|
raise ValueError("Config keys must be non-empty")
|
|
268
|
+
raw = raw.strip()
|
|
260
269
|
value: object = raw
|
|
261
|
-
if raw.
|
|
270
|
+
if raw.startswith("{") or raw.startswith("["):
|
|
271
|
+
try:
|
|
272
|
+
value = json.loads(raw)
|
|
273
|
+
except json.JSONDecodeError as exc:
|
|
274
|
+
raise ValueError(f"Config value must be valid JSON for key {key!r}") from exc
|
|
275
|
+
elif raw.isdigit():
|
|
262
276
|
value = int(raw)
|
|
263
277
|
else:
|
|
264
278
|
try:
|
|
@@ -359,6 +373,7 @@ def _budget_from_args(arguments: argparse.Namespace) -> QueryBudget:
|
|
|
359
373
|
"""
|
|
360
374
|
return QueryBudget(
|
|
361
375
|
max_total_items=arguments.max_total_items,
|
|
376
|
+
offset=getattr(arguments, "offset", 0),
|
|
362
377
|
max_total_characters=arguments.max_total_characters,
|
|
363
378
|
max_items_per_source=arguments.max_items_per_source,
|
|
364
379
|
)
|
|
@@ -373,13 +388,26 @@ def cmd_build(arguments: argparse.Namespace) -> int:
|
|
|
373
388
|
:return: Exit code.
|
|
374
389
|
:rtype: int
|
|
375
390
|
"""
|
|
391
|
+
from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
|
|
392
|
+
|
|
376
393
|
corpus = (
|
|
377
394
|
Corpus.open(arguments.corpus)
|
|
378
395
|
if getattr(arguments, "corpus", None)
|
|
379
396
|
else Corpus.find(Path.cwd())
|
|
380
397
|
)
|
|
381
398
|
backend = get_backend(arguments.backend)
|
|
382
|
-
|
|
399
|
+
|
|
400
|
+
base_config: Dict[str, object] = {}
|
|
401
|
+
if getattr(arguments, "recipe", None):
|
|
402
|
+
base_config = load_recipe_view(
|
|
403
|
+
arguments.recipe,
|
|
404
|
+
recipe_label="Recipe file",
|
|
405
|
+
mapping_error_message="Retrieval build recipe must be a mapping/object",
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
overrides = parse_dotted_overrides(arguments.config)
|
|
409
|
+
config = apply_dotted_overrides(base_config, overrides)
|
|
410
|
+
|
|
383
411
|
run = backend.build_run(corpus, recipe_name=arguments.recipe_name, config=config)
|
|
384
412
|
print(run.model_dump_json(indent=2))
|
|
385
413
|
return 0
|
|
@@ -947,11 +975,17 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
947
975
|
help="Backend identifier (for example, scan, sqlite-full-text-search).",
|
|
948
976
|
)
|
|
949
977
|
p_build.add_argument("--recipe-name", default="default", help="Human-readable recipe name.")
|
|
978
|
+
p_build.add_argument(
|
|
979
|
+
"--recipe",
|
|
980
|
+
default=None,
|
|
981
|
+
action="append",
|
|
982
|
+
help="Path to YAML recipe file (repeatable). If provided, recipes are composed in precedence order.",
|
|
983
|
+
)
|
|
950
984
|
p_build.add_argument(
|
|
951
985
|
"--config",
|
|
952
986
|
action="append",
|
|
953
987
|
default=None,
|
|
954
|
-
help="Backend config as key=value (repeatable).",
|
|
988
|
+
help="Backend config override as key=value (repeatable). Dotted keys create nested config mappings.",
|
|
955
989
|
)
|
|
956
990
|
p_build.set_defaults(func=cmd_build)
|
|
957
991
|
|
|
@@ -1030,6 +1064,12 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1030
1064
|
p_query.add_argument("--run", default=None, help="Run identifier (defaults to latest run).")
|
|
1031
1065
|
p_query.add_argument("--backend", default=None, help="Validate backend identifier.")
|
|
1032
1066
|
p_query.add_argument("--query", default=None, help="Query text (defaults to standard input).")
|
|
1067
|
+
p_query.add_argument(
|
|
1068
|
+
"--offset",
|
|
1069
|
+
type=int,
|
|
1070
|
+
default=0,
|
|
1071
|
+
help="Skip this many ranked candidates before selecting evidence (pagination).",
|
|
1072
|
+
)
|
|
1033
1073
|
p_query.add_argument("--max-total-items", type=int, default=5)
|
|
1034
1074
|
p_query.add_argument("--max-total-characters", type=int, default=2000)
|
|
1035
1075
|
p_query.add_argument("--max-items-per-source", type=int, default=5)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding provider interfaces for retrieval backends.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Optional, Sequence
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EmbeddingProvider(ABC):
    """
    Abstract interface for producing dense embedding vectors from text.

    :ivar provider_id: Provider identifier.
    :vartype provider_id: str
    """

    provider_id: str

    @abstractmethod
    def embed_texts(self, texts: Sequence[str]) -> np.ndarray:
        """
        Embed a batch of texts.

        :param texts: Text inputs.
        :type texts: Sequence[str]
        :return: 2D float array with shape (len(texts), dimensions).
        :rtype: numpy.ndarray
        """
        raise NotImplementedError
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _l2_normalize_rows(matrix: np.ndarray) -> np.ndarray:
|
|
39
|
+
norms = np.linalg.norm(matrix, axis=1, keepdims=True)
|
|
40
|
+
norms = np.where(norms == 0, 1.0, norms)
|
|
41
|
+
return matrix / norms
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class HashEmbeddingProvider(EmbeddingProvider):
    """
    Deterministic, dependency-free embedding provider for tests and demos.

    Vectors are derived from SHA-256 digests of the input text, so they are
    stable across runs and require no external services.
    """

    provider_id = "hash-embedding"

    def __init__(self, *, dimensions: int, seed: str = "biblicus") -> None:
        self._dimensions = int(dimensions)
        self._seed = str(seed)
        if self._dimensions <= 0:
            raise ValueError("dimensions must be greater than 0")

    def embed_texts(self, texts: Sequence[str]) -> np.ndarray:
        """
        Embed a batch of texts deterministically.

        :param texts: Text inputs.
        :type texts: Sequence[str]
        :return: Normalized embedding matrix.
        :rtype: numpy.ndarray
        """
        batch = list(texts)
        if not batch:
            return np.zeros((0, self._dimensions), dtype=np.float32)

        matrix = np.zeros((len(batch), self._dimensions), dtype=np.float32)
        for row, item_text in enumerate(batch):
            matrix[row] = self._hash_to_vector(item_text)
        return _l2_normalize_rows(matrix)

    def _hash_to_vector(self, text: str) -> np.ndarray:
        # Fill the output by concatenating as many SHA-256 digests as needed;
        # each digest byte is mapped linearly from [0, 255] onto [-1.0, 1.0].
        vector = np.empty((self._dimensions,), dtype=np.float32)
        filled = 0
        block_index = 0
        while filled < self._dimensions:
            payload = f"{self._seed}:{block_index}:{text}".encode("utf-8")
            digest_bytes = np.frombuffer(hashlib.sha256(payload).digest(), dtype=np.uint8)
            scaled = (digest_bytes.astype(np.float32) / 255.0) * 2.0 - 1.0
            take = min(self._dimensions - filled, scaled.shape[0])
            vector[filled : filled + take] = scaled[:take]
            filled += take
            block_index += 1
        return vector
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class EmbeddingProviderConfig(BaseModel):
    """
    Configuration describing which embedding provider to build.

    :ivar provider_id: Provider identifier.
    :vartype provider_id: str
    :ivar dimensions: Dimensionality of produced vectors.
    :vartype dimensions: int
    :ivar seed: Optional deterministic seed for test providers.
    :vartype seed: str or None
    """

    model_config = ConfigDict(extra="forbid")

    provider_id: str = Field(min_length=1)
    dimensions: int = Field(ge=1)
    seed: Optional[str] = None

    def build_provider(self) -> EmbeddingProvider:
        """
        Instantiate the embedding provider named by this configuration.

        :return: Embedding provider instance.
        :rtype: EmbeddingProvider
        :raises ValueError: If the provider identifier is unknown.
        """
        if self.provider_id != HashEmbeddingProvider.provider_id:
            raise ValueError(f"Unknown embedding provider_id: {self.provider_id!r}")
        return HashEmbeddingProvider(dimensions=self.dimensions, seed=self.seed or "biblicus")
|
biblicus/frontmatter.py
CHANGED
|
@@ -44,6 +44,8 @@ def parse_front_matter(text: str) -> FrontMatterDocument:
|
|
|
44
44
|
|
|
45
45
|
raw_yaml = text[4:front_matter_end]
|
|
46
46
|
body = text[front_matter_end + len("\n---\n") :]
|
|
47
|
+
if body.startswith("\n"):
|
|
48
|
+
body = body[1:]
|
|
47
49
|
|
|
48
50
|
metadata = yaml.safe_load(raw_yaml) or {}
|
|
49
51
|
if not isinstance(metadata, dict):
|
biblicus/models.py
CHANGED
|
@@ -224,8 +224,16 @@ class QueryBudget(BaseModel):
|
|
|
224
224
|
"""
|
|
225
225
|
Evidence selection budget for retrieval.
|
|
226
226
|
|
|
227
|
+
The budget constrains the *returned* evidence. It intentionally does not
|
|
228
|
+
change how a backend scores candidates, only how many evidence items are
|
|
229
|
+
selected and how much text is allowed through.
|
|
230
|
+
|
|
227
231
|
:ivar max_total_items: Maximum number of evidence items to return.
|
|
228
232
|
:vartype max_total_items: int
|
|
233
|
+
:ivar offset: Number of ranked candidates to skip before selecting evidence.
|
|
234
|
+
This enables simple pagination by re-running the same query with a
|
|
235
|
+
higher offset.
|
|
236
|
+
:vartype offset: int
|
|
229
237
|
:ivar max_total_characters: Optional maximum total characters across evidence text.
|
|
230
238
|
:vartype max_total_characters: int or None
|
|
231
239
|
:ivar max_items_per_source: Optional cap per source uniform resource identifier.
|
|
@@ -235,6 +243,7 @@ class QueryBudget(BaseModel):
|
|
|
235
243
|
model_config = ConfigDict(extra="forbid")
|
|
236
244
|
|
|
237
245
|
max_total_items: int = Field(ge=1)
|
|
246
|
+
offset: int = Field(default=0, ge=0)
|
|
238
247
|
max_total_characters: Optional[int] = Field(default=None, ge=1)
|
|
239
248
|
max_items_per_source: Optional[int] = Field(default=None, ge=1)
|
|
240
249
|
|
biblicus/retrieval.py
CHANGED
|
@@ -108,8 +108,13 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
|
|
|
108
108
|
selected_evidence: List[Evidence] = []
|
|
109
109
|
source_counts: Dict[str, int] = {}
|
|
110
110
|
total_characters = 0
|
|
111
|
+
skipped = 0
|
|
111
112
|
|
|
112
113
|
for candidate_evidence in evidence:
|
|
114
|
+
if skipped < budget.offset:
|
|
115
|
+
skipped += 1
|
|
116
|
+
continue
|
|
117
|
+
|
|
113
118
|
if len(selected_evidence) >= budget.max_total_items:
|
|
114
119
|
break
|
|
115
120
|
|