biblicus 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/markov.py +35 -3
- biblicus/backends/__init__.py +6 -2
- biblicus/backends/embedding_index_common.py +301 -0
- biblicus/backends/embedding_index_file.py +266 -0
- biblicus/backends/embedding_index_inmemory.py +268 -0
- biblicus/backends/hybrid.py +4 -2
- biblicus/backends/sqlite_full_text_search.py +1 -1
- biblicus/backends/{vector.py → tf_vector.py} +11 -11
- biblicus/chunking.py +396 -0
- biblicus/cli.py +50 -10
- biblicus/embedding_providers.py +122 -0
- biblicus/frontmatter.py +2 -0
- biblicus/models.py +9 -0
- biblicus/retrieval.py +5 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/METADATA +12 -4
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/RECORD +21 -16
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/WHEEL +0 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.15.0.dist-info → biblicus-0.16.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/analysis/markov.py
CHANGED
```diff
@@ -686,6 +686,15 @@ def _build_observations(
         llm = config.llm_observations
         assert llm.client is not None and llm.prompt_template is not None
         for index, observation in enumerate(observations):
+            if observation.segment_text in {"START", "END"}:
+                observations[index] = observation.model_copy(
+                    update={
+                        "llm_label": observation.segment_text,
+                        "llm_label_confidence": 1.0,
+                        "llm_summary": observation.segment_text,
+                    }
+                )
+                continue
             prompt = llm.prompt_template.format(segment=observation.segment_text)
             response_text = generate_completion(
                 client=llm.client,
@@ -707,8 +716,12 @@ def _build_observations(
     if config.embeddings.enabled:
         embedding_config = config.embeddings
         assert embedding_config.client is not None
+        embed_indices: List[int] = []
         embed_texts: List[str] = []
-        for observation in observations:
+        for index, observation in enumerate(observations):
+            if observation.segment_text in {"START", "END"}:
+                continue
+            embed_indices.append(index)
             if embedding_config.text_source == "segment_text":
                 embed_texts.append(observation.segment_text)
             else:
@@ -717,10 +730,29 @@ def _build_observations(
                         "embeddings.text_source is 'llm_summary' but llm_summary is missing"
                     )
                 embed_texts.append(observation.llm_summary)
+
+        if not embed_indices:
+            raise ValueError("Embeddings require at least one non-boundary segment")
+
         vectors = generate_embeddings_batch(client=embedding_config.client, texts=embed_texts)
+        if len(vectors) != len(embed_indices):
+            raise ValueError(
+                "Embedding provider returned unexpected vector count: "
+                f"expected {len(embed_indices)} but got {len(vectors)}"
+            )
+
+        vector_by_observation_index: Dict[int, List[float]] = {}
+        for observation_index, vector in zip(embed_indices, vectors):
+            vector_by_observation_index[observation_index] = list(vector)
+
+        embedding_dimension = len(next(iter(vector_by_observation_index.values())))
+        boundary_embedding = [0.0 for _ in range(embedding_dimension)]
         updated: List[MarkovAnalysisObservation] = []
-        for
-
+        for index, observation in enumerate(observations):
+            vector = vector_by_observation_index.get(index)
+            updated.append(
+                observation.model_copy(update={"embedding": vector or boundary_embedding})
+            )
         observations = updated
 
     return observations
```
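These hunks short-circuit synthetic START and END boundary observations: they are never sent to the language model, are kept out of the embedding batch, and are backfilled with a zero vector matching the provider's dimensionality. A minimal sketch of the `model_copy` pattern, using a stand-in model (the real `MarkovAnalysisObservation` has more fields):

```python
from typing import List, Optional

from pydantic import BaseModel


class Observation(BaseModel):
    segment_text: str
    llm_label: Optional[str] = None
    embedding: Optional[List[float]] = None


observation = Observation(segment_text="START")
if observation.segment_text in {"START", "END"}:
    # Boundary tokens label themselves and receive a zero vector instead of
    # an LLM call or a provider embedding (dimension 2 here for brevity).
    observation = observation.model_copy(
        update={"llm_label": observation.segment_text, "embedding": [0.0, 0.0]}
    )
```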
biblicus/backends/__init__.py
CHANGED
```diff
@@ -7,10 +7,12 @@ from __future__ import annotations
 from typing import Dict, Type
 
 from .base import RetrievalBackend
+from .embedding_index_file import EmbeddingIndexFileBackend
+from .embedding_index_inmemory import EmbeddingIndexInMemoryBackend
 from .hybrid import HybridBackend
 from .scan import ScanBackend
 from .sqlite_full_text_search import SqliteFullTextSearchBackend
-from .
+from .tf_vector import TfVectorBackend
 
 
 def available_backends() -> Dict[str, Type[RetrievalBackend]]:
@@ -21,10 +23,12 @@ def available_backends() -> Dict[str, Type[RetrievalBackend]]:
     :rtype: dict[str, Type[RetrievalBackend]]
     """
     return {
+        EmbeddingIndexFileBackend.backend_id: EmbeddingIndexFileBackend,
+        EmbeddingIndexInMemoryBackend.backend_id: EmbeddingIndexInMemoryBackend,
         HybridBackend.backend_id: HybridBackend,
         ScanBackend.backend_id: ScanBackend,
         SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
-
+        TfVectorBackend.backend_id: TfVectorBackend,
     }
```
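With these imports in place, the two new embedding-index backends and the renamed term-frequency backend all resolve through the same registry keyed by `backend_id`; a short lookup sketch:

```python
from biblicus.backends import available_backends

backends = available_backends()
backend_class = backends["embedding-index-file"]  # EmbeddingIndexFileBackend
backend = backend_class()
```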
biblicus/backends/embedding_index_common.py
ADDED
@@ -0,0 +1,301 @@
```python
"""
Shared primitives for embedding-index retrieval backends.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Tuple

import numpy as np
from pydantic import BaseModel, ConfigDict, Field, model_validator

from ..chunking import ChunkerConfig, TextChunk, TokenizerConfig
from ..corpus import CORPUS_DIR_NAME, RUNS_DIR_NAME, Corpus
from ..embedding_providers import EmbeddingProviderConfig, _l2_normalize_rows
from ..frontmatter import parse_front_matter
from ..models import ExtractionRunReference, parse_extraction_run_reference


class ChunkRecord(BaseModel):
    """
    Minimal persisted representation of a chunk.

    :ivar item_id: Item identifier that produced the chunk.
    :vartype item_id: str
    :ivar span_start: Inclusive start character offset.
    :vartype span_start: int
    :ivar span_end: Exclusive end character offset.
    :vartype span_end: int
    """

    model_config = ConfigDict(extra="forbid")

    item_id: str = Field(min_length=1)
    span_start: int = Field(ge=0)
    span_end: int = Field(ge=0)

    @model_validator(mode="after")
    def _validate_span(self) -> "ChunkRecord":
        if self.span_end <= self.span_start:
            raise ValueError("chunk span_end must be greater than span_start")
        return self
```
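The `model_validator` runs after field validation, so zero-length and inverted spans are rejected both at construction time and when records are re-read from disk; a quick sketch:

```python
from pydantic import ValidationError

record = ChunkRecord(item_id="item-1", span_start=0, span_end=42)  # valid

try:
    ChunkRecord(item_id="item-1", span_start=42, span_end=42)  # empty span
except ValidationError as error:
    print(error)  # chunk span_end must be greater than span_start
```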
```python
class EmbeddingIndexRecipeConfig(BaseModel):
    """
    Configuration for embedding-index retrieval backends.

    :ivar snippet_characters: Maximum characters to include in evidence snippets.
    :vartype snippet_characters: int
    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
    :vartype extraction_run: str or None
    :ivar chunker: Chunker configuration.
    :vartype chunker: biblicus.chunking.ChunkerConfig
    :ivar tokenizer: Optional tokenizer configuration.
    :vartype tokenizer: biblicus.chunking.TokenizerConfig or None
    :ivar embedding_provider: Embedding provider configuration.
    :vartype embedding_provider: biblicus.embedding_providers.EmbeddingProviderConfig
    """

    model_config = ConfigDict(extra="forbid")

    snippet_characters: int = Field(default=400, ge=1)
    extraction_run: Optional[str] = None
    chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
    tokenizer: Optional[TokenizerConfig] = None
    embedding_provider: EmbeddingProviderConfig
```
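Because `extra="forbid"` is set, unknown keys fail validation, and the chunker defaults to the paragraph strategy. A sketch, assuming `provider_config` is a valid `EmbeddingProviderConfig` payload built elsewhere:

```python
recipe_config = EmbeddingIndexRecipeConfig.model_validate(
    {"embedding_provider": provider_config}
)
assert recipe_config.chunker.chunker_id == "paragraph"  # default chunker
assert recipe_config.snippet_characters == 400          # default snippet budget
assert recipe_config.tokenizer is None                  # tokenizer is opt-in
```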
```python
def resolve_extraction_reference(
    corpus: Corpus, recipe_config: EmbeddingIndexRecipeConfig
) -> Optional[ExtractionRunReference]:
    """
    Resolve an extraction run reference from an embedding-index recipe config.

    :param corpus: Corpus associated with the recipe.
    :type corpus: Corpus
    :param recipe_config: Parsed embedding-index recipe configuration.
    :type recipe_config: EmbeddingIndexRecipeConfig
    :return: Parsed extraction reference or None.
    :rtype: ExtractionRunReference or None
    :raises FileNotFoundError: If an extraction run is referenced but not present.
    """
    if not recipe_config.extraction_run:
        return None
    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
    run_dir = corpus.extraction_run_dir(
        extractor_id=extraction_reference.extractor_id,
        run_id=extraction_reference.run_id,
    )
    if not run_dir.is_dir():
        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
    return extraction_reference


def _load_text_from_item(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    if extraction_reference:
        extracted_text = corpus.read_extracted_text(
            extractor_id=extraction_reference.extractor_id,
            run_id=extraction_reference.run_id,
            item_id=item_id,
        )
        if isinstance(extracted_text, str):
            return extracted_text

    if media_type == "text/markdown":
        raw = (corpus.root / relpath).read_text(encoding="utf-8")
        return parse_front_matter(raw).body
    if media_type.startswith("text/"):
        return (corpus.root / relpath).read_text(encoding="utf-8")
    return None


def iter_text_payloads(
    corpus: Corpus, *, extraction_reference: Optional[ExtractionRunReference]
) -> Iterator[Tuple[object, str]]:
    """
    Yield catalog items and their text payloads.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param extraction_reference: Optional extraction reference.
    :type extraction_reference: ExtractionRunReference or None
    :yield: (catalog_item, text) pairs.
    :rtype: Iterator[tuple[object, str]]
    """
    catalog = corpus.load_catalog()
    for catalog_item in catalog.items.values():
        item_id = str(getattr(catalog_item, "id", ""))
        relpath = str(getattr(catalog_item, "relpath", ""))
        media_type = str(getattr(catalog_item, "media_type", ""))
        if not item_id or not relpath or not media_type:
            continue
        text = _load_text_from_item(
            corpus,
            item_id=item_id,
            relpath=relpath,
            media_type=media_type,
            extraction_reference=extraction_reference,
        )
        if not isinstance(text, str) or not text.strip():
            continue
        yield catalog_item, text
```
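Text resolution is layered: extracted text from the referenced run wins, markdown items fall back to their body with front matter stripped, other `text/*` items are read raw, and everything else yields `None` and is skipped. A sketch of consuming the iterator, assuming a `corpus` built elsewhere:

```python
for catalog_item, text in iter_text_payloads(corpus, extraction_reference=None):
    # Only items that produced a non-empty text payload reach this point.
    print(getattr(catalog_item, "id"), len(text))
```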
```python
def collect_chunks(
    corpus: Corpus, *, recipe_config: EmbeddingIndexRecipeConfig
) -> Tuple[List[TextChunk], int]:
    """
    Collect chunks from text payloads in a corpus.

    :param corpus: Corpus to chunk.
    :type corpus: Corpus
    :param recipe_config: Parsed embedding-index recipe configuration.
    :type recipe_config: EmbeddingIndexRecipeConfig
    :return: (chunks, text_item_count)
    :rtype: tuple[list[TextChunk], int]
    """
    tokenizer = recipe_config.tokenizer.build_tokenizer() if recipe_config.tokenizer else None
    chunker = recipe_config.chunker.build_chunker(tokenizer=tokenizer)
    extraction_reference = resolve_extraction_reference(corpus, recipe_config)

    chunks: List[TextChunk] = []
    next_chunk_id = 0
    text_items = 0
    for catalog_item, text in iter_text_payloads(corpus, extraction_reference=extraction_reference):
        text_items += 1
        item_id = str(getattr(catalog_item, "id"))
        item_chunks = chunker.chunk_text(
            item_id=item_id, text=text, starting_chunk_id=next_chunk_id
        )
        if item_chunks:
            next_chunk_id = item_chunks[-1].chunk_id + 1
        chunks.extend(item_chunks)
    return chunks, text_items


def chunks_to_records(chunks: Iterable[TextChunk]) -> List[ChunkRecord]:
    """
    Convert chunk objects to persisted chunk records.

    :param chunks: Chunk list.
    :type chunks: Iterable[TextChunk]
    :return: Chunk record list.
    :rtype: list[ChunkRecord]
    """
    records: List[ChunkRecord] = []
    for chunk in chunks:
        records.append(
            ChunkRecord(
                item_id=chunk.item_id,
                span_start=chunk.span_start,
                span_end=chunk.span_end,
            )
        )
    return records
```
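Chunk identifiers are corpus-global: each item's chunker resumes numbering at one past the previous item's last `chunk_id`, which keeps record position aligned with embedding-row position later. A sketch, assuming the `corpus` and `recipe_config` from above:

```python
chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
records = chunks_to_records(chunks)  # drops chunk text, keeps (item_id, span)
assert len(records) == len(chunks)
```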
```python
def write_chunks_jsonl(path: Path, records: Iterable[ChunkRecord]) -> None:
    """
    Write chunk records as newline-delimited JSON.

    :param path: Destination path.
    :type path: pathlib.Path
    :param records: Chunk records.
    :type records: Iterable[ChunkRecord]
    :return: None.
    :rtype: None
    """
    with path.open("w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record.model_dump(), separators=(",", ":")) + "\n")


def read_chunks_jsonl(path: Path) -> List[ChunkRecord]:
    """
    Read chunk records from a JSON Lines file.

    :param path: Source path.
    :type path: pathlib.Path
    :return: Chunk record list.
    :rtype: list[ChunkRecord]
    """
    records: List[ChunkRecord] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        records.append(ChunkRecord.model_validate(json.loads(line)))
    return records
```
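Each record serializes as one compact JSON object per line, and blank lines are tolerated on read; a round-trip sketch:

```python
from pathlib import Path

path = Path("chunks.jsonl")
original = [ChunkRecord(item_id="item-1", span_start=0, span_end=42)]
write_chunks_jsonl(path, original)
# File contents: {"item_id":"item-1","span_start":0,"span_end":42}
assert read_chunks_jsonl(path) == original
```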
```python
def write_embeddings(path: Path, embeddings: np.ndarray) -> None:
    """
    Write embeddings to disk.

    :param path: Destination path.
    :type path: pathlib.Path
    :param embeddings: Embedding matrix.
    :type embeddings: numpy.ndarray
    :return: None.
    :rtype: None
    """
    np.save(path, embeddings.astype(np.float32))


def read_embeddings(path: Path, *, mmap: bool) -> np.ndarray:
    """
    Read embeddings from disk.

    :param path: Source path.
    :type path: pathlib.Path
    :param mmap: Whether to memory-map the file.
    :type mmap: bool
    :return: Embedding matrix.
    :rtype: numpy.ndarray
    """
    mode = "r" if mmap else None
    return np.load(path, mmap_mode=mode)
```
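Matrices are stored as float32 `.npy` files; passing `mmap=True` maps the file read-only so batched scans only page in the rows they touch. A round-trip sketch:

```python
from pathlib import Path

import numpy as np

matrix = np.random.rand(1000, 384).astype(np.float32)
write_embeddings(Path("run.embeddings.npy"), matrix)

mapped = read_embeddings(Path("run.embeddings.npy"), mmap=True)
assert mapped.shape == (1000, 384)
assert mapped.dtype == np.float32
```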
```python
def cosine_similarity_scores(embeddings: np.ndarray, query_vector: np.ndarray) -> np.ndarray:
    """
    Compute cosine similarity scores for a query vector.

    The embedding matrix must already be L2-normalized.

    :param embeddings: Embedding matrix of shape (n, d).
    :type embeddings: numpy.ndarray
    :param query_vector: Query vector of shape (d,).
    :type query_vector: numpy.ndarray
    :return: Score vector of shape (n,).
    :rtype: numpy.ndarray
    """
    query_vector = query_vector.astype(np.float32).reshape(-1)
    query_vector = _l2_normalize_rows(query_vector.reshape(1, -1)).reshape(-1)
    return embeddings @ query_vector
```
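Since the stored rows are already unit-length, cosine similarity collapses to a single matrix-vector product; a self-contained numeric check:

```python
import numpy as np

rows = np.array([[3.0, 4.0], [1.0, 0.0]], dtype=np.float32)
rows /= np.linalg.norm(rows, axis=1, keepdims=True)  # pre-normalize, as the index does

query = np.array([3.0, 4.0], dtype=np.float32)
query /= np.linalg.norm(query)

scores = rows @ query
# scores ~= [1.0, 0.6]: identical direction scores 1.0; [1, 0] scores cos(theta) = 0.6
```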
```python
def artifact_paths_for_run(*, run_id: str, backend_id: str) -> Dict[str, str]:
    """
    Build deterministic artifact relative paths for an embedding index run.

    :param run_id: Run identifier.
    :type run_id: str
    :param backend_id: Backend identifier.
    :type backend_id: str
    :return: Mapping with keys embeddings and chunks.
    :rtype: dict[str, str]
    """
    prefix = f"{run_id}.{backend_id}"
    embeddings_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.embeddings.npy")
    chunks_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.chunks.jsonl")
    return {"embeddings": embeddings_relpath, "chunks": chunks_relpath}
```
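Artifact names derive only from the run and backend identifiers, so a run can always locate its own files. A sketch with illustrative values for the directory constants (the real values live in `biblicus.corpus`):

```python
# Assuming CORPUS_DIR_NAME == ".biblicus" and RUNS_DIR_NAME == "runs",
# for illustration only:
paths = artifact_paths_for_run(run_id="run-0001", backend_id="embedding-index-file")
# paths["embeddings"] -> ".biblicus/runs/run-0001.embedding-index-file.embeddings.npy"
# paths["chunks"]     -> ".biblicus/runs/run-0001.embedding-index-file.chunks.jsonl"
```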
biblicus/backends/embedding_index_file.py
ADDED
@@ -0,0 +1,266 @@
```python
"""
Embedding-index retrieval backend that reads the embedding matrix via memory mapping.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Optional

import numpy as np

from ..corpus import Corpus
from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
from ..time import utc_now_iso
from .embedding_index_common import (
    ChunkRecord,
    EmbeddingIndexRecipeConfig,
    artifact_paths_for_run,
    chunks_to_records,
    collect_chunks,
    cosine_similarity_scores,
    read_chunks_jsonl,
    read_embeddings,
    resolve_extraction_reference,
    write_chunks_jsonl,
    write_embeddings,
)
from .scan import _build_snippet


class EmbeddingIndexFileBackend:
    """
    Embedding retrieval backend using memory-mapped similarity scanning.
    """

    backend_id = "embedding-index-file"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Build an embedding index run by chunking text payloads and materializing embeddings.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: biblicus.models.RetrievalRun
        """
        recipe_config = EmbeddingIndexRecipeConfig.model_validate(config)
        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)

        provider = recipe_config.embedding_provider.build_provider()
        embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)

        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=recipe_config.model_dump(),
        )
        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)

        write_embeddings(embeddings_path, embeddings)
        write_chunks_jsonl(chunks_path, chunks_to_records(chunks))

        stats = {
            "items": len(corpus.load_catalog().items),
            "text_items": text_items,
            "chunks": len(chunks),
            "dimensions": (
                int(embeddings.shape[1])
                if embeddings.size
                else recipe_config.embedding_provider.dimensions
            ),
        }
        run = run.model_copy(
            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
        )
        corpus.write_run(run)
        return run
```
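A hedged sketch of building a run end to end; the `Corpus` construction and the provider payload are assumptions about the surrounding biblicus API, not part of this diff:

```python
backend = EmbeddingIndexFileBackend()
run = backend.build_run(
    corpus,
    recipe_name="embedding-index",
    config={"embedding_provider": provider_config_payload},
)
# run.stats carries {"items": ..., "text_items": ..., "chunks": ..., "dimensions": ...}
# and run.artifact_paths points at the .npy and .jsonl files written above.
```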
```python
    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query an embedding index run and return ranked evidence.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: biblicus.models.RetrievalRun
        :param query_text: Query text to embed.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: biblicus.models.QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: biblicus.models.RetrievalResult
        """
        recipe_config = EmbeddingIndexRecipeConfig.model_validate(run.recipe.config)
        extraction_reference = resolve_extraction_reference(corpus, recipe_config)

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        if not embeddings_path.is_file() or not chunks_path.is_file():
            raise FileNotFoundError("Embedding index artifacts are missing for this run")

        embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
        chunk_records = read_chunks_jsonl(chunks_path)
        if embeddings.shape[0] != len(chunk_records):
            raise ValueError(
                "Embedding index artifacts are inconsistent: "
                "embeddings row count does not match chunk record count"
            )

        provider = recipe_config.embedding_provider.build_provider()
        query_embedding = provider.embed_texts([query_text]).astype(np.float32)
        if query_embedding.shape[0] != 1:
            raise ValueError("Embedding provider returned an invalid query embedding shape")

        candidates = _top_indices_batched(
            embeddings=embeddings,
            query_vector=query_embedding[0],
            limit=_candidate_limit(budget.max_total_items + budget.offset),
        )
        evidence_items = _build_evidence(
            corpus,
            run=run,
            recipe_config=recipe_config,
            candidates=candidates,
            embeddings=embeddings,
            query_vector=query_embedding[0],
            chunk_records=chunk_records,
            extraction_reference=extraction_reference,
        )
        ranked = [
            item.model_copy(
                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
            )
            for index, item in enumerate(evidence_items, start=1)
        ]
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(evidence_items), "returned": len(evidence)},
        )
```
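Before budgeting, the backend over-fetches by a factor of ten so that offsets and any downstream trimming in `apply_budget` still have enough ranked candidates to draw from; a small sketch of the arithmetic:

```python
# For max_total_items=5 and offset=10, the scan keeps up to
# _candidate_limit(5 + 10) == 150 top-scoring chunks before apply_budget()
# trims the ranked list down to the requested window.
assert _candidate_limit(15) == 150
assert _candidate_limit(0) == 1  # never asks for fewer than one candidate
```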
```python
def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
    return max(1, int(max_total_items) * int(multiplier))


@dataclass(frozen=True)
class _ScoredIndex:
    score: float
    index: int


def _top_indices_batched(
    *, embeddings: np.ndarray, query_vector: np.ndarray, limit: int, batch_rows: int = 4096
) -> List[int]:
    if embeddings.size == 0:
        return []
    limit = min(int(limit), int(embeddings.shape[0]))

    best: List[_ScoredIndex] = []
    for start in range(0, embeddings.shape[0], int(batch_rows)):
        end = min(start + int(batch_rows), embeddings.shape[0])
        scores = cosine_similarity_scores(embeddings[start:end], query_vector)
        batch_limit = min(limit, int(scores.size))
        if batch_limit <= 0:
            continue
        indices = np.argpartition(-scores, batch_limit - 1)[:batch_limit]
        for local_index in indices:
            global_index = int(start + int(local_index))
            best.append(_ScoredIndex(score=float(scores[int(local_index)]), index=global_index))

    best.sort(key=lambda item: (-item.score, item.index))
    return [int(item.index) for item in best[:limit]]
```
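`_top_indices_batched` keeps at most `limit` rows per 4096-row batch via `np.argpartition`, which costs O(n) per batch, and only fully sorts the small surviving pool at the end. A self-contained sketch of the partition step:

```python
import numpy as np

scores = np.array([0.2, 0.9, 0.5, 0.7], dtype=np.float32)
k = 2
# argpartition(-scores, k - 1) moves the k highest scores (in arbitrary order)
# into the first k slots without sorting the rest.
top_unordered = np.argpartition(-scores, k - 1)[:k]
top_sorted = top_unordered[np.argsort(-scores[top_unordered])]
# top_sorted -> array([1, 3]), i.e. scores 0.9 and 0.7
```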
```python
def _build_evidence(
    corpus: Corpus,
    *,
    run: RetrievalRun,
    recipe_config: EmbeddingIndexRecipeConfig,
    candidates: List[int],
    embeddings: np.ndarray,
    query_vector: np.ndarray,
    chunk_records: List[ChunkRecord],
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    catalog = corpus.load_catalog()
    evidence_items: List[Evidence] = []
    for idx in candidates:
        record = chunk_records[idx]
        catalog_item = catalog.items[record.item_id]
        text = _load_text_for_evidence(
            corpus,
            item_id=record.item_id,
            relpath=str(getattr(catalog_item, "relpath")),
            media_type=str(getattr(catalog_item, "media_type")),
            extraction_reference=extraction_reference,
        )
        snippet = _build_snippet(
            text, (record.span_start, record.span_end), max_chars=recipe_config.snippet_characters
        )
        score = float(cosine_similarity_scores(embeddings[idx : idx + 1], query_vector)[0])
        evidence_items.append(
            Evidence(
                item_id=record.item_id,
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=str(getattr(catalog_item, "media_type")),
                score=score,
                rank=1,
                text=snippet,
                content_ref=None,
                span_start=record.span_start,
                span_end=record.span_end,
                stage=EmbeddingIndexFileBackend.backend_id,
                stage_scores=None,
                recipe_id=run.recipe.recipe_id,
                run_id=run.run_id,
                hash=hash_text(snippet),
            )
        )
    return evidence_items


def _load_text_for_evidence(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    from .embedding_index_common import _load_text_from_item

    return _load_text_from_item(
        corpus,
        item_id=item_id,
        relpath=relpath,
        media_type=media_type,
        extraction_reference=extraction_reference,
    )
```