biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +21 -1
- biblicus/analysis/markov.py +35 -3
- biblicus/backends/__init__.py +6 -2
- biblicus/backends/embedding_index_common.py +334 -0
- biblicus/backends/embedding_index_file.py +272 -0
- biblicus/backends/embedding_index_inmemory.py +270 -0
- biblicus/backends/hybrid.py +8 -5
- biblicus/backends/scan.py +1 -0
- biblicus/backends/sqlite_full_text_search.py +1 -1
- biblicus/backends/{vector.py → tf_vector.py} +28 -35
- biblicus/chunking.py +396 -0
- biblicus/cli.py +75 -25
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1060 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +129 -0
- biblicus/corpus.py +117 -16
- biblicus/embedding_providers.py +122 -0
- biblicus/errors.py +24 -0
- biblicus/frontmatter.py +2 -0
- biblicus/knowledge_base.py +1 -1
- biblicus/models.py +15 -3
- biblicus/retrieval.py +7 -2
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +2 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding-index retrieval backend that reads the embedding matrix via memory mapping.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from ..corpus import Corpus
|
|
13
|
+
from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
|
|
14
|
+
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
15
|
+
from ..time import utc_now_iso
|
|
16
|
+
from .embedding_index_common import (
|
|
17
|
+
ChunkRecord,
|
|
18
|
+
EmbeddingIndexRecipeConfig,
|
|
19
|
+
_build_snippet,
|
|
20
|
+
_extract_span_text,
|
|
21
|
+
artifact_paths_for_run,
|
|
22
|
+
chunks_to_records,
|
|
23
|
+
collect_chunks,
|
|
24
|
+
cosine_similarity_scores,
|
|
25
|
+
read_chunks_jsonl,
|
|
26
|
+
read_embeddings,
|
|
27
|
+
resolve_extraction_reference,
|
|
28
|
+
write_chunks_jsonl,
|
|
29
|
+
write_embeddings,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class EmbeddingIndexFileBackend:
    """
    Embedding retrieval backend using memory-mapped similarity scanning.
    """

    # Stable identifier for this backend; also used to namespace the run's
    # artifact paths (see artifact_paths_for_run calls below).
    backend_id = "embedding-index-file"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Build an embedding index run by chunking text payloads and materializing embeddings.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: biblicus.models.RetrievalRun
        """
        recipe_config = EmbeddingIndexRecipeConfig.model_validate(config)
        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)

        # Embed every chunk in one provider call and normalize dtype so the
        # on-disk matrix matches what query() expects.
        provider = recipe_config.embedding_provider.build_provider()
        embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)

        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=recipe_config.model_dump(),
        )
        # Run manifest is created empty first so run_id exists, then updated
        # below with the real stats and artifact paths via model_copy.
        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)

        write_embeddings(embeddings_path, embeddings)
        write_chunks_jsonl(chunks_path, chunks_to_records(chunks))

        stats = {
            "items": len(corpus.load_catalog().items),
            "text_items": text_items,
            "chunks": len(chunks),
            # With zero chunks the matrix has no second axis to inspect, so
            # fall back to the provider's configured dimensionality.
            "dimensions": (
                int(embeddings.shape[1])
                if embeddings.size
                else recipe_config.embedding_provider.dimensions
            ),
        }
        run = run.model_copy(
            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
        )
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query an embedding index run and return ranked evidence.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: biblicus.models.RetrievalRun
        :param query_text: Query text to embed.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: biblicus.models.QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: biblicus.models.RetrievalResult
        """
        recipe_config = EmbeddingIndexRecipeConfig.model_validate(run.recipe.config)
        extraction_reference = resolve_extraction_reference(corpus, recipe_config)

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        if not embeddings_path.is_file() or not chunks_path.is_file():
            raise FileNotFoundError("Embedding index artifacts are missing for this run")

        # NOTE(review): .astype() copies by default, so chaining it onto a
        # memory-mapped read materializes the full matrix in RAM and defeats
        # the mmap/batched design — consider astype(np.float32, copy=False)
        # or converting per batch. Confirm intent.
        embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
        chunk_records = read_chunks_jsonl(chunks_path)
        # Rows and chunk records are correlated by position; a mismatch means
        # the artifacts were corrupted or written by different builds.
        if embeddings.shape[0] != len(chunk_records):
            raise ValueError(
                "Embedding index artifacts are inconsistent: "
                "embeddings row count does not match chunk record count"
            )

        provider = recipe_config.embedding_provider.build_provider()
        query_embedding = provider.embed_texts([query_text]).astype(np.float32)
        if query_embedding.shape[0] != 1:
            raise ValueError("Embedding provider returned an invalid query embedding shape")

        # NOTE(review): reuses the cache-size config as the scan batch size;
        # assumes maximum_cache_total_items exists on the base recipe config —
        # TODO confirm both.
        batch_rows = recipe_config.maximum_cache_total_items or 4096
        # Over-fetch candidates (offset included) so apply_budget can still
        # page and trim without starving the result set.
        candidates = _top_indices_batched(
            embeddings=embeddings,
            query_vector=query_embedding[0],
            limit=_candidate_limit(budget.max_total_items + budget.offset),
            batch_rows=batch_rows,
        )
        evidence_items = _build_evidence(
            corpus,
            run=run,
            recipe_config=recipe_config,
            candidates=candidates,
            embeddings=embeddings,
            query_vector=query_embedding[0],
            chunk_records=chunk_records,
            extraction_reference=extraction_reference,
        )
        # Assign final 1-based ranks in candidate order before budgeting.
        ranked = [
            item.model_copy(
                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
            )
            for index, item in enumerate(evidence_items, start=1)
        ]
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(evidence_items), "returned": len(evidence)},
        )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
|
|
173
|
+
return max(1, int(max_total_items) * int(multiplier))
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@dataclass(frozen=True)
class _ScoredIndex:
    """A similarity score paired with the embeddings-matrix row it came from."""

    # Cosine similarity of this row against the query vector.
    score: float
    # Global row index into the embeddings matrix / chunk record list.
    index: int
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _top_indices_batched(
    *, embeddings: np.ndarray, query_vector: np.ndarray, limit: int, batch_rows: int = 4096
) -> List[int]:
    """Return row indices of the highest cosine scores, scanning in batches.

    Scores are computed batch-by-batch so only ``batch_rows`` rows of the
    (possibly memory-mapped) matrix are touched at a time. Results are ordered
    best-first; ties break toward the lower row index.
    """
    if embeddings.size == 0:
        return []
    total_rows = int(embeddings.shape[0])
    keep = min(int(limit), total_rows)

    scored: List[_ScoredIndex] = []
    step = int(batch_rows)
    for offset in range(0, total_rows, step):
        stop = min(offset + step, total_rows)
        batch_scores = cosine_similarity_scores(embeddings[offset:stop], query_vector)
        take = min(keep, int(batch_scores.size))
        if take <= 0:
            continue
        # Keep only each batch's local top-k; the global merge happens below.
        top_local = np.argpartition(-batch_scores, take - 1)[:take]
        scored.extend(
            _ScoredIndex(score=float(batch_scores[int(pos)]), index=int(offset + int(pos)))
            for pos in top_local
        )

    scored.sort(key=lambda entry: (-entry.score, entry.index))
    return [int(entry.index) for entry in scored[:keep]]
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _build_evidence(
    corpus: Corpus,
    *,
    run: RetrievalRun,
    recipe_config: EmbeddingIndexRecipeConfig,
    candidates: List[int],
    embeddings: np.ndarray,
    query_vector: np.ndarray,
    chunk_records: List[ChunkRecord],
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Materialize Evidence objects for the selected candidate chunk indices.

    :param corpus: Corpus associated with the run.
    :type corpus: Corpus
    :param run: Run manifest the evidence belongs to.
    :type run: biblicus.models.RetrievalRun
    :param recipe_config: Recipe configuration (snippet sizing).
    :type recipe_config: EmbeddingIndexRecipeConfig
    :param candidates: Row indices into the embeddings matrix / chunk records.
    :type candidates: list[int]
    :param embeddings: Embedding matrix aligned with chunk_records by row.
    :type embeddings: numpy.ndarray
    :param query_vector: Query embedding vector.
    :type query_vector: numpy.ndarray
    :param chunk_records: Chunk records aligned with the embedding rows.
    :type chunk_records: list[ChunkRecord]
    :param extraction_reference: Optional extraction run to source text from.
    :type extraction_reference: Optional[ExtractionRunReference]
    :return: Evidence items in candidate order (ranks are placeholders).
    :rtype: list[biblicus.models.Evidence]
    """
    catalog = corpus.load_catalog()
    evidence_items: List[Evidence] = []
    for idx in candidates:
        record = chunk_records[idx]
        catalog_item = catalog.items[record.item_id]
        span = (record.span_start, record.span_end)
        # Direct attribute access (was getattr with a constant name, which
        # raises the same AttributeError with extra indirection).
        text = _load_text_for_evidence(
            corpus,
            item_id=record.item_id,
            relpath=str(catalog_item.relpath),
            media_type=str(catalog_item.media_type),
            extraction_reference=extraction_reference,
        )
        span_text = _build_snippet(text, span, recipe_config.snippet_characters)
        if span_text is None:
            # Fall back to the raw span when no snippet could be built.
            span_text = _extract_span_text(text, span)
        # Re-score the single candidate row; only one row is touched at a
        # time so a memory-mapped matrix stays cheap here.
        score = float(cosine_similarity_scores(embeddings[idx : idx + 1], query_vector)[0])
        evidence_items.append(
            Evidence(
                item_id=record.item_id,
                # These two genuinely need defaults: items may lack them.
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=str(catalog_item.media_type),
                score=score,
                rank=1,  # placeholder; the caller re-ranks via model_copy
                text=span_text,
                content_ref=None,
                span_start=record.span_start,
                span_end=record.span_end,
                stage=EmbeddingIndexFileBackend.backend_id,
                stage_scores=None,
                recipe_id=run.recipe.recipe_id,
                run_id=run.run_id,
                metadata=getattr(catalog_item, "metadata", {}) or {},
                hash=hash_text(span_text or ""),
            )
        )
    return evidence_items
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _load_text_for_evidence(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    """
    Load the text payload for a catalog item so evidence snippets can be built.

    Thin pass-through to the shared helper in ``embedding_index_common``.

    :param corpus: Corpus to read from.
    :type corpus: Corpus
    :param item_id: Catalog item identifier.
    :type item_id: str
    :param relpath: Item path relative to the corpus root.
    :type relpath: str
    :param media_type: Item media type, forwarded to the loader.
    :type media_type: str
    :param extraction_reference: Optional extraction run to source text from.
    :type extraction_reference: Optional[ExtractionRunReference]
    :return: Text payload when one could be loaded, otherwise ``None``.
    :rtype: Optional[str]
    """
    # Function-scoped import of a private helper; presumably kept local to
    # avoid exposing it at module level — confirm.
    from .embedding_index_common import _load_text_from_item

    return _load_text_from_item(
        corpus,
        item_id=item_id,
        relpath=relpath,
        media_type=media_type,
        extraction_reference=extraction_reference,
    )
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding-index retrieval backend that loads the full embedding matrix into memory at query time.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from pydantic import ConfigDict, Field
|
|
11
|
+
|
|
12
|
+
from ..corpus import Corpus
|
|
13
|
+
from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
|
|
14
|
+
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
15
|
+
from ..time import utc_now_iso
|
|
16
|
+
from .embedding_index_common import (
|
|
17
|
+
ChunkRecord,
|
|
18
|
+
EmbeddingIndexRecipeConfig,
|
|
19
|
+
_build_snippet,
|
|
20
|
+
_extract_span_text,
|
|
21
|
+
artifact_paths_for_run,
|
|
22
|
+
chunks_to_records,
|
|
23
|
+
collect_chunks,
|
|
24
|
+
cosine_similarity_scores,
|
|
25
|
+
read_chunks_jsonl,
|
|
26
|
+
read_embeddings,
|
|
27
|
+
resolve_extraction_reference,
|
|
28
|
+
write_chunks_jsonl,
|
|
29
|
+
write_embeddings,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
    """
    Configuration for embedding-index-inmemory retrieval.

    :ivar maximum_cache_total_items: Maximum chunks allowed for in-memory query loading.
    :vartype maximum_cache_total_items: int
    """

    # Reject unknown keys so configuration typos fail validation loudly.
    model_config = ConfigDict(extra="forbid")

    # Hard cap on chunk count: build_run refuses to build a larger index,
    # because query() loads the full embedding matrix into memory.
    maximum_cache_total_items: int = Field(default=25000, ge=1)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class EmbeddingIndexInMemoryBackend:
    """
    Embedding retrieval backend using an in-memory similarity scan.
    """

    # Stable identifier for this backend; also used to namespace the run's
    # artifact paths (see artifact_paths_for_run calls below).
    backend_id = "embedding-index-inmemory"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Build an embedding index run by chunking text payloads and materializing embeddings.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: biblicus.models.RetrievalRun
        :raises ValueError: If the corpus produces more chunks than
            ``maximum_cache_total_items`` allows for in-memory loading.
        """
        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(config)
        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
        # Enforce the memory bound at build time; query() will load the whole
        # matrix, so refuse to create an index it could not safely serve.
        if len(chunks) > recipe_config.maximum_cache_total_items:
            raise ValueError(
                "embedding-index-inmemory exceeded maximum_cache_total_items. "
                "Use embedding-index-file or increase maximum_cache_total_items."
            )

        provider = recipe_config.embedding_provider.build_provider()
        chunk_texts = [chunk.text for chunk in chunks]
        embeddings = provider.embed_texts(chunk_texts)
        # Normalize dtype so the stored matrix matches what query() expects.
        embeddings = embeddings.astype(np.float32)

        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=recipe_config.model_dump(),
        )
        # Run manifest is created empty first so run_id exists, then updated
        # below with the real stats and artifact paths via model_copy.
        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)

        write_embeddings(embeddings_path, embeddings)
        write_chunks_jsonl(chunks_path, chunks_to_records(chunks))

        stats = {
            "items": len(corpus.load_catalog().items),
            "text_items": text_items,
            "chunks": len(chunks),
            # With zero chunks the matrix has no second axis to inspect, so
            # fall back to the provider's configured dimensionality.
            "dimensions": (
                int(embeddings.shape[1])
                if embeddings.size
                else recipe_config.embedding_provider.dimensions
            ),
        }
        run = run.model_copy(
            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
        )
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query an embedding index run and return ranked evidence.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: biblicus.models.RetrievalRun
        :param query_text: Query text to embed.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: biblicus.models.QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: biblicus.models.RetrievalResult
        """
        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(run.recipe.config)
        extraction_reference = resolve_extraction_reference(corpus, recipe_config)

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        if not embeddings_path.is_file() or not chunks_path.is_file():
            raise FileNotFoundError("Embedding index artifacts are missing for this run")

        # Full in-memory load (mmap=False) — acceptable because build_run
        # bounded the index size via maximum_cache_total_items.
        embeddings = read_embeddings(embeddings_path, mmap=False).astype(np.float32)
        chunk_records = read_chunks_jsonl(chunks_path)
        # Rows and chunk records are correlated by position; a mismatch means
        # the artifacts were corrupted or written by different builds.
        if embeddings.shape[0] != len(chunk_records):
            raise ValueError(
                "Embedding index artifacts are inconsistent: "
                "embeddings row count does not match chunk record count"
            )

        provider = recipe_config.embedding_provider.build_provider()
        query_embedding = provider.embed_texts([query_text]).astype(np.float32)
        if query_embedding.shape[0] != 1:
            raise ValueError("Embedding provider returned an invalid query embedding shape")
        # Single vectorized pass over the whole matrix.
        scores = cosine_similarity_scores(embeddings, query_embedding[0])

        # Over-fetch candidates (offset included) so apply_budget can still
        # page and trim without starving the result set.
        candidates = _top_indices(
            scores,
            limit=_candidate_limit(budget.max_total_items + budget.offset),
        )
        evidence_items = _build_evidence(
            corpus,
            run=run,
            recipe_config=recipe_config,
            candidates=candidates,
            scores=scores,
            chunk_records=chunk_records,
            extraction_reference=extraction_reference,
        )
        # Assign final 1-based ranks in candidate order before budgeting.
        ranked = [
            item.model_copy(
                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
            )
            for index, item in enumerate(evidence_items, start=1)
        ]
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(evidence_items), "returned": len(evidence)},
        )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
|
|
190
|
+
return max(1, int(max_total_items) * int(multiplier))
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _top_indices(scores: np.ndarray, *, limit: int) -> List[int]:
|
|
194
|
+
if scores.size == 0:
|
|
195
|
+
return []
|
|
196
|
+
limit = min(int(limit), int(scores.size))
|
|
197
|
+
indices = np.argpartition(-scores, limit - 1)[:limit]
|
|
198
|
+
sorted_indices = indices[np.argsort(-scores[indices])]
|
|
199
|
+
return [int(index) for index in sorted_indices]
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _build_evidence(
    corpus: Corpus,
    *,
    run: RetrievalRun,
    recipe_config: EmbeddingIndexInMemoryRecipeConfig,
    candidates: List[int],
    scores: np.ndarray,
    chunk_records: List[ChunkRecord],
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Materialize Evidence objects for the selected candidate chunk indices.

    :param corpus: Corpus associated with the run.
    :type corpus: Corpus
    :param run: Run manifest the evidence belongs to.
    :type run: biblicus.models.RetrievalRun
    :param recipe_config: Recipe configuration (snippet sizing).
    :type recipe_config: EmbeddingIndexInMemoryRecipeConfig
    :param candidates: Row indices into the score array / chunk records.
    :type candidates: list[int]
    :param scores: Precomputed similarity scores aligned with chunk_records.
    :type scores: numpy.ndarray
    :param chunk_records: Chunk records aligned with the score rows.
    :type chunk_records: list[ChunkRecord]
    :param extraction_reference: Optional extraction run to source text from.
    :type extraction_reference: Optional[ExtractionRunReference]
    :return: Evidence items in candidate order (ranks are placeholders).
    :rtype: list[biblicus.models.Evidence]
    """
    catalog = corpus.load_catalog()
    evidence_items: List[Evidence] = []
    for idx in candidates:
        record = chunk_records[idx]
        item_id = record.item_id
        span_start = record.span_start
        span_end = record.span_end
        catalog_item = catalog.items[item_id]
        # Direct attribute access (was getattr with a constant name, which
        # raises the same AttributeError with extra indirection).
        relpath = str(catalog_item.relpath)
        media_type = str(catalog_item.media_type)
        text = _load_text_for_evidence(
            corpus,
            item_id=item_id,
            relpath=relpath,
            media_type=media_type,
            extraction_reference=extraction_reference,
        )
        span_text = _build_snippet(text, (span_start, span_end), recipe_config.snippet_characters)
        if span_text is None:
            # Fall back to the raw span when no snippet could be built.
            span_text = _extract_span_text(text, (span_start, span_end))
        evidence_items.append(
            Evidence(
                item_id=item_id,
                # These two genuinely need defaults: items may lack them.
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=media_type,
                score=float(scores[idx]),
                rank=1,  # placeholder; the caller re-ranks via model_copy
                text=span_text,
                content_ref=None,
                span_start=span_start,
                span_end=span_end,
                stage=EmbeddingIndexInMemoryBackend.backend_id,
                stage_scores=None,
                recipe_id=run.recipe.recipe_id,
                run_id=run.run_id,
                metadata=getattr(catalog_item, "metadata", {}) or {},
                hash=hash_text(span_text or ""),
            )
        )
    return evidence_items
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _load_text_for_evidence(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    """
    Load the text payload for a catalog item so evidence snippets can be built.

    Thin pass-through to the shared helper in ``embedding_index_common``.

    :param corpus: Corpus to read from.
    :type corpus: Corpus
    :param item_id: Catalog item identifier.
    :type item_id: str
    :param relpath: Item path relative to the corpus root.
    :type relpath: str
    :param media_type: Item media type, forwarded to the loader.
    :type media_type: str
    :param extraction_reference: Optional extraction run to source text from.
    :type extraction_reference: Optional[ExtractionRunReference]
    :return: Text payload when one could be loaded, otherwise ``None``.
    :rtype: Optional[str]
    """
    # Function-scoped import of a private helper; presumably kept local to
    # avoid exposing it at module level — confirm.
    from .embedding_index_common import _load_text_from_item

    return _load_text_from_item(
        corpus,
        item_id=item_id,
        relpath=relpath,
        media_type=media_type,
        extraction_reference=extraction_reference,
    )
|
biblicus/backends/hybrid.py
CHANGED
|
@@ -35,7 +35,7 @@ class HybridRecipeConfig(BaseModel):
|
|
|
35
35
|
model_config = ConfigDict(extra="forbid")
|
|
36
36
|
|
|
37
37
|
lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
|
|
38
|
-
embedding_backend: str = Field(default="vector", min_length=1)
|
|
38
|
+
embedding_backend: str = Field(default="tf-vector", min_length=1)
|
|
39
39
|
lexical_weight: float = Field(default=0.5, ge=0, le=1)
|
|
40
40
|
embedding_weight: float = Field(default=0.5, ge=0, le=1)
|
|
41
41
|
lexical_config: Dict[str, object] = Field(default_factory=dict)
|
|
@@ -217,18 +217,20 @@ def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> Que
|
|
|
217
217
|
:return: Expanded budget for component backends.
|
|
218
218
|
:rtype: QueryBudget
|
|
219
219
|
"""
|
|
220
|
-
|
|
220
|
+
maximum_total_characters = budget.maximum_total_characters
|
|
221
221
|
expanded_characters = (
|
|
222
|
-
|
|
222
|
+
maximum_total_characters * multiplier if maximum_total_characters is not None else None
|
|
223
223
|
)
|
|
224
224
|
expanded_max_items_per_source = (
|
|
225
225
|
budget.max_items_per_source * multiplier
|
|
226
226
|
if budget.max_items_per_source is not None
|
|
227
227
|
else None
|
|
228
228
|
)
|
|
229
|
+
requested_items = budget.max_total_items + budget.offset
|
|
229
230
|
return QueryBudget(
|
|
230
|
-
max_total_items=
|
|
231
|
-
|
|
231
|
+
max_total_items=requested_items * multiplier,
|
|
232
|
+
offset=0,
|
|
233
|
+
maximum_total_characters=expanded_characters,
|
|
232
234
|
max_items_per_source=expanded_max_items_per_source,
|
|
233
235
|
)
|
|
234
236
|
|
|
@@ -283,6 +285,7 @@ def _fuse_evidence(
|
|
|
283
285
|
stage_scores={"lexical": lexical_score, "embedding": embedding_score},
|
|
284
286
|
recipe_id="",
|
|
285
287
|
run_id="",
|
|
288
|
+
metadata=base_evidence.metadata,
|
|
286
289
|
hash=base_evidence.hash,
|
|
287
290
|
)
|
|
288
291
|
)
|
biblicus/backends/scan.py
CHANGED
|
@@ -231,7 +231,7 @@ class SqliteFullTextSearchBackend:
|
|
|
231
231
|
candidates = _query_full_text_search_index(
|
|
232
232
|
db_path=db_path,
|
|
233
233
|
query_text=" ".join(filtered_tokens),
|
|
234
|
-
limit=_candidate_limit(budget.max_total_items),
|
|
234
|
+
limit=_candidate_limit(budget.max_total_items + budget.offset),
|
|
235
235
|
snippet_characters=recipe_config.snippet_characters,
|
|
236
236
|
)
|
|
237
237
|
sorted_candidates = _rank_candidates(candidates)
|