biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Embedding-index
|
|
2
|
+
Embedding-index retriever that loads the full embedding matrix into memory at query time.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -10,13 +10,26 @@ import numpy as np
|
|
|
10
10
|
from pydantic import ConfigDict, Field
|
|
11
11
|
|
|
12
12
|
from ..corpus import Corpus
|
|
13
|
-
from ..models import
|
|
14
|
-
|
|
13
|
+
from ..models import (
|
|
14
|
+
Evidence,
|
|
15
|
+
ExtractionSnapshotReference,
|
|
16
|
+
QueryBudget,
|
|
17
|
+
RetrievalResult,
|
|
18
|
+
RetrievalSnapshot,
|
|
19
|
+
)
|
|
20
|
+
from ..retrieval import (
|
|
21
|
+
apply_budget,
|
|
22
|
+
create_configuration_manifest,
|
|
23
|
+
create_snapshot_manifest,
|
|
24
|
+
hash_text,
|
|
25
|
+
)
|
|
15
26
|
from ..time import utc_now_iso
|
|
16
27
|
from .embedding_index_common import (
|
|
17
28
|
ChunkRecord,
|
|
18
|
-
|
|
19
|
-
|
|
29
|
+
EmbeddingIndexConfiguration,
|
|
30
|
+
_build_snippet,
|
|
31
|
+
_extract_span_text,
|
|
32
|
+
artifact_paths_for_snapshot,
|
|
20
33
|
chunks_to_records,
|
|
21
34
|
collect_chunks,
|
|
22
35
|
cosine_similarity_scores,
|
|
@@ -26,68 +39,74 @@ from .embedding_index_common import (
|
|
|
26
39
|
write_chunks_jsonl,
|
|
27
40
|
write_embeddings,
|
|
28
41
|
)
|
|
29
|
-
from .scan import _build_snippet
|
|
30
42
|
|
|
31
43
|
|
|
32
|
-
class
|
|
44
|
+
class EmbeddingIndexInMemoryConfiguration(EmbeddingIndexConfiguration):
|
|
33
45
|
"""
|
|
34
46
|
Configuration for embedding-index-inmemory retrieval.
|
|
35
47
|
|
|
36
|
-
:ivar
|
|
37
|
-
:vartype
|
|
48
|
+
:ivar maximum_cache_total_items: Maximum chunks allowed for in-memory query loading.
|
|
49
|
+
:vartype maximum_cache_total_items: int
|
|
38
50
|
"""
|
|
39
51
|
|
|
40
52
|
model_config = ConfigDict(extra="forbid")
|
|
41
53
|
|
|
42
|
-
|
|
54
|
+
maximum_cache_total_items: int = Field(default=25000, ge=1)
|
|
43
55
|
|
|
44
56
|
|
|
45
|
-
class
|
|
57
|
+
class EmbeddingIndexInMemoryRetriever:
|
|
46
58
|
"""
|
|
47
|
-
Embedding retrieval
|
|
59
|
+
Embedding retrieval retriever using an in-memory similarity scan.
|
|
48
60
|
"""
|
|
49
61
|
|
|
50
|
-
|
|
62
|
+
retriever_id = "embedding-index-inmemory"
|
|
51
63
|
|
|
52
|
-
def
|
|
53
|
-
self, corpus: Corpus, *,
|
|
54
|
-
) ->
|
|
64
|
+
def build_snapshot(
|
|
65
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
66
|
+
) -> RetrievalSnapshot:
|
|
55
67
|
"""
|
|
56
|
-
Build an embedding index
|
|
68
|
+
Build an embedding index snapshot by chunking text payloads and materializing embeddings.
|
|
57
69
|
|
|
58
70
|
:param corpus: Corpus to build against.
|
|
59
71
|
:type corpus: Corpus
|
|
60
|
-
:param
|
|
61
|
-
:type
|
|
62
|
-
:param
|
|
63
|
-
:type
|
|
64
|
-
:return:
|
|
65
|
-
:rtype: biblicus.models.
|
|
72
|
+
:param configuration_name: Human-readable configuration name.
|
|
73
|
+
:type configuration_name: str
|
|
74
|
+
:param configuration: Retriever-specific configuration values.
|
|
75
|
+
:type configuration: dict[str, object]
|
|
76
|
+
:return: Snapshot manifest describing the build.
|
|
77
|
+
:rtype: biblicus.models.RetrievalSnapshot
|
|
66
78
|
"""
|
|
67
|
-
|
|
68
|
-
chunks, text_items = collect_chunks(corpus,
|
|
69
|
-
if len(chunks) >
|
|
79
|
+
parsed_config = EmbeddingIndexInMemoryConfiguration.model_validate(configuration)
|
|
80
|
+
chunks, text_items = collect_chunks(corpus, configuration=parsed_config)
|
|
81
|
+
if len(chunks) > parsed_config.maximum_cache_total_items:
|
|
70
82
|
raise ValueError(
|
|
71
|
-
"embedding-index-inmemory exceeded
|
|
72
|
-
"Use embedding-index-file or increase
|
|
83
|
+
"embedding-index-inmemory exceeded maximum_cache_total_items. "
|
|
84
|
+
"Use embedding-index-file or increase maximum_cache_total_items."
|
|
73
85
|
)
|
|
74
86
|
|
|
75
|
-
provider =
|
|
87
|
+
provider = parsed_config.embedding_provider.build_provider()
|
|
76
88
|
chunk_texts = [chunk.text for chunk in chunks]
|
|
77
89
|
embeddings = provider.embed_texts(chunk_texts)
|
|
78
90
|
embeddings = embeddings.astype(np.float32)
|
|
79
91
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
name=
|
|
83
|
-
|
|
92
|
+
configuration_manifest = create_configuration_manifest(
|
|
93
|
+
retriever_id=self.retriever_id,
|
|
94
|
+
name=configuration_name,
|
|
95
|
+
configuration=parsed_config.model_dump(),
|
|
96
|
+
)
|
|
97
|
+
snapshot = create_snapshot_manifest(
|
|
98
|
+
corpus,
|
|
99
|
+
configuration=configuration_manifest,
|
|
100
|
+
stats={},
|
|
101
|
+
snapshot_artifacts=[],
|
|
84
102
|
)
|
|
85
|
-
run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
|
|
86
103
|
|
|
87
|
-
paths =
|
|
104
|
+
paths = artifact_paths_for_snapshot(
|
|
105
|
+
snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
|
|
106
|
+
)
|
|
88
107
|
embeddings_path = corpus.root / paths["embeddings"]
|
|
89
108
|
chunks_path = corpus.root / paths["chunks"]
|
|
90
|
-
corpus.
|
|
109
|
+
corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
91
110
|
|
|
92
111
|
write_embeddings(embeddings_path, embeddings)
|
|
93
112
|
write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
|
|
@@ -99,30 +118,33 @@ class EmbeddingIndexInMemoryBackend:
|
|
|
99
118
|
"dimensions": (
|
|
100
119
|
int(embeddings.shape[1])
|
|
101
120
|
if embeddings.size
|
|
102
|
-
else
|
|
121
|
+
else parsed_config.embedding_provider.dimensions
|
|
103
122
|
),
|
|
104
123
|
}
|
|
105
|
-
|
|
106
|
-
update={
|
|
124
|
+
snapshot = snapshot.model_copy(
|
|
125
|
+
update={
|
|
126
|
+
"snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
|
|
127
|
+
"stats": stats,
|
|
128
|
+
}
|
|
107
129
|
)
|
|
108
|
-
corpus.
|
|
109
|
-
return
|
|
130
|
+
corpus.write_snapshot(snapshot)
|
|
131
|
+
return snapshot
|
|
110
132
|
|
|
111
133
|
def query(
|
|
112
134
|
self,
|
|
113
135
|
corpus: Corpus,
|
|
114
136
|
*,
|
|
115
|
-
|
|
137
|
+
snapshot: RetrievalSnapshot,
|
|
116
138
|
query_text: str,
|
|
117
139
|
budget: QueryBudget,
|
|
118
140
|
) -> RetrievalResult:
|
|
119
141
|
"""
|
|
120
|
-
Query an embedding index
|
|
142
|
+
Query an embedding index snapshot and return ranked evidence.
|
|
121
143
|
|
|
122
|
-
:param corpus: Corpus associated with the
|
|
144
|
+
:param corpus: Corpus associated with the snapshot.
|
|
123
145
|
:type corpus: Corpus
|
|
124
|
-
:param
|
|
125
|
-
:type
|
|
146
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
147
|
+
:type snapshot: biblicus.models.RetrievalSnapshot
|
|
126
148
|
:param query_text: Query text to embed.
|
|
127
149
|
:type query_text: str
|
|
128
150
|
:param budget: Evidence selection budget.
|
|
@@ -130,14 +152,18 @@ class EmbeddingIndexInMemoryBackend:
|
|
|
130
152
|
:return: Retrieval results containing evidence.
|
|
131
153
|
:rtype: biblicus.models.RetrievalResult
|
|
132
154
|
"""
|
|
133
|
-
|
|
134
|
-
|
|
155
|
+
parsed_config = EmbeddingIndexInMemoryConfiguration.model_validate(
|
|
156
|
+
snapshot.configuration.configuration
|
|
157
|
+
)
|
|
158
|
+
extraction_reference = resolve_extraction_reference(corpus, parsed_config)
|
|
135
159
|
|
|
136
|
-
paths =
|
|
160
|
+
paths = artifact_paths_for_snapshot(
|
|
161
|
+
snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
|
|
162
|
+
)
|
|
137
163
|
embeddings_path = corpus.root / paths["embeddings"]
|
|
138
164
|
chunks_path = corpus.root / paths["chunks"]
|
|
139
165
|
if not embeddings_path.is_file() or not chunks_path.is_file():
|
|
140
|
-
raise FileNotFoundError("Embedding index artifacts are missing for this
|
|
166
|
+
raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")
|
|
141
167
|
|
|
142
168
|
embeddings = read_embeddings(embeddings_path, mmap=False).astype(np.float32)
|
|
143
169
|
chunk_records = read_chunks_jsonl(chunks_path)
|
|
@@ -147,7 +173,7 @@ class EmbeddingIndexInMemoryBackend:
|
|
|
147
173
|
"embeddings row count does not match chunk record count"
|
|
148
174
|
)
|
|
149
175
|
|
|
150
|
-
provider =
|
|
176
|
+
provider = parsed_config.embedding_provider.build_provider()
|
|
151
177
|
query_embedding = provider.embed_texts([query_text]).astype(np.float32)
|
|
152
178
|
if query_embedding.shape[0] != 1:
|
|
153
179
|
raise ValueError("Embedding provider returned an invalid query embedding shape")
|
|
@@ -159,8 +185,8 @@ class EmbeddingIndexInMemoryBackend:
|
|
|
159
185
|
)
|
|
160
186
|
evidence_items = _build_evidence(
|
|
161
187
|
corpus,
|
|
162
|
-
|
|
163
|
-
|
|
188
|
+
snapshot=snapshot,
|
|
189
|
+
configuration=parsed_config,
|
|
164
190
|
candidates=candidates,
|
|
165
191
|
scores=scores,
|
|
166
192
|
chunk_records=chunk_records,
|
|
@@ -168,7 +194,11 @@ class EmbeddingIndexInMemoryBackend:
|
|
|
168
194
|
)
|
|
169
195
|
ranked = [
|
|
170
196
|
item.model_copy(
|
|
171
|
-
update={
|
|
197
|
+
update={
|
|
198
|
+
"rank": index,
|
|
199
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
200
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
201
|
+
}
|
|
172
202
|
)
|
|
173
203
|
for index, item in enumerate(evidence_items, start=1)
|
|
174
204
|
]
|
|
@@ -176,9 +206,9 @@ class EmbeddingIndexInMemoryBackend:
|
|
|
176
206
|
return RetrievalResult(
|
|
177
207
|
query_text=query_text,
|
|
178
208
|
budget=budget,
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
209
|
+
snapshot_id=snapshot.snapshot_id,
|
|
210
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
211
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
182
212
|
generated_at=utc_now_iso(),
|
|
183
213
|
evidence=evidence,
|
|
184
214
|
stats={"candidates": len(evidence_items), "returned": len(evidence)},
|
|
@@ -201,12 +231,12 @@ def _top_indices(scores: np.ndarray, *, limit: int) -> List[int]:
|
|
|
201
231
|
def _build_evidence(
|
|
202
232
|
corpus: Corpus,
|
|
203
233
|
*,
|
|
204
|
-
|
|
205
|
-
|
|
234
|
+
snapshot: RetrievalSnapshot,
|
|
235
|
+
configuration: EmbeddingIndexInMemoryConfiguration,
|
|
206
236
|
candidates: List[int],
|
|
207
237
|
scores: np.ndarray,
|
|
208
238
|
chunk_records: List[ChunkRecord],
|
|
209
|
-
extraction_reference: Optional[
|
|
239
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
210
240
|
) -> List[Evidence]:
|
|
211
241
|
catalog = corpus.load_catalog()
|
|
212
242
|
evidence_items: List[Evidence] = []
|
|
@@ -225,9 +255,9 @@ def _build_evidence(
|
|
|
225
255
|
media_type=media_type,
|
|
226
256
|
extraction_reference=extraction_reference,
|
|
227
257
|
)
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
258
|
+
span_text = _build_snippet(text, (span_start, span_end), configuration.snippet_characters)
|
|
259
|
+
if span_text is None:
|
|
260
|
+
span_text = _extract_span_text(text, (span_start, span_end))
|
|
231
261
|
evidence_items.append(
|
|
232
262
|
Evidence(
|
|
233
263
|
item_id=item_id,
|
|
@@ -235,15 +265,16 @@ def _build_evidence(
|
|
|
235
265
|
media_type=media_type,
|
|
236
266
|
score=float(scores[idx]),
|
|
237
267
|
rank=1,
|
|
238
|
-
text=
|
|
268
|
+
text=span_text,
|
|
239
269
|
content_ref=None,
|
|
240
270
|
span_start=span_start,
|
|
241
271
|
span_end=span_end,
|
|
242
|
-
stage=
|
|
272
|
+
stage=EmbeddingIndexInMemoryRetriever.retriever_id,
|
|
243
273
|
stage_scores=None,
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
274
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
275
|
+
snapshot_id=snapshot.snapshot_id,
|
|
276
|
+
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
277
|
+
hash=hash_text(span_text or ""),
|
|
247
278
|
)
|
|
248
279
|
)
|
|
249
280
|
return evidence_items
|
|
@@ -255,7 +286,7 @@ def _load_text_for_evidence(
|
|
|
255
286
|
item_id: str,
|
|
256
287
|
relpath: str,
|
|
257
288
|
media_type: str,
|
|
258
|
-
extraction_reference: Optional[
|
|
289
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
259
290
|
) -> Optional[str]:
|
|
260
291
|
from .embedding_index_common import _load_text_from_item
|
|
261
292
|
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hybrid retriever combining lexical and vector results.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalSnapshot
|
|
13
|
+
from ..retrieval import apply_budget, create_configuration_manifest, create_snapshot_manifest
|
|
14
|
+
from ..time import utc_now_iso
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HybridConfiguration(BaseModel):
|
|
18
|
+
"""
|
|
19
|
+
Configuration for hybrid retrieval fusion.
|
|
20
|
+
|
|
21
|
+
:ivar lexical_retriever: Retriever identifier for lexical retrieval.
|
|
22
|
+
:vartype lexical_retriever: str
|
|
23
|
+
:ivar embedding_retriever: Retriever identifier for embedding retrieval.
|
|
24
|
+
:vartype embedding_retriever: str
|
|
25
|
+
:ivar lexical_weight: Weight for lexical scores.
|
|
26
|
+
:vartype lexical_weight: float
|
|
27
|
+
:ivar embedding_weight: Weight for embedding scores.
|
|
28
|
+
:vartype embedding_weight: float
|
|
29
|
+
:ivar lexical_configuration: Optional lexical retriever configuration.
|
|
30
|
+
:vartype lexical_configuration: dict[str, object]
|
|
31
|
+
:ivar embedding_configuration: Optional embedding retriever configuration.
|
|
32
|
+
:vartype embedding_configuration: dict[str, object]
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
model_config = ConfigDict(extra="forbid")
|
|
36
|
+
|
|
37
|
+
lexical_retriever: str = Field(default="sqlite-full-text-search", min_length=1)
|
|
38
|
+
embedding_retriever: str = Field(default="tf-vector", min_length=1)
|
|
39
|
+
lexical_weight: float = Field(default=0.5, ge=0, le=1)
|
|
40
|
+
embedding_weight: float = Field(default=0.5, ge=0, le=1)
|
|
41
|
+
lexical_configuration: Dict[str, object] = Field(default_factory=dict)
|
|
42
|
+
embedding_configuration: Dict[str, object] = Field(default_factory=dict)
|
|
43
|
+
|
|
44
|
+
@model_validator(mode="after")
|
|
45
|
+
def _validate_weights(self) -> "HybridConfiguration":
|
|
46
|
+
if abs((self.lexical_weight + self.embedding_weight) - 1.0) > 1e-6:
|
|
47
|
+
raise ValueError("weights must sum to 1")
|
|
48
|
+
return self
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class HybridRetriever:
|
|
52
|
+
"""
|
|
53
|
+
Hybrid retriever that fuses lexical and embedding retrieval.
|
|
54
|
+
|
|
55
|
+
:ivar retriever_id: Retriever identifier.
|
|
56
|
+
:vartype retriever_id: str
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
retriever_id = "hybrid"
|
|
60
|
+
|
|
61
|
+
def build_snapshot(
|
|
62
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
63
|
+
) -> RetrievalSnapshot:
|
|
64
|
+
"""
|
|
65
|
+
Build or register a hybrid retrieval snapshot.
|
|
66
|
+
|
|
67
|
+
:param corpus: Corpus to build against.
|
|
68
|
+
:type corpus: Corpus
|
|
69
|
+
:param configuration_name: Human-readable configuration name.
|
|
70
|
+
:type configuration_name: str
|
|
71
|
+
:param configuration: Retriever-specific configuration values.
|
|
72
|
+
:type configuration: dict[str, object]
|
|
73
|
+
:return: Snapshot manifest describing the build.
|
|
74
|
+
:rtype: RetrievalSnapshot
|
|
75
|
+
"""
|
|
76
|
+
parsed_config = HybridConfiguration.model_validate(configuration)
|
|
77
|
+
_ensure_retriever_supported(parsed_config)
|
|
78
|
+
lexical_retriever = _resolve_retriever(parsed_config.lexical_retriever)
|
|
79
|
+
embedding_retriever = _resolve_retriever(parsed_config.embedding_retriever)
|
|
80
|
+
lexical_snapshot = lexical_retriever.build_snapshot(
|
|
81
|
+
corpus,
|
|
82
|
+
configuration_name=f"{configuration_name}-lexical",
|
|
83
|
+
configuration=parsed_config.lexical_configuration,
|
|
84
|
+
)
|
|
85
|
+
embedding_snapshot = embedding_retriever.build_snapshot(
|
|
86
|
+
corpus,
|
|
87
|
+
configuration_name=f"{configuration_name}-embedding",
|
|
88
|
+
configuration=parsed_config.embedding_configuration,
|
|
89
|
+
)
|
|
90
|
+
configuration_manifest = create_configuration_manifest(
|
|
91
|
+
retriever_id=self.retriever_id,
|
|
92
|
+
name=configuration_name,
|
|
93
|
+
configuration=parsed_config.model_dump(),
|
|
94
|
+
)
|
|
95
|
+
stats = {
|
|
96
|
+
"lexical_snapshot_id": lexical_snapshot.snapshot_id,
|
|
97
|
+
"embedding_snapshot_id": embedding_snapshot.snapshot_id,
|
|
98
|
+
}
|
|
99
|
+
snapshot = create_snapshot_manifest(
|
|
100
|
+
corpus,
|
|
101
|
+
configuration=configuration_manifest,
|
|
102
|
+
stats=stats,
|
|
103
|
+
snapshot_artifacts=[],
|
|
104
|
+
)
|
|
105
|
+
corpus.write_snapshot(snapshot)
|
|
106
|
+
return snapshot
|
|
107
|
+
|
|
108
|
+
def query(
|
|
109
|
+
self,
|
|
110
|
+
corpus: Corpus,
|
|
111
|
+
*,
|
|
112
|
+
snapshot: RetrievalSnapshot,
|
|
113
|
+
query_text: str,
|
|
114
|
+
budget: QueryBudget,
|
|
115
|
+
) -> RetrievalResult:
|
|
116
|
+
"""
|
|
117
|
+
Query using both lexical and embedding retrievers and fuse scores.
|
|
118
|
+
|
|
119
|
+
:param corpus: Corpus associated with the snapshot.
|
|
120
|
+
:type corpus: Corpus
|
|
121
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
122
|
+
:type snapshot: RetrievalSnapshot
|
|
123
|
+
:param query_text: Query text to execute.
|
|
124
|
+
:type query_text: str
|
|
125
|
+
:param budget: Evidence selection budget.
|
|
126
|
+
:type budget: QueryBudget
|
|
127
|
+
:return: Retrieval results containing evidence.
|
|
128
|
+
:rtype: RetrievalResult
|
|
129
|
+
"""
|
|
130
|
+
configuration = HybridConfiguration.model_validate(snapshot.configuration.configuration)
|
|
131
|
+
_ensure_retriever_supported(configuration)
|
|
132
|
+
lexical_retriever = _resolve_retriever(configuration.lexical_retriever)
|
|
133
|
+
embedding_retriever = _resolve_retriever(configuration.embedding_retriever)
|
|
134
|
+
lexical_snapshot_id = snapshot.stats.get("lexical_snapshot_id")
|
|
135
|
+
embedding_snapshot_id = snapshot.stats.get("embedding_snapshot_id")
|
|
136
|
+
if not lexical_snapshot_id or not embedding_snapshot_id:
|
|
137
|
+
raise ValueError("Hybrid snapshot missing lexical or embedding snapshot identifiers")
|
|
138
|
+
lexical_snapshot = corpus.load_snapshot(str(lexical_snapshot_id))
|
|
139
|
+
embedding_snapshot = corpus.load_snapshot(str(embedding_snapshot_id))
|
|
140
|
+
component_budget = _expand_component_budget(budget)
|
|
141
|
+
lexical_result = lexical_retriever.query(
|
|
142
|
+
corpus, snapshot=lexical_snapshot, query_text=query_text, budget=component_budget
|
|
143
|
+
)
|
|
144
|
+
embedding_result = embedding_retriever.query(
|
|
145
|
+
corpus, snapshot=embedding_snapshot, query_text=query_text, budget=component_budget
|
|
146
|
+
)
|
|
147
|
+
candidates = _fuse_evidence(
|
|
148
|
+
lexical_result.evidence,
|
|
149
|
+
embedding_result.evidence,
|
|
150
|
+
lexical_weight=configuration.lexical_weight,
|
|
151
|
+
embedding_weight=configuration.embedding_weight,
|
|
152
|
+
)
|
|
153
|
+
sorted_candidates = sorted(
|
|
154
|
+
candidates,
|
|
155
|
+
key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
|
|
156
|
+
)
|
|
157
|
+
ranked = [
|
|
158
|
+
evidence_item.model_copy(
|
|
159
|
+
update={
|
|
160
|
+
"rank": index,
|
|
161
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
162
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
163
|
+
}
|
|
164
|
+
)
|
|
165
|
+
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
166
|
+
]
|
|
167
|
+
evidence = apply_budget(ranked, budget)
|
|
168
|
+
stats = {
|
|
169
|
+
"candidates": len(sorted_candidates),
|
|
170
|
+
"returned": len(evidence),
|
|
171
|
+
"fusion_weights": {
|
|
172
|
+
"lexical": configuration.lexical_weight,
|
|
173
|
+
"embedding": configuration.embedding_weight,
|
|
174
|
+
},
|
|
175
|
+
}
|
|
176
|
+
return RetrievalResult(
|
|
177
|
+
query_text=query_text,
|
|
178
|
+
budget=budget,
|
|
179
|
+
snapshot_id=snapshot.snapshot_id,
|
|
180
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
181
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
182
|
+
generated_at=utc_now_iso(),
|
|
183
|
+
evidence=evidence,
|
|
184
|
+
stats=stats,
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _ensure_retriever_supported(configuration: HybridConfiguration) -> None:
|
|
189
|
+
"""
|
|
190
|
+
Validate that hybrid retrievers do not reference the hybrid retriever itself.
|
|
191
|
+
|
|
192
|
+
:param configuration: Parsed hybrid configuration.
|
|
193
|
+
:type configuration: HybridConfiguration
|
|
194
|
+
:return: None.
|
|
195
|
+
:rtype: None
|
|
196
|
+
:raises ValueError: If hybrid is used as a component retriever.
|
|
197
|
+
"""
|
|
198
|
+
if configuration.lexical_retriever == HybridRetriever.retriever_id:
|
|
199
|
+
raise ValueError("Hybrid retriever cannot use itself as the lexical retriever")
|
|
200
|
+
if configuration.embedding_retriever == HybridRetriever.retriever_id:
|
|
201
|
+
raise ValueError("Hybrid retriever cannot use itself as the embedding retriever")
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _resolve_retriever(retriever_id: str):
|
|
205
|
+
"""
|
|
206
|
+
Resolve a retriever by identifier.
|
|
207
|
+
|
|
208
|
+
:param retriever_id: Retriever identifier.
|
|
209
|
+
:type retriever_id: str
|
|
210
|
+
:return: Retriever instance.
|
|
211
|
+
:rtype: object
|
|
212
|
+
"""
|
|
213
|
+
from biblicus.retrievers import get_retriever # Delayed import to avoid circular import
|
|
214
|
+
|
|
215
|
+
return get_retriever(retriever_id)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
|
|
219
|
+
"""
|
|
220
|
+
Expand a final budget to collect more candidates for fusion.
|
|
221
|
+
|
|
222
|
+
:param budget: Final evidence budget.
|
|
223
|
+
:type budget: QueryBudget
|
|
224
|
+
:param multiplier: Candidate expansion multiplier.
|
|
225
|
+
:type multiplier: int
|
|
226
|
+
:return: Expanded budget for component retrievers.
|
|
227
|
+
:rtype: QueryBudget
|
|
228
|
+
"""
|
|
229
|
+
maximum_total_characters = budget.maximum_total_characters
|
|
230
|
+
expanded_characters = (
|
|
231
|
+
maximum_total_characters * multiplier if maximum_total_characters is not None else None
|
|
232
|
+
)
|
|
233
|
+
expanded_max_items_per_source = (
|
|
234
|
+
budget.max_items_per_source * multiplier
|
|
235
|
+
if budget.max_items_per_source is not None
|
|
236
|
+
else None
|
|
237
|
+
)
|
|
238
|
+
requested_items = budget.max_total_items + budget.offset
|
|
239
|
+
return QueryBudget(
|
|
240
|
+
max_total_items=requested_items * multiplier,
|
|
241
|
+
offset=0,
|
|
242
|
+
maximum_total_characters=expanded_characters,
|
|
243
|
+
max_items_per_source=expanded_max_items_per_source,
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _fuse_evidence(
|
|
248
|
+
lexical: List[Evidence],
|
|
249
|
+
embedding: List[Evidence],
|
|
250
|
+
*,
|
|
251
|
+
lexical_weight: float,
|
|
252
|
+
embedding_weight: float,
|
|
253
|
+
) -> List[Evidence]:
|
|
254
|
+
"""
|
|
255
|
+
Fuse lexical and embedding evidence lists into hybrid candidates.
|
|
256
|
+
|
|
257
|
+
:param lexical: Lexical evidence list.
|
|
258
|
+
:type lexical: list[Evidence]
|
|
259
|
+
:param embedding: Embedding evidence list.
|
|
260
|
+
:type embedding: list[Evidence]
|
|
261
|
+
:param lexical_weight: Lexical score weight.
|
|
262
|
+
:type lexical_weight: float
|
|
263
|
+
:param embedding_weight: Embedding score weight.
|
|
264
|
+
:type embedding_weight: float
|
|
265
|
+
:return: Hybrid evidence list.
|
|
266
|
+
:rtype: list[Evidence]
|
|
267
|
+
"""
|
|
268
|
+
merged: Dict[str, Dict[str, Optional[Evidence]]] = {}
|
|
269
|
+
for evidence_item in lexical:
|
|
270
|
+
merged.setdefault(evidence_item.item_id, {})["lexical"] = evidence_item
|
|
271
|
+
for evidence_item in embedding:
|
|
272
|
+
merged.setdefault(evidence_item.item_id, {})["embedding"] = evidence_item
|
|
273
|
+
|
|
274
|
+
candidates: List[Evidence] = []
|
|
275
|
+
for item_id, sources in merged.items():
|
|
276
|
+
lexical_evidence = sources.get("lexical")
|
|
277
|
+
embedding_evidence = sources.get("embedding")
|
|
278
|
+
lexical_score = lexical_evidence.score if lexical_evidence else 0.0
|
|
279
|
+
embedding_score = embedding_evidence.score if embedding_evidence else 0.0
|
|
280
|
+
combined_score = (lexical_score * lexical_weight) + (embedding_score * embedding_weight)
|
|
281
|
+
base_evidence = lexical_evidence or embedding_evidence
|
|
282
|
+
candidates.append(
|
|
283
|
+
Evidence(
|
|
284
|
+
item_id=item_id,
|
|
285
|
+
source_uri=base_evidence.source_uri,
|
|
286
|
+
media_type=base_evidence.media_type,
|
|
287
|
+
score=combined_score,
|
|
288
|
+
rank=1,
|
|
289
|
+
text=base_evidence.text,
|
|
290
|
+
content_ref=base_evidence.content_ref,
|
|
291
|
+
span_start=base_evidence.span_start,
|
|
292
|
+
span_end=base_evidence.span_end,
|
|
293
|
+
stage="hybrid",
|
|
294
|
+
stage_scores={"lexical": lexical_score, "embedding": embedding_score},
|
|
295
|
+
configuration_id="",
|
|
296
|
+
snapshot_id="",
|
|
297
|
+
metadata=base_evidence.metadata,
|
|
298
|
+
hash=base_evidence.hash,
|
|
299
|
+
)
|
|
300
|
+
)
|
|
301
|
+
return candidates
|