biblicus 0.16.0 (py3-none-any.whl) → 1.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- Embedding-index retrieval backend that loads the full embedding matrix into memory at query time.
2
+ Embedding-index retriever that loads the full embedding matrix into memory at query time.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -10,13 +10,26 @@ import numpy as np
10
10
  from pydantic import ConfigDict, Field
11
11
 
12
12
  from ..corpus import Corpus
13
- from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
14
- from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
13
+ from ..models import (
14
+ Evidence,
15
+ ExtractionSnapshotReference,
16
+ QueryBudget,
17
+ RetrievalResult,
18
+ RetrievalSnapshot,
19
+ )
20
+ from ..retrieval import (
21
+ apply_budget,
22
+ create_configuration_manifest,
23
+ create_snapshot_manifest,
24
+ hash_text,
25
+ )
15
26
  from ..time import utc_now_iso
16
27
  from .embedding_index_common import (
17
28
  ChunkRecord,
18
- EmbeddingIndexRecipeConfig,
19
- artifact_paths_for_run,
29
+ EmbeddingIndexConfiguration,
30
+ _build_snippet,
31
+ _extract_span_text,
32
+ artifact_paths_for_snapshot,
20
33
  chunks_to_records,
21
34
  collect_chunks,
22
35
  cosine_similarity_scores,
@@ -26,68 +39,74 @@ from .embedding_index_common import (
26
39
  write_chunks_jsonl,
27
40
  write_embeddings,
28
41
  )
29
- from .scan import _build_snippet
30
42
 
31
43
 
32
- class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
44
+ class EmbeddingIndexInMemoryConfiguration(EmbeddingIndexConfiguration):
33
45
  """
34
46
  Configuration for embedding-index-inmemory retrieval.
35
47
 
36
- :ivar max_chunks: Maximum chunks allowed for in-memory query loading.
37
- :vartype max_chunks: int
48
+ :ivar maximum_cache_total_items: Maximum chunks allowed for in-memory query loading.
49
+ :vartype maximum_cache_total_items: int
38
50
  """
39
51
 
40
52
  model_config = ConfigDict(extra="forbid")
41
53
 
42
- max_chunks: int = Field(default=25000, ge=1)
54
+ maximum_cache_total_items: int = Field(default=25000, ge=1)
43
55
 
44
56
 
45
- class EmbeddingIndexInMemoryBackend:
57
+ class EmbeddingIndexInMemoryRetriever:
46
58
  """
47
- Embedding retrieval backend using an in-memory similarity scan.
59
+ Embedding retrieval retriever using an in-memory similarity scan.
48
60
  """
49
61
 
50
- backend_id = "embedding-index-inmemory"
62
+ retriever_id = "embedding-index-inmemory"
51
63
 
52
- def build_run(
53
- self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
54
- ) -> RetrievalRun:
64
+ def build_snapshot(
65
+ self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
66
+ ) -> RetrievalSnapshot:
55
67
  """
56
- Build an embedding index run by chunking text payloads and materializing embeddings.
68
+ Build an embedding index snapshot by chunking text payloads and materializing embeddings.
57
69
 
58
70
  :param corpus: Corpus to build against.
59
71
  :type corpus: Corpus
60
- :param recipe_name: Human-readable recipe name.
61
- :type recipe_name: str
62
- :param config: Backend-specific configuration values.
63
- :type config: dict[str, object]
64
- :return: Run manifest describing the build.
65
- :rtype: biblicus.models.RetrievalRun
72
+ :param configuration_name: Human-readable configuration name.
73
+ :type configuration_name: str
74
+ :param configuration: Retriever-specific configuration values.
75
+ :type configuration: dict[str, object]
76
+ :return: Snapshot manifest describing the build.
77
+ :rtype: biblicus.models.RetrievalSnapshot
66
78
  """
67
- recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(config)
68
- chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
69
- if len(chunks) > recipe_config.max_chunks:
79
+ parsed_config = EmbeddingIndexInMemoryConfiguration.model_validate(configuration)
80
+ chunks, text_items = collect_chunks(corpus, configuration=parsed_config)
81
+ if len(chunks) > parsed_config.maximum_cache_total_items:
70
82
  raise ValueError(
71
- "embedding-index-inmemory exceeded max_chunks. "
72
- "Use embedding-index-file or increase max_chunks."
83
+ "embedding-index-inmemory exceeded maximum_cache_total_items. "
84
+ "Use embedding-index-file or increase maximum_cache_total_items."
73
85
  )
74
86
 
75
- provider = recipe_config.embedding_provider.build_provider()
87
+ provider = parsed_config.embedding_provider.build_provider()
76
88
  chunk_texts = [chunk.text for chunk in chunks]
77
89
  embeddings = provider.embed_texts(chunk_texts)
78
90
  embeddings = embeddings.astype(np.float32)
79
91
 
80
- recipe = create_recipe_manifest(
81
- backend_id=self.backend_id,
82
- name=recipe_name,
83
- config=recipe_config.model_dump(),
92
+ configuration_manifest = create_configuration_manifest(
93
+ retriever_id=self.retriever_id,
94
+ name=configuration_name,
95
+ configuration=parsed_config.model_dump(),
96
+ )
97
+ snapshot = create_snapshot_manifest(
98
+ corpus,
99
+ configuration=configuration_manifest,
100
+ stats={},
101
+ snapshot_artifacts=[],
84
102
  )
85
- run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
86
103
 
87
- paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
104
+ paths = artifact_paths_for_snapshot(
105
+ snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
106
+ )
88
107
  embeddings_path = corpus.root / paths["embeddings"]
89
108
  chunks_path = corpus.root / paths["chunks"]
90
- corpus.runs_dir.mkdir(parents=True, exist_ok=True)
109
+ corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
91
110
 
92
111
  write_embeddings(embeddings_path, embeddings)
93
112
  write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
@@ -99,30 +118,33 @@ class EmbeddingIndexInMemoryBackend:
99
118
  "dimensions": (
100
119
  int(embeddings.shape[1])
101
120
  if embeddings.size
102
- else recipe_config.embedding_provider.dimensions
121
+ else parsed_config.embedding_provider.dimensions
103
122
  ),
104
123
  }
105
- run = run.model_copy(
106
- update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
124
+ snapshot = snapshot.model_copy(
125
+ update={
126
+ "snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
127
+ "stats": stats,
128
+ }
107
129
  )
108
- corpus.write_run(run)
109
- return run
130
+ corpus.write_snapshot(snapshot)
131
+ return snapshot
110
132
 
111
133
  def query(
112
134
  self,
113
135
  corpus: Corpus,
114
136
  *,
115
- run: RetrievalRun,
137
+ snapshot: RetrievalSnapshot,
116
138
  query_text: str,
117
139
  budget: QueryBudget,
118
140
  ) -> RetrievalResult:
119
141
  """
120
- Query an embedding index run and return ranked evidence.
142
+ Query an embedding index snapshot and return ranked evidence.
121
143
 
122
- :param corpus: Corpus associated with the run.
144
+ :param corpus: Corpus associated with the snapshot.
123
145
  :type corpus: Corpus
124
- :param run: Run manifest to use for querying.
125
- :type run: biblicus.models.RetrievalRun
146
+ :param snapshot: Snapshot manifest to use for querying.
147
+ :type snapshot: biblicus.models.RetrievalSnapshot
126
148
  :param query_text: Query text to embed.
127
149
  :type query_text: str
128
150
  :param budget: Evidence selection budget.
@@ -130,14 +152,18 @@ class EmbeddingIndexInMemoryBackend:
130
152
  :return: Retrieval results containing evidence.
131
153
  :rtype: biblicus.models.RetrievalResult
132
154
  """
133
- recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(run.recipe.config)
134
- extraction_reference = resolve_extraction_reference(corpus, recipe_config)
155
+ parsed_config = EmbeddingIndexInMemoryConfiguration.model_validate(
156
+ snapshot.configuration.configuration
157
+ )
158
+ extraction_reference = resolve_extraction_reference(corpus, parsed_config)
135
159
 
136
- paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
160
+ paths = artifact_paths_for_snapshot(
161
+ snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
162
+ )
137
163
  embeddings_path = corpus.root / paths["embeddings"]
138
164
  chunks_path = corpus.root / paths["chunks"]
139
165
  if not embeddings_path.is_file() or not chunks_path.is_file():
140
- raise FileNotFoundError("Embedding index artifacts are missing for this run")
166
+ raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")
141
167
 
142
168
  embeddings = read_embeddings(embeddings_path, mmap=False).astype(np.float32)
143
169
  chunk_records = read_chunks_jsonl(chunks_path)
@@ -147,7 +173,7 @@ class EmbeddingIndexInMemoryBackend:
147
173
  "embeddings row count does not match chunk record count"
148
174
  )
149
175
 
150
- provider = recipe_config.embedding_provider.build_provider()
176
+ provider = parsed_config.embedding_provider.build_provider()
151
177
  query_embedding = provider.embed_texts([query_text]).astype(np.float32)
152
178
  if query_embedding.shape[0] != 1:
153
179
  raise ValueError("Embedding provider returned an invalid query embedding shape")
@@ -159,8 +185,8 @@ class EmbeddingIndexInMemoryBackend:
159
185
  )
160
186
  evidence_items = _build_evidence(
161
187
  corpus,
162
- run=run,
163
- recipe_config=recipe_config,
188
+ snapshot=snapshot,
189
+ configuration=parsed_config,
164
190
  candidates=candidates,
165
191
  scores=scores,
166
192
  chunk_records=chunk_records,
@@ -168,7 +194,11 @@ class EmbeddingIndexInMemoryBackend:
168
194
  )
169
195
  ranked = [
170
196
  item.model_copy(
171
- update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
197
+ update={
198
+ "rank": index,
199
+ "configuration_id": snapshot.configuration.configuration_id,
200
+ "snapshot_id": snapshot.snapshot_id,
201
+ }
172
202
  )
173
203
  for index, item in enumerate(evidence_items, start=1)
174
204
  ]
@@ -176,9 +206,9 @@ class EmbeddingIndexInMemoryBackend:
176
206
  return RetrievalResult(
177
207
  query_text=query_text,
178
208
  budget=budget,
179
- run_id=run.run_id,
180
- recipe_id=run.recipe.recipe_id,
181
- backend_id=self.backend_id,
209
+ snapshot_id=snapshot.snapshot_id,
210
+ configuration_id=snapshot.configuration.configuration_id,
211
+ retriever_id=snapshot.configuration.retriever_id,
182
212
  generated_at=utc_now_iso(),
183
213
  evidence=evidence,
184
214
  stats={"candidates": len(evidence_items), "returned": len(evidence)},
@@ -201,12 +231,12 @@ def _top_indices(scores: np.ndarray, *, limit: int) -> List[int]:
201
231
  def _build_evidence(
202
232
  corpus: Corpus,
203
233
  *,
204
- run: RetrievalRun,
205
- recipe_config: EmbeddingIndexInMemoryRecipeConfig,
234
+ snapshot: RetrievalSnapshot,
235
+ configuration: EmbeddingIndexInMemoryConfiguration,
206
236
  candidates: List[int],
207
237
  scores: np.ndarray,
208
238
  chunk_records: List[ChunkRecord],
209
- extraction_reference: Optional[ExtractionRunReference],
239
+ extraction_reference: Optional[ExtractionSnapshotReference],
210
240
  ) -> List[Evidence]:
211
241
  catalog = corpus.load_catalog()
212
242
  evidence_items: List[Evidence] = []
@@ -225,9 +255,9 @@ def _build_evidence(
225
255
  media_type=media_type,
226
256
  extraction_reference=extraction_reference,
227
257
  )
228
- snippet = _build_snippet(
229
- text, (span_start, span_end), max_chars=recipe_config.snippet_characters
230
- )
258
+ span_text = _build_snippet(text, (span_start, span_end), configuration.snippet_characters)
259
+ if span_text is None:
260
+ span_text = _extract_span_text(text, (span_start, span_end))
231
261
  evidence_items.append(
232
262
  Evidence(
233
263
  item_id=item_id,
@@ -235,15 +265,16 @@ def _build_evidence(
235
265
  media_type=media_type,
236
266
  score=float(scores[idx]),
237
267
  rank=1,
238
- text=snippet,
268
+ text=span_text,
239
269
  content_ref=None,
240
270
  span_start=span_start,
241
271
  span_end=span_end,
242
- stage=EmbeddingIndexInMemoryBackend.backend_id,
272
+ stage=EmbeddingIndexInMemoryRetriever.retriever_id,
243
273
  stage_scores=None,
244
- recipe_id=run.recipe.recipe_id,
245
- run_id=run.run_id,
246
- hash=hash_text(snippet),
274
+ configuration_id=snapshot.configuration.configuration_id,
275
+ snapshot_id=snapshot.snapshot_id,
276
+ metadata=getattr(catalog_item, "metadata", {}) or {},
277
+ hash=hash_text(span_text or ""),
247
278
  )
248
279
  )
249
280
  return evidence_items
@@ -255,7 +286,7 @@ def _load_text_for_evidence(
255
286
  item_id: str,
256
287
  relpath: str,
257
288
  media_type: str,
258
- extraction_reference: Optional[ExtractionRunReference],
289
+ extraction_reference: Optional[ExtractionSnapshotReference],
259
290
  ) -> Optional[str]:
260
291
  from .embedding_index_common import _load_text_from_item
261
292
 
@@ -0,0 +1,301 @@
1
+ """
2
+ Hybrid retriever combining lexical and vector results.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
10
+
11
+ from ..corpus import Corpus
12
+ from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalSnapshot
13
+ from ..retrieval import apply_budget, create_configuration_manifest, create_snapshot_manifest
14
+ from ..time import utc_now_iso
15
+
16
+
17
+ class HybridConfiguration(BaseModel):
18
+ """
19
+ Configuration for hybrid retrieval fusion.
20
+
21
+ :ivar lexical_retriever: Retriever identifier for lexical retrieval.
22
+ :vartype lexical_retriever: str
23
+ :ivar embedding_retriever: Retriever identifier for embedding retrieval.
24
+ :vartype embedding_retriever: str
25
+ :ivar lexical_weight: Weight for lexical scores.
26
+ :vartype lexical_weight: float
27
+ :ivar embedding_weight: Weight for embedding scores.
28
+ :vartype embedding_weight: float
29
+ :ivar lexical_configuration: Optional lexical retriever configuration.
30
+ :vartype lexical_configuration: dict[str, object]
31
+ :ivar embedding_configuration: Optional embedding retriever configuration.
32
+ :vartype embedding_configuration: dict[str, object]
33
+ """
34
+
35
+ model_config = ConfigDict(extra="forbid")
36
+
37
+ lexical_retriever: str = Field(default="sqlite-full-text-search", min_length=1)
38
+ embedding_retriever: str = Field(default="tf-vector", min_length=1)
39
+ lexical_weight: float = Field(default=0.5, ge=0, le=1)
40
+ embedding_weight: float = Field(default=0.5, ge=0, le=1)
41
+ lexical_configuration: Dict[str, object] = Field(default_factory=dict)
42
+ embedding_configuration: Dict[str, object] = Field(default_factory=dict)
43
+
44
+ @model_validator(mode="after")
45
+ def _validate_weights(self) -> "HybridConfiguration":
46
+ if abs((self.lexical_weight + self.embedding_weight) - 1.0) > 1e-6:
47
+ raise ValueError("weights must sum to 1")
48
+ return self
49
+
50
+
51
+ class HybridRetriever:
52
+ """
53
+ Hybrid retriever that fuses lexical and embedding retrieval.
54
+
55
+ :ivar retriever_id: Retriever identifier.
56
+ :vartype retriever_id: str
57
+ """
58
+
59
+ retriever_id = "hybrid"
60
+
61
+ def build_snapshot(
62
+ self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
63
+ ) -> RetrievalSnapshot:
64
+ """
65
+ Build or register a hybrid retrieval snapshot.
66
+
67
+ :param corpus: Corpus to build against.
68
+ :type corpus: Corpus
69
+ :param configuration_name: Human-readable configuration name.
70
+ :type configuration_name: str
71
+ :param configuration: Retriever-specific configuration values.
72
+ :type configuration: dict[str, object]
73
+ :return: Snapshot manifest describing the build.
74
+ :rtype: RetrievalSnapshot
75
+ """
76
+ parsed_config = HybridConfiguration.model_validate(configuration)
77
+ _ensure_retriever_supported(parsed_config)
78
+ lexical_retriever = _resolve_retriever(parsed_config.lexical_retriever)
79
+ embedding_retriever = _resolve_retriever(parsed_config.embedding_retriever)
80
+ lexical_snapshot = lexical_retriever.build_snapshot(
81
+ corpus,
82
+ configuration_name=f"{configuration_name}-lexical",
83
+ configuration=parsed_config.lexical_configuration,
84
+ )
85
+ embedding_snapshot = embedding_retriever.build_snapshot(
86
+ corpus,
87
+ configuration_name=f"{configuration_name}-embedding",
88
+ configuration=parsed_config.embedding_configuration,
89
+ )
90
+ configuration_manifest = create_configuration_manifest(
91
+ retriever_id=self.retriever_id,
92
+ name=configuration_name,
93
+ configuration=parsed_config.model_dump(),
94
+ )
95
+ stats = {
96
+ "lexical_snapshot_id": lexical_snapshot.snapshot_id,
97
+ "embedding_snapshot_id": embedding_snapshot.snapshot_id,
98
+ }
99
+ snapshot = create_snapshot_manifest(
100
+ corpus,
101
+ configuration=configuration_manifest,
102
+ stats=stats,
103
+ snapshot_artifacts=[],
104
+ )
105
+ corpus.write_snapshot(snapshot)
106
+ return snapshot
107
+
108
+ def query(
109
+ self,
110
+ corpus: Corpus,
111
+ *,
112
+ snapshot: RetrievalSnapshot,
113
+ query_text: str,
114
+ budget: QueryBudget,
115
+ ) -> RetrievalResult:
116
+ """
117
+ Query using both lexical and embedding retrievers and fuse scores.
118
+
119
+ :param corpus: Corpus associated with the snapshot.
120
+ :type corpus: Corpus
121
+ :param snapshot: Snapshot manifest to use for querying.
122
+ :type snapshot: RetrievalSnapshot
123
+ :param query_text: Query text to execute.
124
+ :type query_text: str
125
+ :param budget: Evidence selection budget.
126
+ :type budget: QueryBudget
127
+ :return: Retrieval results containing evidence.
128
+ :rtype: RetrievalResult
129
+ """
130
+ configuration = HybridConfiguration.model_validate(snapshot.configuration.configuration)
131
+ _ensure_retriever_supported(configuration)
132
+ lexical_retriever = _resolve_retriever(configuration.lexical_retriever)
133
+ embedding_retriever = _resolve_retriever(configuration.embedding_retriever)
134
+ lexical_snapshot_id = snapshot.stats.get("lexical_snapshot_id")
135
+ embedding_snapshot_id = snapshot.stats.get("embedding_snapshot_id")
136
+ if not lexical_snapshot_id or not embedding_snapshot_id:
137
+ raise ValueError("Hybrid snapshot missing lexical or embedding snapshot identifiers")
138
+ lexical_snapshot = corpus.load_snapshot(str(lexical_snapshot_id))
139
+ embedding_snapshot = corpus.load_snapshot(str(embedding_snapshot_id))
140
+ component_budget = _expand_component_budget(budget)
141
+ lexical_result = lexical_retriever.query(
142
+ corpus, snapshot=lexical_snapshot, query_text=query_text, budget=component_budget
143
+ )
144
+ embedding_result = embedding_retriever.query(
145
+ corpus, snapshot=embedding_snapshot, query_text=query_text, budget=component_budget
146
+ )
147
+ candidates = _fuse_evidence(
148
+ lexical_result.evidence,
149
+ embedding_result.evidence,
150
+ lexical_weight=configuration.lexical_weight,
151
+ embedding_weight=configuration.embedding_weight,
152
+ )
153
+ sorted_candidates = sorted(
154
+ candidates,
155
+ key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
156
+ )
157
+ ranked = [
158
+ evidence_item.model_copy(
159
+ update={
160
+ "rank": index,
161
+ "configuration_id": snapshot.configuration.configuration_id,
162
+ "snapshot_id": snapshot.snapshot_id,
163
+ }
164
+ )
165
+ for index, evidence_item in enumerate(sorted_candidates, start=1)
166
+ ]
167
+ evidence = apply_budget(ranked, budget)
168
+ stats = {
169
+ "candidates": len(sorted_candidates),
170
+ "returned": len(evidence),
171
+ "fusion_weights": {
172
+ "lexical": configuration.lexical_weight,
173
+ "embedding": configuration.embedding_weight,
174
+ },
175
+ }
176
+ return RetrievalResult(
177
+ query_text=query_text,
178
+ budget=budget,
179
+ snapshot_id=snapshot.snapshot_id,
180
+ configuration_id=snapshot.configuration.configuration_id,
181
+ retriever_id=snapshot.configuration.retriever_id,
182
+ generated_at=utc_now_iso(),
183
+ evidence=evidence,
184
+ stats=stats,
185
+ )
186
+
187
+
188
+ def _ensure_retriever_supported(configuration: HybridConfiguration) -> None:
189
+ """
190
+ Validate that hybrid retrievers do not reference the hybrid retriever itself.
191
+
192
+ :param configuration: Parsed hybrid configuration.
193
+ :type configuration: HybridConfiguration
194
+ :return: None.
195
+ :rtype: None
196
+ :raises ValueError: If hybrid is used as a component retriever.
197
+ """
198
+ if configuration.lexical_retriever == HybridRetriever.retriever_id:
199
+ raise ValueError("Hybrid retriever cannot use itself as the lexical retriever")
200
+ if configuration.embedding_retriever == HybridRetriever.retriever_id:
201
+ raise ValueError("Hybrid retriever cannot use itself as the embedding retriever")
202
+
203
+
204
+ def _resolve_retriever(retriever_id: str):
205
+ """
206
+ Resolve a retriever by identifier.
207
+
208
+ :param retriever_id: Retriever identifier.
209
+ :type retriever_id: str
210
+ :return: Retriever instance.
211
+ :rtype: object
212
+ """
213
+ from biblicus.retrievers import get_retriever # Delayed import to avoid circular import
214
+
215
+ return get_retriever(retriever_id)
216
+
217
+
218
+ def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
219
+ """
220
+ Expand a final budget to collect more candidates for fusion.
221
+
222
+ :param budget: Final evidence budget.
223
+ :type budget: QueryBudget
224
+ :param multiplier: Candidate expansion multiplier.
225
+ :type multiplier: int
226
+ :return: Expanded budget for component retrievers.
227
+ :rtype: QueryBudget
228
+ """
229
+ maximum_total_characters = budget.maximum_total_characters
230
+ expanded_characters = (
231
+ maximum_total_characters * multiplier if maximum_total_characters is not None else None
232
+ )
233
+ expanded_max_items_per_source = (
234
+ budget.max_items_per_source * multiplier
235
+ if budget.max_items_per_source is not None
236
+ else None
237
+ )
238
+ requested_items = budget.max_total_items + budget.offset
239
+ return QueryBudget(
240
+ max_total_items=requested_items * multiplier,
241
+ offset=0,
242
+ maximum_total_characters=expanded_characters,
243
+ max_items_per_source=expanded_max_items_per_source,
244
+ )
245
+
246
+
247
+ def _fuse_evidence(
248
+ lexical: List[Evidence],
249
+ embedding: List[Evidence],
250
+ *,
251
+ lexical_weight: float,
252
+ embedding_weight: float,
253
+ ) -> List[Evidence]:
254
+ """
255
+ Fuse lexical and embedding evidence lists into hybrid candidates.
256
+
257
+ :param lexical: Lexical evidence list.
258
+ :type lexical: list[Evidence]
259
+ :param embedding: Embedding evidence list.
260
+ :type embedding: list[Evidence]
261
+ :param lexical_weight: Lexical score weight.
262
+ :type lexical_weight: float
263
+ :param embedding_weight: Embedding score weight.
264
+ :type embedding_weight: float
265
+ :return: Hybrid evidence list.
266
+ :rtype: list[Evidence]
267
+ """
268
+ merged: Dict[str, Dict[str, Optional[Evidence]]] = {}
269
+ for evidence_item in lexical:
270
+ merged.setdefault(evidence_item.item_id, {})["lexical"] = evidence_item
271
+ for evidence_item in embedding:
272
+ merged.setdefault(evidence_item.item_id, {})["embedding"] = evidence_item
273
+
274
+ candidates: List[Evidence] = []
275
+ for item_id, sources in merged.items():
276
+ lexical_evidence = sources.get("lexical")
277
+ embedding_evidence = sources.get("embedding")
278
+ lexical_score = lexical_evidence.score if lexical_evidence else 0.0
279
+ embedding_score = embedding_evidence.score if embedding_evidence else 0.0
280
+ combined_score = (lexical_score * lexical_weight) + (embedding_score * embedding_weight)
281
+ base_evidence = lexical_evidence or embedding_evidence
282
+ candidates.append(
283
+ Evidence(
284
+ item_id=item_id,
285
+ source_uri=base_evidence.source_uri,
286
+ media_type=base_evidence.media_type,
287
+ score=combined_score,
288
+ rank=1,
289
+ text=base_evidence.text,
290
+ content_ref=base_evidence.content_ref,
291
+ span_start=base_evidence.span_start,
292
+ span_end=base_evidence.span_end,
293
+ stage="hybrid",
294
+ stage_scores={"lexical": lexical_score, "embedding": embedding_score},
295
+ configuration_id="",
296
+ snapshot_id="",
297
+ metadata=base_evidence.metadata,
298
+ hash=base_evidence.hash,
299
+ )
300
+ )
301
+ return candidates