biblicus 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,8 +7,10 @@ from __future__ import annotations
7
7
  from typing import Dict, Type
8
8
 
9
9
  from .base import RetrievalBackend
10
+ from .hybrid import HybridBackend
10
11
  from .scan import ScanBackend
11
12
  from .sqlite_full_text_search import SqliteFullTextSearchBackend
13
+ from .vector import VectorBackend
12
14
 
13
15
 
14
16
  def available_backends() -> Dict[str, Type[RetrievalBackend]]:
@@ -19,8 +21,10 @@ def available_backends() -> Dict[str, Type[RetrievalBackend]]:
19
21
  :rtype: dict[str, Type[RetrievalBackend]]
20
22
  """
21
23
  return {
24
+ HybridBackend.backend_id: HybridBackend,
22
25
  ScanBackend.backend_id: ScanBackend,
23
26
  SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
27
+ VectorBackend.backend_id: VectorBackend,
24
28
  }
25
29
 
26
30
 
@@ -0,0 +1,284 @@
1
+ """
2
+ Hybrid retrieval backend combining lexical and vector results.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
10
+
11
+ from ..corpus import Corpus
12
+ from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
13
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest
14
+ from ..time import utc_now_iso
15
+
16
+
17
class HybridRecipeConfig(BaseModel):
    """
    Configuration for hybrid retrieval fusion.

    :ivar lexical_backend: Backend identifier for lexical retrieval.
    :vartype lexical_backend: str
    :ivar embedding_backend: Backend identifier for embedding retrieval.
    :vartype embedding_backend: str
    :ivar lexical_weight: Weight for lexical scores; together with
        ``embedding_weight`` it must sum to 1.
    :vartype lexical_weight: float
    :ivar embedding_weight: Weight for embedding scores; together with
        ``lexical_weight`` it must sum to 1.
    :vartype embedding_weight: float
    :ivar lexical_config: Optional lexical backend configuration.
    :vartype lexical_config: dict[str, object]
    :ivar embedding_config: Optional embedding backend configuration.
    :vartype embedding_config: dict[str, object]
    """

    model_config = ConfigDict(extra="forbid")

    lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
    embedding_backend: str = Field(default="vector", min_length=1)
    lexical_weight: float = Field(default=0.5, ge=0, le=1)
    embedding_weight: float = Field(default=0.5, ge=0, le=1)
    lexical_config: Dict[str, object] = Field(default_factory=dict)
    embedding_config: Dict[str, object] = Field(default_factory=dict)

    @model_validator(mode="after")
    def _validate_weights(self) -> "HybridRecipeConfig":
        # The two weights must form a convex combination; allow a tiny
        # tolerance for floating-point drift in user-supplied values.
        drift = (self.lexical_weight + self.embedding_weight) - 1.0
        if not (-1e-6 <= drift <= 1e-6):
            raise ValueError("weights must sum to 1")
        return self
49
+
50
+
51
class HybridBackend:
    """
    Hybrid backend that fuses lexical and embedding retrieval.

    :ivar backend_id: Backend identifier.
    :vartype backend_id: str
    """

    backend_id = "hybrid"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Build or register a hybrid retrieval run.

        Builds one component run per configured backend, then records a
        hybrid run whose stats carry both component run identifiers so
        :meth:`query` can load them later.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: RetrievalRun
        """
        parsed = HybridRecipeConfig.model_validate(config)
        _ensure_backend_supported(parsed)
        lexical = _resolve_backend(parsed.lexical_backend)
        embedding = _resolve_backend(parsed.embedding_backend)
        lexical_component = lexical.build_run(
            corpus, recipe_name=f"{recipe_name}-lexical", config=parsed.lexical_config
        )
        embedding_component = embedding.build_run(
            corpus, recipe_name=f"{recipe_name}-embedding", config=parsed.embedding_config
        )
        manifest = create_run_manifest(
            corpus,
            recipe=create_recipe_manifest(
                backend_id=self.backend_id,
                name=recipe_name,
                config=parsed.model_dump(),
            ),
            stats={
                "lexical_run_id": lexical_component.run_id,
                "embedding_run_id": embedding_component.run_id,
            },
            artifact_paths=[],
        )
        corpus.write_run(manifest)
        return manifest

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query using both lexical and embedding backends and fuse scores.

        Both component backends are queried with an expanded budget, the
        evidence is fused by weighted score, re-ranked, and finally trimmed
        to the caller's budget.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: RetrievalRun
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        :raises ValueError: If the run lacks component run identifiers.
        """
        parsed = HybridRecipeConfig.model_validate(run.recipe.config)
        _ensure_backend_supported(parsed)
        lexical = _resolve_backend(parsed.lexical_backend)
        embedding = _resolve_backend(parsed.embedding_backend)
        lexical_id = run.stats.get("lexical_run_id")
        embedding_id = run.stats.get("embedding_run_id")
        if not lexical_id or not embedding_id:
            raise ValueError("Hybrid run missing lexical or embedding run identifiers")
        lexical_run = corpus.load_run(str(lexical_id))
        embedding_run = corpus.load_run(str(embedding_id))
        expanded = _expand_component_budget(budget)
        lexical_result = lexical.query(
            corpus, run=lexical_run, query_text=query_text, budget=expanded
        )
        embedding_result = embedding.query(
            corpus, run=embedding_run, query_text=query_text, budget=expanded
        )
        fused = _fuse_evidence(
            lexical_result.evidence,
            embedding_result.evidence,
            lexical_weight=parsed.lexical_weight,
            embedding_weight=parsed.embedding_weight,
        )
        ordered = sorted(fused, key=lambda item: (-item.score, item.item_id))
        annotated: List[Evidence] = []
        for position, item in enumerate(ordered, start=1):
            annotated.append(
                item.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": run.recipe.recipe_id,
                        "run_id": run.run_id,
                    }
                )
            )
        selected = apply_budget(annotated, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=selected,
            stats={
                "candidates": len(ordered),
                "returned": len(selected),
                "fusion_weights": {
                    "lexical": parsed.lexical_weight,
                    "embedding": parsed.embedding_weight,
                },
            },
        )
177
+
178
+
179
def _ensure_backend_supported(recipe_config: HybridRecipeConfig) -> None:
    """
    Validate that hybrid backends do not reference the hybrid backend itself.

    Guards against infinite recursion: a hybrid component must be a leaf
    backend, never another hybrid.

    :param recipe_config: Parsed hybrid recipe configuration.
    :type recipe_config: HybridRecipeConfig
    :return: None.
    :rtype: None
    :raises ValueError: If hybrid is used as a component backend.
    """
    hybrid_id = HybridBackend.backend_id
    if recipe_config.lexical_backend == hybrid_id:
        raise ValueError("Hybrid backend cannot use itself as the lexical backend")
    if recipe_config.embedding_backend == hybrid_id:
        raise ValueError("Hybrid backend cannot use itself as the embedding backend")
193
+
194
+
195
def _resolve_backend(backend_id: str):
    """
    Resolve a backend by identifier.

    The import is deferred to call time — presumably to avoid a circular
    import with the package module, which imports this backend; confirm
    before moving it to module level.

    :param backend_id: Backend identifier.
    :type backend_id: str
    :return: Backend instance.
    :rtype: object
    """
    from . import get_backend as _lookup

    return _lookup(backend_id)
207
+
208
+
209
def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
    """
    Expand a final budget to collect more candidates for fusion.

    Component backends are queried with a larger item (and, when bounded,
    character) limit so the fusion step has enough overlap to combine.
    The per-source cap is passed through unchanged.

    :param budget: Final evidence budget.
    :type budget: QueryBudget
    :param multiplier: Candidate expansion multiplier.
    :type multiplier: int
    :return: Expanded budget for component backends.
    :rtype: QueryBudget
    """
    character_limit = budget.max_total_characters
    if character_limit is not None:
        character_limit = character_limit * multiplier
    return QueryBudget(
        max_total_items=budget.max_total_items * multiplier,
        max_total_characters=character_limit,
        max_items_per_source=budget.max_items_per_source,
    )
229
+
230
+
231
def _fuse_evidence(
    lexical: List[Evidence],
    embedding: List[Evidence],
    *,
    lexical_weight: float,
    embedding_weight: float,
) -> List[Evidence]:
    """
    Fuse lexical and embedding evidence lists into hybrid candidates.

    Scores are combined as a weighted sum; an item present in only one list
    contributes 0.0 for the missing side. Snippet fields are taken from the
    lexical evidence when available, otherwise from the embedding evidence.

    :param lexical: Lexical evidence list.
    :type lexical: list[Evidence]
    :param embedding: Embedding evidence list.
    :type embedding: list[Evidence]
    :param lexical_weight: Lexical score weight.
    :type lexical_weight: float
    :param embedding_weight: Embedding score weight.
    :type embedding_weight: float
    :return: Hybrid evidence list.
    :rtype: list[Evidence]
    """
    # Last occurrence wins on duplicate item_ids, matching dict assignment.
    lexical_by_id = {item.item_id: item for item in lexical}
    embedding_by_id = {item.item_id: item for item in embedding}
    # Preserve first-seen order: lexical hits first, then embedding-only hits.
    ordered_ids: Dict[str, None] = {}
    for item in lexical:
        ordered_ids.setdefault(item.item_id, None)
    for item in embedding:
        ordered_ids.setdefault(item.item_id, None)

    fused: List[Evidence] = []
    for item_id in ordered_ids:
        lexical_hit = lexical_by_id.get(item_id)
        embedding_hit = embedding_by_id.get(item_id)
        lexical_score = lexical_hit.score if lexical_hit else 0.0
        embedding_score = embedding_hit.score if embedding_hit else 0.0
        primary = lexical_hit or embedding_hit
        fused.append(
            Evidence(
                item_id=item_id,
                source_uri=primary.source_uri,
                media_type=primary.media_type,
                score=(lexical_score * lexical_weight) + (embedding_score * embedding_weight),
                rank=1,  # placeholder; real ranks are assigned after sorting
                text=primary.text,
                content_ref=primary.content_ref,
                span_start=primary.span_start,
                span_end=primary.span_end,
                stage="hybrid",
                stage_scores={"lexical": lexical_score, "embedding": embedding_score},
                recipe_id="",  # filled in later by the caller via model_copy
                run_id="",
                hash=primary.hash,
            )
        )
    return fused
@@ -6,9 +6,9 @@ from __future__ import annotations
6
6
 
7
7
  import sqlite3
8
8
  from pathlib import Path
9
- from typing import Dict, Iterable, List, Optional, Tuple
9
+ from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
10
10
 
11
- from pydantic import BaseModel, ConfigDict, Field
11
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
12
12
 
13
13
  from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
14
14
  from ..corpus import Corpus
@@ -35,6 +35,28 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
35
35
  :vartype chunk_overlap: int
36
36
  :ivar snippet_characters: Maximum characters to include in evidence snippets.
37
37
  :vartype snippet_characters: int
38
+ :ivar bm25_k1: BM25 k1 tuning parameter.
39
+ :vartype bm25_k1: float
40
+ :ivar bm25_b: BM25 b tuning parameter.
41
+ :vartype bm25_b: float
42
+ :ivar ngram_min: Minimum n-gram size for lexical tuning.
43
+ :vartype ngram_min: int
44
+ :ivar ngram_max: Maximum n-gram size for lexical tuning.
45
+ :vartype ngram_max: int
46
+ :ivar stop_words: Optional stop word policy or list.
47
+ :vartype stop_words: str or list[str] or None
48
+ :ivar field_weight_title: Relative weight for title field matches.
49
+ :vartype field_weight_title: float
50
+ :ivar field_weight_body: Relative weight for body field matches.
51
+ :vartype field_weight_body: float
52
+ :ivar field_weight_tags: Relative weight for tag field matches.
53
+ :vartype field_weight_tags: float
54
+ :ivar rerank_enabled: Whether to apply reranking to retrieved candidates.
55
+ :vartype rerank_enabled: bool
56
+ :ivar rerank_model: Reranker model identifier for metadata.
57
+ :vartype rerank_model: str or None
58
+ :ivar rerank_top_k: Number of candidates to rerank.
59
+ :vartype rerank_top_k: int
38
60
  :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
39
61
  :vartype extraction_run: str or None
40
62
  """
@@ -44,8 +66,81 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
44
66
  chunk_size: int = Field(default=800, ge=1)
45
67
  chunk_overlap: int = Field(default=200, ge=0)
46
68
  snippet_characters: int = Field(default=400, ge=1)
69
+ bm25_k1: float = Field(default=1.2, gt=0)
70
+ bm25_b: float = Field(default=0.75, ge=0, le=1)
71
+ ngram_min: int = Field(default=1, ge=1)
72
+ ngram_max: int = Field(default=1, ge=1)
73
+ stop_words: Optional[Union[str, List[str]]] = None
74
+ field_weight_title: float = Field(default=1.0, ge=0)
75
+ field_weight_body: float = Field(default=1.0, ge=0)
76
+ field_weight_tags: float = Field(default=1.0, ge=0)
77
+ rerank_enabled: bool = False
78
+ rerank_model: Optional[str] = None
79
+ rerank_top_k: int = Field(default=10, ge=1)
47
80
  extraction_run: Optional[str] = None
48
81
 
82
+ @field_validator("stop_words")
83
+ @classmethod
84
+ def _validate_stop_words(
85
+ cls, value: Optional[Union[str, List[str]]]
86
+ ) -> Optional[Union[str, List[str]]]:
87
+ if value is None:
88
+ return None
89
+ if isinstance(value, str):
90
+ if value.lower() == "english":
91
+ return "english"
92
+ raise ValueError("stop_words must be 'english' or a list of strings")
93
+ if not value:
94
+ raise ValueError("stop_words list must not be empty")
95
+ if any(not isinstance(token, str) or not token.strip() for token in value):
96
+ raise ValueError("stop_words list must contain non-empty strings")
97
+ return value
98
+
99
+ @model_validator(mode="after")
100
+ def _validate_ngram_range(self) -> "SqliteFullTextSearchRecipeConfig":
101
+ if self.ngram_min > self.ngram_max:
102
+ raise ValueError("Invalid ngram range: ngram_min must be <= ngram_max")
103
+ if self.rerank_enabled and not self.rerank_model:
104
+ raise ValueError("Rerank enabled requires rerank_model")
105
+ return self
106
+
107
+
108
+ _ENGLISH_STOP_WORDS: Set[str] = {
109
+ "a",
110
+ "an",
111
+ "and",
112
+ "are",
113
+ "as",
114
+ "at",
115
+ "be",
116
+ "but",
117
+ "by",
118
+ "for",
119
+ "if",
120
+ "in",
121
+ "into",
122
+ "is",
123
+ "it",
124
+ "no",
125
+ "not",
126
+ "of",
127
+ "on",
128
+ "or",
129
+ "such",
130
+ "that",
131
+ "the",
132
+ "their",
133
+ "then",
134
+ "there",
135
+ "these",
136
+ "they",
137
+ "this",
138
+ "to",
139
+ "was",
140
+ "will",
141
+ "with",
142
+ }
143
+
49
144
 
50
145
  class SqliteFullTextSearchBackend:
51
146
  """
@@ -118,29 +213,39 @@ class SqliteFullTextSearchBackend:
118
213
  :rtype: RetrievalResult
119
214
  """
120
215
  recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
216
+ query_tokens = _tokenize_query(query_text)
217
+ stop_words = _resolve_stop_words(recipe_config.stop_words)
218
+ filtered_tokens = _apply_stop_words(query_tokens, stop_words)
219
+ if not filtered_tokens:
220
+ return RetrievalResult(
221
+ query_text=query_text,
222
+ budget=budget,
223
+ run_id=run.run_id,
224
+ recipe_id=run.recipe.recipe_id,
225
+ backend_id=self.backend_id,
226
+ generated_at=utc_now_iso(),
227
+ evidence=[],
228
+ stats={"candidates": 0, "returned": 0},
229
+ )
121
230
  db_path = _resolve_run_db_path(corpus, run)
122
231
  candidates = _query_full_text_search_index(
123
232
  db_path=db_path,
124
- query_text=query_text,
233
+ query_text=" ".join(filtered_tokens),
125
234
  limit=_candidate_limit(budget.max_total_items),
126
235
  snippet_characters=recipe_config.snippet_characters,
127
236
  )
128
- sorted_candidates = sorted(
129
- candidates,
130
- key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
237
+ sorted_candidates = _rank_candidates(candidates)
238
+ evidence = _apply_rerank_if_enabled(
239
+ sorted_candidates,
240
+ query_tokens=filtered_tokens,
241
+ run=run,
242
+ budget=budget,
243
+ rerank_enabled=recipe_config.rerank_enabled,
244
+ rerank_top_k=recipe_config.rerank_top_k,
131
245
  )
132
- ranked = [
133
- evidence_item.model_copy(
134
- update={
135
- "rank": index,
136
- "recipe_id": run.recipe.recipe_id,
137
- "run_id": run.run_id,
138
- }
139
- )
140
- for index, evidence_item in enumerate(sorted_candidates, start=1)
141
- ]
142
- evidence = apply_budget(ranked, budget)
143
- stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
246
+ stats: Dict[str, object] = {"candidates": len(sorted_candidates), "returned": len(evidence)}
247
+ if recipe_config.rerank_enabled:
248
+ stats["reranked_candidates"] = min(len(sorted_candidates), recipe_config.rerank_top_k)
144
249
  return RetrievalResult(
145
250
  query_text=query_text,
146
251
  budget=budget,
@@ -165,6 +270,147 @@ def _candidate_limit(max_total_items: int) -> int:
165
270
  return max_total_items * 5
166
271
 
167
272
 
273
+ def _tokenize_query(query_text: str) -> List[str]:
274
+ """
275
+ Tokenize a query string into lowercased terms.
276
+
277
+ :param query_text: Raw query text.
278
+ :type query_text: str
279
+ :return: Token list.
280
+ :rtype: list[str]
281
+ """
282
+ return [token for token in query_text.lower().split() if token]
283
+
284
+
285
+ def _resolve_stop_words(value: Optional[Union[str, List[str]]]) -> Set[str]:
286
+ """
287
+ Resolve stop words based on a configuration value.
288
+
289
+ :param value: Stop word configuration.
290
+ :type value: str or list[str] or None
291
+ :return: Stop word set.
292
+ :rtype: set[str]
293
+ """
294
+ if value is None:
295
+ return set()
296
+ if isinstance(value, str):
297
+ return set(_ENGLISH_STOP_WORDS)
298
+ return {token.strip().lower() for token in value if token.strip()}
299
+
300
+
301
+ def _apply_stop_words(tokens: List[str], stop_words: Set[str]) -> List[str]:
302
+ """
303
+ Filter query tokens by a stop word list.
304
+
305
+ :param tokens: Token list.
306
+ :type tokens: list[str]
307
+ :param stop_words: Stop word set.
308
+ :type stop_words: set[str]
309
+ :return: Filtered token list.
310
+ :rtype: list[str]
311
+ """
312
+ if not stop_words:
313
+ return tokens
314
+ return [token for token in tokens if token not in stop_words]
315
+
316
+
317
+ def _rank_candidates(candidates: List[Evidence]) -> List[Evidence]:
318
+ """
319
+ Sort evidence candidates by descending score.
320
+
321
+ :param candidates: Evidence list to sort.
322
+ :type candidates: list[Evidence]
323
+ :return: Sorted evidence list.
324
+ :rtype: list[Evidence]
325
+ """
326
+ return sorted(
327
+ candidates, key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id)
328
+ )
329
+
330
+
331
+ def _rerank_score(text: str, query_tokens: List[str]) -> float:
332
+ """
333
+ Compute a simple rerank score using token overlap.
334
+
335
+ :param text: Candidate text.
336
+ :type text: str
337
+ :param query_tokens: Query tokens.
338
+ :type query_tokens: list[str]
339
+ :return: Rerank score.
340
+ :rtype: float
341
+ """
342
+ lower_text = text.lower() if text else ""
343
+ return float(sum(1 for token in query_tokens if token in lower_text))
344
+
345
+
346
def _apply_rerank_if_enabled(
    candidates: List[Evidence],
    *,
    query_tokens: List[str],
    run: RetrievalRun,
    budget: QueryBudget,
    rerank_enabled: bool,
    rerank_top_k: int,
) -> List[Evidence]:
    """
    Rerank candidates when enabled, otherwise apply the budget to ranked results.

    NOTE(review): when reranking is enabled, candidates beyond
    ``rerank_top_k`` are dropped entirely rather than appended after the
    reranked head — confirm this is the intended contract.

    :param candidates: Ranked candidate evidence.
    :type candidates: list[Evidence]
    :param query_tokens: Query tokens used for reranking.
    :type query_tokens: list[str]
    :param run: Retrieval run to annotate evidence with.
    :type run: RetrievalRun
    :param budget: Evidence selection budget.
    :type budget: QueryBudget
    :param rerank_enabled: Whether reranking is enabled.
    :type rerank_enabled: bool
    :param rerank_top_k: Maximum candidates to rerank.
    :type rerank_top_k: int
    :return: Evidence list respecting budget.
    :rtype: list[Evidence]
    """
    if not rerank_enabled:
        return _annotate_and_budget(candidates, run=run, budget=budget)

    # Score only the top-k head by token overlap, then re-sort by the new score.
    rerank_candidates = candidates[:rerank_top_k]
    reranked: List[Evidence] = []
    for evidence_item in rerank_candidates:
        rerank_score = _rerank_score(evidence_item.text or "", query_tokens)
        reranked.append(
            evidence_item.model_copy(
                update={
                    "score": rerank_score,
                    "stage": "rerank",
                    "stage_scores": {"retrieve": evidence_item.score, "rerank": rerank_score},
                }
            )
        )
    return _annotate_and_budget(_rank_candidates(reranked), run=run, budget=budget)


def _annotate_and_budget(
    candidates: List[Evidence], *, run: RetrievalRun, budget: QueryBudget
) -> List[Evidence]:
    """
    Assign 1-based ranks and run/recipe identifiers, then apply the budget.

    Shared by both branches of :func:`_apply_rerank_if_enabled` to avoid
    duplicating the annotation logic.

    :param candidates: Sorted candidate evidence.
    :type candidates: list[Evidence]
    :param run: Retrieval run to annotate evidence with.
    :type run: RetrievalRun
    :param budget: Evidence selection budget.
    :type budget: QueryBudget
    :return: Budgeted, annotated evidence list.
    :rtype: list[Evidence]
    """
    ranked = [
        evidence_item.model_copy(
            update={
                "rank": index,
                "recipe_id": run.recipe.recipe_id,
                "run_id": run.run_id,
            }
        )
        for index, evidence_item in enumerate(candidates, start=1)
    ]
    return apply_budget(ranked, budget)
412
+
413
+
168
414
  def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
169
415
  """
170
416
  Resolve the SQLite index path for a retrieval run.
@@ -213,8 +459,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
213
459
  :return: None.
214
460
  :rtype: None
215
461
  """
216
- conn.execute(
217
- """
462
+ conn.execute("""
218
463
  CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
219
464
  content,
220
465
  item_id UNINDEXED,
@@ -225,8 +470,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
225
470
  start_offset UNINDEXED,
226
471
  end_offset UNINDEXED
227
472
  )
228
- """
229
- )
473
+ """)
230
474
 
231
475
 
232
476
  def _build_full_text_search_index(