biblicus 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +0 -1
- biblicus/_vendor/dotyaml/interpolation.py +0 -1
- biblicus/_vendor/dotyaml/loader.py +0 -1
- biblicus/_vendor/dotyaml/transformer.py +0 -1
- biblicus/analysis/__init__.py +2 -0
- biblicus/analysis/models.py +228 -5
- biblicus/analysis/profiling.py +337 -0
- biblicus/analysis/topic_modeling.py +3 -6
- biblicus/backends/__init__.py +4 -0
- biblicus/backends/hybrid.py +284 -0
- biblicus/backends/sqlite_full_text_search.py +266 -22
- biblicus/backends/vector.py +460 -0
- biblicus/cli.py +83 -4
- biblicus/corpus.py +9 -3
- biblicus/evidence_processing.py +4 -2
- biblicus/extraction.py +3 -1
- biblicus/extractors/markitdown_text.py +1 -0
- biblicus/extractors/paddleocr_vl_text.py +1 -3
- biblicus/models.py +3 -0
- biblicus/user_config.py +2 -6
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/METADATA +13 -6
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/RECORD +27 -24
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/WHEEL +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.11.0.dist-info}/top_level.txt +0 -0
biblicus/backends/__init__.py
CHANGED
|
@@ -7,8 +7,10 @@ from __future__ import annotations
|
|
|
7
7
|
from typing import Dict, Type
|
|
8
8
|
|
|
9
9
|
from .base import RetrievalBackend
|
|
10
|
+
from .hybrid import HybridBackend
|
|
10
11
|
from .scan import ScanBackend
|
|
11
12
|
from .sqlite_full_text_search import SqliteFullTextSearchBackend
|
|
13
|
+
from .vector import VectorBackend
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
def available_backends() -> Dict[str, Type[RetrievalBackend]]:
|
|
@@ -19,8 +21,10 @@ def available_backends() -> Dict[str, Type[RetrievalBackend]]:
|
|
|
19
21
|
:rtype: dict[str, Type[RetrievalBackend]]
|
|
20
22
|
"""
|
|
21
23
|
return {
|
|
24
|
+
HybridBackend.backend_id: HybridBackend,
|
|
22
25
|
ScanBackend.backend_id: ScanBackend,
|
|
23
26
|
SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
|
|
27
|
+
VectorBackend.backend_id: VectorBackend,
|
|
24
28
|
}
|
|
25
29
|
|
|
26
30
|
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hybrid retrieval backend combining lexical and vector results.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
|
|
13
|
+
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest
|
|
14
|
+
from ..time import utc_now_iso
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HybridRecipeConfig(BaseModel):
|
|
18
|
+
"""
|
|
19
|
+
Configuration for hybrid retrieval fusion.
|
|
20
|
+
|
|
21
|
+
:ivar lexical_backend: Backend identifier for lexical retrieval.
|
|
22
|
+
:vartype lexical_backend: str
|
|
23
|
+
:ivar embedding_backend: Backend identifier for embedding retrieval.
|
|
24
|
+
:vartype embedding_backend: str
|
|
25
|
+
:ivar lexical_weight: Weight for lexical scores.
|
|
26
|
+
:vartype lexical_weight: float
|
|
27
|
+
:ivar embedding_weight: Weight for embedding scores.
|
|
28
|
+
:vartype embedding_weight: float
|
|
29
|
+
:ivar lexical_config: Optional lexical backend configuration.
|
|
30
|
+
:vartype lexical_config: dict[str, object]
|
|
31
|
+
:ivar embedding_config: Optional embedding backend configuration.
|
|
32
|
+
:vartype embedding_config: dict[str, object]
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
model_config = ConfigDict(extra="forbid")
|
|
36
|
+
|
|
37
|
+
lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
|
|
38
|
+
embedding_backend: str = Field(default="vector", min_length=1)
|
|
39
|
+
lexical_weight: float = Field(default=0.5, ge=0, le=1)
|
|
40
|
+
embedding_weight: float = Field(default=0.5, ge=0, le=1)
|
|
41
|
+
lexical_config: Dict[str, object] = Field(default_factory=dict)
|
|
42
|
+
embedding_config: Dict[str, object] = Field(default_factory=dict)
|
|
43
|
+
|
|
44
|
+
@model_validator(mode="after")
|
|
45
|
+
def _validate_weights(self) -> "HybridRecipeConfig":
|
|
46
|
+
if abs((self.lexical_weight + self.embedding_weight) - 1.0) > 1e-6:
|
|
47
|
+
raise ValueError("weights must sum to 1")
|
|
48
|
+
return self
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class HybridBackend:
|
|
52
|
+
"""
|
|
53
|
+
Hybrid backend that fuses lexical and embedding retrieval.
|
|
54
|
+
|
|
55
|
+
:ivar backend_id: Backend identifier.
|
|
56
|
+
:vartype backend_id: str
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
backend_id = "hybrid"
|
|
60
|
+
|
|
61
|
+
def build_run(
|
|
62
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
63
|
+
) -> RetrievalRun:
|
|
64
|
+
"""
|
|
65
|
+
Build or register a hybrid retrieval run.
|
|
66
|
+
|
|
67
|
+
:param corpus: Corpus to build against.
|
|
68
|
+
:type corpus: Corpus
|
|
69
|
+
:param recipe_name: Human-readable recipe name.
|
|
70
|
+
:type recipe_name: str
|
|
71
|
+
:param config: Backend-specific configuration values.
|
|
72
|
+
:type config: dict[str, object]
|
|
73
|
+
:return: Run manifest describing the build.
|
|
74
|
+
:rtype: RetrievalRun
|
|
75
|
+
"""
|
|
76
|
+
recipe_config = HybridRecipeConfig.model_validate(config)
|
|
77
|
+
_ensure_backend_supported(recipe_config)
|
|
78
|
+
lexical_backend = _resolve_backend(recipe_config.lexical_backend)
|
|
79
|
+
embedding_backend = _resolve_backend(recipe_config.embedding_backend)
|
|
80
|
+
lexical_run = lexical_backend.build_run(
|
|
81
|
+
corpus, recipe_name=f"{recipe_name}-lexical", config=recipe_config.lexical_config
|
|
82
|
+
)
|
|
83
|
+
embedding_run = embedding_backend.build_run(
|
|
84
|
+
corpus, recipe_name=f"{recipe_name}-embedding", config=recipe_config.embedding_config
|
|
85
|
+
)
|
|
86
|
+
recipe = create_recipe_manifest(
|
|
87
|
+
backend_id=self.backend_id,
|
|
88
|
+
name=recipe_name,
|
|
89
|
+
config=recipe_config.model_dump(),
|
|
90
|
+
)
|
|
91
|
+
stats = {
|
|
92
|
+
"lexical_run_id": lexical_run.run_id,
|
|
93
|
+
"embedding_run_id": embedding_run.run_id,
|
|
94
|
+
}
|
|
95
|
+
run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
|
|
96
|
+
corpus.write_run(run)
|
|
97
|
+
return run
|
|
98
|
+
|
|
99
|
+
def query(
|
|
100
|
+
self,
|
|
101
|
+
corpus: Corpus,
|
|
102
|
+
*,
|
|
103
|
+
run: RetrievalRun,
|
|
104
|
+
query_text: str,
|
|
105
|
+
budget: QueryBudget,
|
|
106
|
+
) -> RetrievalResult:
|
|
107
|
+
"""
|
|
108
|
+
Query using both lexical and embedding backends and fuse scores.
|
|
109
|
+
|
|
110
|
+
:param corpus: Corpus associated with the run.
|
|
111
|
+
:type corpus: Corpus
|
|
112
|
+
:param run: Run manifest to use for querying.
|
|
113
|
+
:type run: RetrievalRun
|
|
114
|
+
:param query_text: Query text to execute.
|
|
115
|
+
:type query_text: str
|
|
116
|
+
:param budget: Evidence selection budget.
|
|
117
|
+
:type budget: QueryBudget
|
|
118
|
+
:return: Retrieval results containing evidence.
|
|
119
|
+
:rtype: RetrievalResult
|
|
120
|
+
"""
|
|
121
|
+
recipe_config = HybridRecipeConfig.model_validate(run.recipe.config)
|
|
122
|
+
_ensure_backend_supported(recipe_config)
|
|
123
|
+
lexical_backend = _resolve_backend(recipe_config.lexical_backend)
|
|
124
|
+
embedding_backend = _resolve_backend(recipe_config.embedding_backend)
|
|
125
|
+
lexical_run_id = run.stats.get("lexical_run_id")
|
|
126
|
+
embedding_run_id = run.stats.get("embedding_run_id")
|
|
127
|
+
if not lexical_run_id or not embedding_run_id:
|
|
128
|
+
raise ValueError("Hybrid run missing lexical or embedding run identifiers")
|
|
129
|
+
lexical_run = corpus.load_run(str(lexical_run_id))
|
|
130
|
+
embedding_run = corpus.load_run(str(embedding_run_id))
|
|
131
|
+
component_budget = _expand_component_budget(budget)
|
|
132
|
+
lexical_result = lexical_backend.query(
|
|
133
|
+
corpus, run=lexical_run, query_text=query_text, budget=component_budget
|
|
134
|
+
)
|
|
135
|
+
embedding_result = embedding_backend.query(
|
|
136
|
+
corpus, run=embedding_run, query_text=query_text, budget=component_budget
|
|
137
|
+
)
|
|
138
|
+
candidates = _fuse_evidence(
|
|
139
|
+
lexical_result.evidence,
|
|
140
|
+
embedding_result.evidence,
|
|
141
|
+
lexical_weight=recipe_config.lexical_weight,
|
|
142
|
+
embedding_weight=recipe_config.embedding_weight,
|
|
143
|
+
)
|
|
144
|
+
sorted_candidates = sorted(
|
|
145
|
+
candidates,
|
|
146
|
+
key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
|
|
147
|
+
)
|
|
148
|
+
ranked = [
|
|
149
|
+
evidence_item.model_copy(
|
|
150
|
+
update={
|
|
151
|
+
"rank": index,
|
|
152
|
+
"recipe_id": run.recipe.recipe_id,
|
|
153
|
+
"run_id": run.run_id,
|
|
154
|
+
}
|
|
155
|
+
)
|
|
156
|
+
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
157
|
+
]
|
|
158
|
+
evidence = apply_budget(ranked, budget)
|
|
159
|
+
stats = {
|
|
160
|
+
"candidates": len(sorted_candidates),
|
|
161
|
+
"returned": len(evidence),
|
|
162
|
+
"fusion_weights": {
|
|
163
|
+
"lexical": recipe_config.lexical_weight,
|
|
164
|
+
"embedding": recipe_config.embedding_weight,
|
|
165
|
+
},
|
|
166
|
+
}
|
|
167
|
+
return RetrievalResult(
|
|
168
|
+
query_text=query_text,
|
|
169
|
+
budget=budget,
|
|
170
|
+
run_id=run.run_id,
|
|
171
|
+
recipe_id=run.recipe.recipe_id,
|
|
172
|
+
backend_id=self.backend_id,
|
|
173
|
+
generated_at=utc_now_iso(),
|
|
174
|
+
evidence=evidence,
|
|
175
|
+
stats=stats,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _ensure_backend_supported(recipe_config: HybridRecipeConfig) -> None:
|
|
180
|
+
"""
|
|
181
|
+
Validate that hybrid backends do not reference the hybrid backend itself.
|
|
182
|
+
|
|
183
|
+
:param recipe_config: Parsed hybrid recipe configuration.
|
|
184
|
+
:type recipe_config: HybridRecipeConfig
|
|
185
|
+
:return: None.
|
|
186
|
+
:rtype: None
|
|
187
|
+
:raises ValueError: If hybrid is used as a component backend.
|
|
188
|
+
"""
|
|
189
|
+
if recipe_config.lexical_backend == HybridBackend.backend_id:
|
|
190
|
+
raise ValueError("Hybrid backend cannot use itself as the lexical backend")
|
|
191
|
+
if recipe_config.embedding_backend == HybridBackend.backend_id:
|
|
192
|
+
raise ValueError("Hybrid backend cannot use itself as the embedding backend")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _resolve_backend(backend_id: str):
|
|
196
|
+
"""
|
|
197
|
+
Resolve a backend by identifier.
|
|
198
|
+
|
|
199
|
+
:param backend_id: Backend identifier.
|
|
200
|
+
:type backend_id: str
|
|
201
|
+
:return: Backend instance.
|
|
202
|
+
:rtype: object
|
|
203
|
+
"""
|
|
204
|
+
from . import get_backend
|
|
205
|
+
|
|
206
|
+
return get_backend(backend_id)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
|
|
210
|
+
"""
|
|
211
|
+
Expand a final budget to collect more candidates for fusion.
|
|
212
|
+
|
|
213
|
+
:param budget: Final evidence budget.
|
|
214
|
+
:type budget: QueryBudget
|
|
215
|
+
:param multiplier: Candidate expansion multiplier.
|
|
216
|
+
:type multiplier: int
|
|
217
|
+
:return: Expanded budget for component backends.
|
|
218
|
+
:rtype: QueryBudget
|
|
219
|
+
"""
|
|
220
|
+
max_total_characters = budget.max_total_characters
|
|
221
|
+
expanded_characters = (
|
|
222
|
+
max_total_characters * multiplier if max_total_characters is not None else None
|
|
223
|
+
)
|
|
224
|
+
return QueryBudget(
|
|
225
|
+
max_total_items=budget.max_total_items * multiplier,
|
|
226
|
+
max_total_characters=expanded_characters,
|
|
227
|
+
max_items_per_source=budget.max_items_per_source,
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _fuse_evidence(
|
|
232
|
+
lexical: List[Evidence],
|
|
233
|
+
embedding: List[Evidence],
|
|
234
|
+
*,
|
|
235
|
+
lexical_weight: float,
|
|
236
|
+
embedding_weight: float,
|
|
237
|
+
) -> List[Evidence]:
|
|
238
|
+
"""
|
|
239
|
+
Fuse lexical and embedding evidence lists into hybrid candidates.
|
|
240
|
+
|
|
241
|
+
:param lexical: Lexical evidence list.
|
|
242
|
+
:type lexical: list[Evidence]
|
|
243
|
+
:param embedding: Embedding evidence list.
|
|
244
|
+
:type embedding: list[Evidence]
|
|
245
|
+
:param lexical_weight: Lexical score weight.
|
|
246
|
+
:type lexical_weight: float
|
|
247
|
+
:param embedding_weight: Embedding score weight.
|
|
248
|
+
:type embedding_weight: float
|
|
249
|
+
:return: Hybrid evidence list.
|
|
250
|
+
:rtype: list[Evidence]
|
|
251
|
+
"""
|
|
252
|
+
merged: Dict[str, Dict[str, Optional[Evidence]]] = {}
|
|
253
|
+
for evidence_item in lexical:
|
|
254
|
+
merged.setdefault(evidence_item.item_id, {})["lexical"] = evidence_item
|
|
255
|
+
for evidence_item in embedding:
|
|
256
|
+
merged.setdefault(evidence_item.item_id, {})["embedding"] = evidence_item
|
|
257
|
+
|
|
258
|
+
candidates: List[Evidence] = []
|
|
259
|
+
for item_id, sources in merged.items():
|
|
260
|
+
lexical_evidence = sources.get("lexical")
|
|
261
|
+
embedding_evidence = sources.get("embedding")
|
|
262
|
+
lexical_score = lexical_evidence.score if lexical_evidence else 0.0
|
|
263
|
+
embedding_score = embedding_evidence.score if embedding_evidence else 0.0
|
|
264
|
+
combined_score = (lexical_score * lexical_weight) + (embedding_score * embedding_weight)
|
|
265
|
+
base_evidence = lexical_evidence or embedding_evidence
|
|
266
|
+
candidates.append(
|
|
267
|
+
Evidence(
|
|
268
|
+
item_id=item_id,
|
|
269
|
+
source_uri=base_evidence.source_uri,
|
|
270
|
+
media_type=base_evidence.media_type,
|
|
271
|
+
score=combined_score,
|
|
272
|
+
rank=1,
|
|
273
|
+
text=base_evidence.text,
|
|
274
|
+
content_ref=base_evidence.content_ref,
|
|
275
|
+
span_start=base_evidence.span_start,
|
|
276
|
+
span_end=base_evidence.span_end,
|
|
277
|
+
stage="hybrid",
|
|
278
|
+
stage_scores={"lexical": lexical_score, "embedding": embedding_score},
|
|
279
|
+
recipe_id="",
|
|
280
|
+
run_id="",
|
|
281
|
+
hash=base_evidence.hash,
|
|
282
|
+
)
|
|
283
|
+
)
|
|
284
|
+
return candidates
|
|
@@ -6,9 +6,9 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import sqlite3
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Dict, Iterable, List, Optional, Tuple
|
|
9
|
+
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
10
10
|
|
|
11
|
-
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
12
12
|
|
|
13
13
|
from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
|
|
14
14
|
from ..corpus import Corpus
|
|
@@ -35,6 +35,28 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
35
35
|
:vartype chunk_overlap: int
|
|
36
36
|
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
37
37
|
:vartype snippet_characters: int
|
|
38
|
+
:ivar bm25_k1: BM25 k1 tuning parameter.
|
|
39
|
+
:vartype bm25_k1: float
|
|
40
|
+
:ivar bm25_b: BM25 b tuning parameter.
|
|
41
|
+
:vartype bm25_b: float
|
|
42
|
+
:ivar ngram_min: Minimum n-gram size for lexical tuning.
|
|
43
|
+
:vartype ngram_min: int
|
|
44
|
+
:ivar ngram_max: Maximum n-gram size for lexical tuning.
|
|
45
|
+
:vartype ngram_max: int
|
|
46
|
+
:ivar stop_words: Optional stop word policy or list.
|
|
47
|
+
:vartype stop_words: str or list[str] or None
|
|
48
|
+
:ivar field_weight_title: Relative weight for title field matches.
|
|
49
|
+
:vartype field_weight_title: float
|
|
50
|
+
:ivar field_weight_body: Relative weight for body field matches.
|
|
51
|
+
:vartype field_weight_body: float
|
|
52
|
+
:ivar field_weight_tags: Relative weight for tag field matches.
|
|
53
|
+
:vartype field_weight_tags: float
|
|
54
|
+
:ivar rerank_enabled: Whether to apply reranking to retrieved candidates.
|
|
55
|
+
:vartype rerank_enabled: bool
|
|
56
|
+
:ivar rerank_model: Reranker model identifier for metadata.
|
|
57
|
+
:vartype rerank_model: str or None
|
|
58
|
+
:ivar rerank_top_k: Number of candidates to rerank.
|
|
59
|
+
:vartype rerank_top_k: int
|
|
38
60
|
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
39
61
|
:vartype extraction_run: str or None
|
|
40
62
|
"""
|
|
@@ -44,8 +66,81 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
44
66
|
chunk_size: int = Field(default=800, ge=1)
|
|
45
67
|
chunk_overlap: int = Field(default=200, ge=0)
|
|
46
68
|
snippet_characters: int = Field(default=400, ge=1)
|
|
69
|
+
bm25_k1: float = Field(default=1.2, gt=0)
|
|
70
|
+
bm25_b: float = Field(default=0.75, ge=0, le=1)
|
|
71
|
+
ngram_min: int = Field(default=1, ge=1)
|
|
72
|
+
ngram_max: int = Field(default=1, ge=1)
|
|
73
|
+
stop_words: Optional[Union[str, List[str]]] = None
|
|
74
|
+
field_weight_title: float = Field(default=1.0, ge=0)
|
|
75
|
+
field_weight_body: float = Field(default=1.0, ge=0)
|
|
76
|
+
field_weight_tags: float = Field(default=1.0, ge=0)
|
|
77
|
+
rerank_enabled: bool = False
|
|
78
|
+
rerank_model: Optional[str] = None
|
|
79
|
+
rerank_top_k: int = Field(default=10, ge=1)
|
|
47
80
|
extraction_run: Optional[str] = None
|
|
48
81
|
|
|
82
|
+
@field_validator("stop_words")
|
|
83
|
+
@classmethod
|
|
84
|
+
def _validate_stop_words(
|
|
85
|
+
cls, value: Optional[Union[str, List[str]]]
|
|
86
|
+
) -> Optional[Union[str, List[str]]]:
|
|
87
|
+
if value is None:
|
|
88
|
+
return None
|
|
89
|
+
if isinstance(value, str):
|
|
90
|
+
if value.lower() == "english":
|
|
91
|
+
return "english"
|
|
92
|
+
raise ValueError("stop_words must be 'english' or a list of strings")
|
|
93
|
+
if not value:
|
|
94
|
+
raise ValueError("stop_words list must not be empty")
|
|
95
|
+
if any(not isinstance(token, str) or not token.strip() for token in value):
|
|
96
|
+
raise ValueError("stop_words list must contain non-empty strings")
|
|
97
|
+
return value
|
|
98
|
+
|
|
99
|
+
@model_validator(mode="after")
|
|
100
|
+
def _validate_ngram_range(self) -> "SqliteFullTextSearchRecipeConfig":
|
|
101
|
+
if self.ngram_min > self.ngram_max:
|
|
102
|
+
raise ValueError("Invalid ngram range: ngram_min must be <= ngram_max")
|
|
103
|
+
if self.rerank_enabled and not self.rerank_model:
|
|
104
|
+
raise ValueError("Rerank enabled requires rerank_model")
|
|
105
|
+
return self
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
_ENGLISH_STOP_WORDS: Set[str] = {
|
|
109
|
+
"a",
|
|
110
|
+
"an",
|
|
111
|
+
"and",
|
|
112
|
+
"are",
|
|
113
|
+
"as",
|
|
114
|
+
"at",
|
|
115
|
+
"be",
|
|
116
|
+
"but",
|
|
117
|
+
"by",
|
|
118
|
+
"for",
|
|
119
|
+
"if",
|
|
120
|
+
"in",
|
|
121
|
+
"into",
|
|
122
|
+
"is",
|
|
123
|
+
"it",
|
|
124
|
+
"no",
|
|
125
|
+
"not",
|
|
126
|
+
"of",
|
|
127
|
+
"on",
|
|
128
|
+
"or",
|
|
129
|
+
"such",
|
|
130
|
+
"that",
|
|
131
|
+
"the",
|
|
132
|
+
"their",
|
|
133
|
+
"then",
|
|
134
|
+
"there",
|
|
135
|
+
"these",
|
|
136
|
+
"they",
|
|
137
|
+
"this",
|
|
138
|
+
"to",
|
|
139
|
+
"was",
|
|
140
|
+
"will",
|
|
141
|
+
"with",
|
|
142
|
+
}
|
|
143
|
+
|
|
49
144
|
|
|
50
145
|
class SqliteFullTextSearchBackend:
|
|
51
146
|
"""
|
|
@@ -118,29 +213,39 @@ class SqliteFullTextSearchBackend:
|
|
|
118
213
|
:rtype: RetrievalResult
|
|
119
214
|
"""
|
|
120
215
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
|
|
216
|
+
query_tokens = _tokenize_query(query_text)
|
|
217
|
+
stop_words = _resolve_stop_words(recipe_config.stop_words)
|
|
218
|
+
filtered_tokens = _apply_stop_words(query_tokens, stop_words)
|
|
219
|
+
if not filtered_tokens:
|
|
220
|
+
return RetrievalResult(
|
|
221
|
+
query_text=query_text,
|
|
222
|
+
budget=budget,
|
|
223
|
+
run_id=run.run_id,
|
|
224
|
+
recipe_id=run.recipe.recipe_id,
|
|
225
|
+
backend_id=self.backend_id,
|
|
226
|
+
generated_at=utc_now_iso(),
|
|
227
|
+
evidence=[],
|
|
228
|
+
stats={"candidates": 0, "returned": 0},
|
|
229
|
+
)
|
|
121
230
|
db_path = _resolve_run_db_path(corpus, run)
|
|
122
231
|
candidates = _query_full_text_search_index(
|
|
123
232
|
db_path=db_path,
|
|
124
|
-
query_text=
|
|
233
|
+
query_text=" ".join(filtered_tokens),
|
|
125
234
|
limit=_candidate_limit(budget.max_total_items),
|
|
126
235
|
snippet_characters=recipe_config.snippet_characters,
|
|
127
236
|
)
|
|
128
|
-
sorted_candidates =
|
|
129
|
-
|
|
130
|
-
|
|
237
|
+
sorted_candidates = _rank_candidates(candidates)
|
|
238
|
+
evidence = _apply_rerank_if_enabled(
|
|
239
|
+
sorted_candidates,
|
|
240
|
+
query_tokens=filtered_tokens,
|
|
241
|
+
run=run,
|
|
242
|
+
budget=budget,
|
|
243
|
+
rerank_enabled=recipe_config.rerank_enabled,
|
|
244
|
+
rerank_top_k=recipe_config.rerank_top_k,
|
|
131
245
|
)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
"rank": index,
|
|
136
|
-
"recipe_id": run.recipe.recipe_id,
|
|
137
|
-
"run_id": run.run_id,
|
|
138
|
-
}
|
|
139
|
-
)
|
|
140
|
-
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
141
|
-
]
|
|
142
|
-
evidence = apply_budget(ranked, budget)
|
|
143
|
-
stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
|
|
246
|
+
stats: Dict[str, object] = {"candidates": len(sorted_candidates), "returned": len(evidence)}
|
|
247
|
+
if recipe_config.rerank_enabled:
|
|
248
|
+
stats["reranked_candidates"] = min(len(sorted_candidates), recipe_config.rerank_top_k)
|
|
144
249
|
return RetrievalResult(
|
|
145
250
|
query_text=query_text,
|
|
146
251
|
budget=budget,
|
|
@@ -165,6 +270,147 @@ def _candidate_limit(max_total_items: int) -> int:
|
|
|
165
270
|
return max_total_items * 5
|
|
166
271
|
|
|
167
272
|
|
|
273
|
+
def _tokenize_query(query_text: str) -> List[str]:
|
|
274
|
+
"""
|
|
275
|
+
Tokenize a query string into lowercased terms.
|
|
276
|
+
|
|
277
|
+
:param query_text: Raw query text.
|
|
278
|
+
:type query_text: str
|
|
279
|
+
:return: Token list.
|
|
280
|
+
:rtype: list[str]
|
|
281
|
+
"""
|
|
282
|
+
return [token for token in query_text.lower().split() if token]
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _resolve_stop_words(value: Optional[Union[str, List[str]]]) -> Set[str]:
|
|
286
|
+
"""
|
|
287
|
+
Resolve stop words based on a configuration value.
|
|
288
|
+
|
|
289
|
+
:param value: Stop word configuration.
|
|
290
|
+
:type value: str or list[str] or None
|
|
291
|
+
:return: Stop word set.
|
|
292
|
+
:rtype: set[str]
|
|
293
|
+
"""
|
|
294
|
+
if value is None:
|
|
295
|
+
return set()
|
|
296
|
+
if isinstance(value, str):
|
|
297
|
+
return set(_ENGLISH_STOP_WORDS)
|
|
298
|
+
return {token.strip().lower() for token in value if token.strip()}
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _apply_stop_words(tokens: List[str], stop_words: Set[str]) -> List[str]:
|
|
302
|
+
"""
|
|
303
|
+
Filter query tokens by a stop word list.
|
|
304
|
+
|
|
305
|
+
:param tokens: Token list.
|
|
306
|
+
:type tokens: list[str]
|
|
307
|
+
:param stop_words: Stop word set.
|
|
308
|
+
:type stop_words: set[str]
|
|
309
|
+
:return: Filtered token list.
|
|
310
|
+
:rtype: list[str]
|
|
311
|
+
"""
|
|
312
|
+
if not stop_words:
|
|
313
|
+
return tokens
|
|
314
|
+
return [token for token in tokens if token not in stop_words]
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _rank_candidates(candidates: List[Evidence]) -> List[Evidence]:
|
|
318
|
+
"""
|
|
319
|
+
Sort evidence candidates by descending score.
|
|
320
|
+
|
|
321
|
+
:param candidates: Evidence list to sort.
|
|
322
|
+
:type candidates: list[Evidence]
|
|
323
|
+
:return: Sorted evidence list.
|
|
324
|
+
:rtype: list[Evidence]
|
|
325
|
+
"""
|
|
326
|
+
return sorted(
|
|
327
|
+
candidates, key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id)
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _rerank_score(text: str, query_tokens: List[str]) -> float:
|
|
332
|
+
"""
|
|
333
|
+
Compute a simple rerank score using token overlap.
|
|
334
|
+
|
|
335
|
+
:param text: Candidate text.
|
|
336
|
+
:type text: str
|
|
337
|
+
:param query_tokens: Query tokens.
|
|
338
|
+
:type query_tokens: list[str]
|
|
339
|
+
:return: Rerank score.
|
|
340
|
+
:rtype: float
|
|
341
|
+
"""
|
|
342
|
+
lower_text = text.lower() if text else ""
|
|
343
|
+
return float(sum(1 for token in query_tokens if token in lower_text))
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _apply_rerank_if_enabled(
|
|
347
|
+
candidates: List[Evidence],
|
|
348
|
+
*,
|
|
349
|
+
query_tokens: List[str],
|
|
350
|
+
run: RetrievalRun,
|
|
351
|
+
budget: QueryBudget,
|
|
352
|
+
rerank_enabled: bool,
|
|
353
|
+
rerank_top_k: int,
|
|
354
|
+
) -> List[Evidence]:
|
|
355
|
+
"""
|
|
356
|
+
Rerank candidates when enabled, otherwise apply the budget to ranked results.
|
|
357
|
+
|
|
358
|
+
:param candidates: Ranked candidate evidence.
|
|
359
|
+
:type candidates: list[Evidence]
|
|
360
|
+
:param query_tokens: Query tokens used for reranking.
|
|
361
|
+
:type query_tokens: list[str]
|
|
362
|
+
:param run: Retrieval run to annotate evidence with.
|
|
363
|
+
:type run: RetrievalRun
|
|
364
|
+
:param budget: Evidence selection budget.
|
|
365
|
+
:type budget: QueryBudget
|
|
366
|
+
:param rerank_enabled: Whether reranking is enabled.
|
|
367
|
+
:type rerank_enabled: bool
|
|
368
|
+
:param rerank_top_k: Maximum candidates to rerank.
|
|
369
|
+
:type rerank_top_k: int
|
|
370
|
+
:return: Evidence list respecting budget.
|
|
371
|
+
:rtype: list[Evidence]
|
|
372
|
+
"""
|
|
373
|
+
if not rerank_enabled:
|
|
374
|
+
ranked = [
|
|
375
|
+
evidence_item.model_copy(
|
|
376
|
+
update={
|
|
377
|
+
"rank": index,
|
|
378
|
+
"recipe_id": run.recipe.recipe_id,
|
|
379
|
+
"run_id": run.run_id,
|
|
380
|
+
}
|
|
381
|
+
)
|
|
382
|
+
for index, evidence_item in enumerate(candidates, start=1)
|
|
383
|
+
]
|
|
384
|
+
return apply_budget(ranked, budget)
|
|
385
|
+
|
|
386
|
+
rerank_limit = min(len(candidates), rerank_top_k)
|
|
387
|
+
rerank_candidates = candidates[:rerank_limit]
|
|
388
|
+
reranked: List[Evidence] = []
|
|
389
|
+
for evidence_item in rerank_candidates:
|
|
390
|
+
rerank_score = _rerank_score(evidence_item.text or "", query_tokens)
|
|
391
|
+
reranked.append(
|
|
392
|
+
evidence_item.model_copy(
|
|
393
|
+
update={
|
|
394
|
+
"score": rerank_score,
|
|
395
|
+
"stage": "rerank",
|
|
396
|
+
"stage_scores": {"retrieve": evidence_item.score, "rerank": rerank_score},
|
|
397
|
+
}
|
|
398
|
+
)
|
|
399
|
+
)
|
|
400
|
+
reranked_sorted = _rank_candidates(reranked)
|
|
401
|
+
ranked = [
|
|
402
|
+
evidence_item.model_copy(
|
|
403
|
+
update={
|
|
404
|
+
"rank": index,
|
|
405
|
+
"recipe_id": run.recipe.recipe_id,
|
|
406
|
+
"run_id": run.run_id,
|
|
407
|
+
}
|
|
408
|
+
)
|
|
409
|
+
for index, evidence_item in enumerate(reranked_sorted, start=1)
|
|
410
|
+
]
|
|
411
|
+
return apply_budget(ranked, budget)
|
|
412
|
+
|
|
413
|
+
|
|
168
414
|
def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
|
|
169
415
|
"""
|
|
170
416
|
Resolve the SQLite index path for a retrieval run.
|
|
@@ -213,8 +459,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
|
|
|
213
459
|
:return: None.
|
|
214
460
|
:rtype: None
|
|
215
461
|
"""
|
|
216
|
-
conn.execute(
|
|
217
|
-
"""
|
|
462
|
+
conn.execute("""
|
|
218
463
|
CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
|
|
219
464
|
content,
|
|
220
465
|
item_id UNINDEXED,
|
|
@@ -225,8 +470,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
|
|
|
225
470
|
start_offset UNINDEXED,
|
|
226
471
|
end_offset UNINDEXED
|
|
227
472
|
)
|
|
228
|
-
"""
|
|
229
|
-
)
|
|
473
|
+
""")
|
|
230
474
|
|
|
231
475
|
|
|
232
476
|
def _build_full_text_search_index(
|