biblicus 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/profiling.py +1 -1
- biblicus/backends/__init__.py +4 -0
- biblicus/backends/hybrid.py +284 -0
- biblicus/backends/sqlite_full_text_search.py +264 -18
- biblicus/backends/vector.py +460 -0
- biblicus/models.py +3 -0
- {biblicus-0.10.0.dist-info → biblicus-0.11.0.dist-info}/METADATA +7 -1
- {biblicus-0.10.0.dist-info → biblicus-0.11.0.dist-info}/RECORD +13 -11
- {biblicus-0.10.0.dist-info → biblicus-0.11.0.dist-info}/WHEEL +0 -0
- {biblicus-0.10.0.dist-info → biblicus-0.11.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.10.0.dist-info → biblicus-0.11.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.10.0.dist-info → biblicus-0.11.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/analysis/profiling.py
CHANGED
|
@@ -266,7 +266,7 @@ def _build_extracted_text_report(
|
|
|
266
266
|
empty_items += 1
|
|
267
267
|
continue
|
|
268
268
|
nonempty_items += 1
|
|
269
|
-
text_lengths.append(len(
|
|
269
|
+
text_lengths.append(len(stripped))
|
|
270
270
|
|
|
271
271
|
sampled_lengths = _apply_sample(text_lengths, config.sample_size)
|
|
272
272
|
characters_distribution = _build_distribution(sampled_lengths, config.percentiles)
|
biblicus/backends/__init__.py
CHANGED
|
@@ -7,8 +7,10 @@ from __future__ import annotations
|
|
|
7
7
|
from typing import Dict, Type
|
|
8
8
|
|
|
9
9
|
from .base import RetrievalBackend
|
|
10
|
+
from .hybrid import HybridBackend
|
|
10
11
|
from .scan import ScanBackend
|
|
11
12
|
from .sqlite_full_text_search import SqliteFullTextSearchBackend
|
|
13
|
+
from .vector import VectorBackend
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
def available_backends() -> Dict[str, Type[RetrievalBackend]]:
|
|
@@ -19,8 +21,10 @@ def available_backends() -> Dict[str, Type[RetrievalBackend]]:
|
|
|
19
21
|
:rtype: dict[str, Type[RetrievalBackend]]
|
|
20
22
|
"""
|
|
21
23
|
return {
|
|
24
|
+
HybridBackend.backend_id: HybridBackend,
|
|
22
25
|
ScanBackend.backend_id: ScanBackend,
|
|
23
26
|
SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
|
|
27
|
+
VectorBackend.backend_id: VectorBackend,
|
|
24
28
|
}
|
|
25
29
|
|
|
26
30
|
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hybrid retrieval backend combining lexical and vector results.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
|
|
13
|
+
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest
|
|
14
|
+
from ..time import utc_now_iso
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HybridRecipeConfig(BaseModel):
|
|
18
|
+
"""
|
|
19
|
+
Configuration for hybrid retrieval fusion.
|
|
20
|
+
|
|
21
|
+
:ivar lexical_backend: Backend identifier for lexical retrieval.
|
|
22
|
+
:vartype lexical_backend: str
|
|
23
|
+
:ivar embedding_backend: Backend identifier for embedding retrieval.
|
|
24
|
+
:vartype embedding_backend: str
|
|
25
|
+
:ivar lexical_weight: Weight for lexical scores.
|
|
26
|
+
:vartype lexical_weight: float
|
|
27
|
+
:ivar embedding_weight: Weight for embedding scores.
|
|
28
|
+
:vartype embedding_weight: float
|
|
29
|
+
:ivar lexical_config: Optional lexical backend configuration.
|
|
30
|
+
:vartype lexical_config: dict[str, object]
|
|
31
|
+
:ivar embedding_config: Optional embedding backend configuration.
|
|
32
|
+
:vartype embedding_config: dict[str, object]
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
model_config = ConfigDict(extra="forbid")
|
|
36
|
+
|
|
37
|
+
lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
|
|
38
|
+
embedding_backend: str = Field(default="vector", min_length=1)
|
|
39
|
+
lexical_weight: float = Field(default=0.5, ge=0, le=1)
|
|
40
|
+
embedding_weight: float = Field(default=0.5, ge=0, le=1)
|
|
41
|
+
lexical_config: Dict[str, object] = Field(default_factory=dict)
|
|
42
|
+
embedding_config: Dict[str, object] = Field(default_factory=dict)
|
|
43
|
+
|
|
44
|
+
@model_validator(mode="after")
|
|
45
|
+
def _validate_weights(self) -> "HybridRecipeConfig":
|
|
46
|
+
if abs((self.lexical_weight + self.embedding_weight) - 1.0) > 1e-6:
|
|
47
|
+
raise ValueError("weights must sum to 1")
|
|
48
|
+
return self
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class HybridBackend:
|
|
52
|
+
"""
|
|
53
|
+
Hybrid backend that fuses lexical and embedding retrieval.
|
|
54
|
+
|
|
55
|
+
:ivar backend_id: Backend identifier.
|
|
56
|
+
:vartype backend_id: str
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
backend_id = "hybrid"
|
|
60
|
+
|
|
61
|
+
def build_run(
|
|
62
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
63
|
+
) -> RetrievalRun:
|
|
64
|
+
"""
|
|
65
|
+
Build or register a hybrid retrieval run.
|
|
66
|
+
|
|
67
|
+
:param corpus: Corpus to build against.
|
|
68
|
+
:type corpus: Corpus
|
|
69
|
+
:param recipe_name: Human-readable recipe name.
|
|
70
|
+
:type recipe_name: str
|
|
71
|
+
:param config: Backend-specific configuration values.
|
|
72
|
+
:type config: dict[str, object]
|
|
73
|
+
:return: Run manifest describing the build.
|
|
74
|
+
:rtype: RetrievalRun
|
|
75
|
+
"""
|
|
76
|
+
recipe_config = HybridRecipeConfig.model_validate(config)
|
|
77
|
+
_ensure_backend_supported(recipe_config)
|
|
78
|
+
lexical_backend = _resolve_backend(recipe_config.lexical_backend)
|
|
79
|
+
embedding_backend = _resolve_backend(recipe_config.embedding_backend)
|
|
80
|
+
lexical_run = lexical_backend.build_run(
|
|
81
|
+
corpus, recipe_name=f"{recipe_name}-lexical", config=recipe_config.lexical_config
|
|
82
|
+
)
|
|
83
|
+
embedding_run = embedding_backend.build_run(
|
|
84
|
+
corpus, recipe_name=f"{recipe_name}-embedding", config=recipe_config.embedding_config
|
|
85
|
+
)
|
|
86
|
+
recipe = create_recipe_manifest(
|
|
87
|
+
backend_id=self.backend_id,
|
|
88
|
+
name=recipe_name,
|
|
89
|
+
config=recipe_config.model_dump(),
|
|
90
|
+
)
|
|
91
|
+
stats = {
|
|
92
|
+
"lexical_run_id": lexical_run.run_id,
|
|
93
|
+
"embedding_run_id": embedding_run.run_id,
|
|
94
|
+
}
|
|
95
|
+
run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
|
|
96
|
+
corpus.write_run(run)
|
|
97
|
+
return run
|
|
98
|
+
|
|
99
|
+
def query(
|
|
100
|
+
self,
|
|
101
|
+
corpus: Corpus,
|
|
102
|
+
*,
|
|
103
|
+
run: RetrievalRun,
|
|
104
|
+
query_text: str,
|
|
105
|
+
budget: QueryBudget,
|
|
106
|
+
) -> RetrievalResult:
|
|
107
|
+
"""
|
|
108
|
+
Query using both lexical and embedding backends and fuse scores.
|
|
109
|
+
|
|
110
|
+
:param corpus: Corpus associated with the run.
|
|
111
|
+
:type corpus: Corpus
|
|
112
|
+
:param run: Run manifest to use for querying.
|
|
113
|
+
:type run: RetrievalRun
|
|
114
|
+
:param query_text: Query text to execute.
|
|
115
|
+
:type query_text: str
|
|
116
|
+
:param budget: Evidence selection budget.
|
|
117
|
+
:type budget: QueryBudget
|
|
118
|
+
:return: Retrieval results containing evidence.
|
|
119
|
+
:rtype: RetrievalResult
|
|
120
|
+
"""
|
|
121
|
+
recipe_config = HybridRecipeConfig.model_validate(run.recipe.config)
|
|
122
|
+
_ensure_backend_supported(recipe_config)
|
|
123
|
+
lexical_backend = _resolve_backend(recipe_config.lexical_backend)
|
|
124
|
+
embedding_backend = _resolve_backend(recipe_config.embedding_backend)
|
|
125
|
+
lexical_run_id = run.stats.get("lexical_run_id")
|
|
126
|
+
embedding_run_id = run.stats.get("embedding_run_id")
|
|
127
|
+
if not lexical_run_id or not embedding_run_id:
|
|
128
|
+
raise ValueError("Hybrid run missing lexical or embedding run identifiers")
|
|
129
|
+
lexical_run = corpus.load_run(str(lexical_run_id))
|
|
130
|
+
embedding_run = corpus.load_run(str(embedding_run_id))
|
|
131
|
+
component_budget = _expand_component_budget(budget)
|
|
132
|
+
lexical_result = lexical_backend.query(
|
|
133
|
+
corpus, run=lexical_run, query_text=query_text, budget=component_budget
|
|
134
|
+
)
|
|
135
|
+
embedding_result = embedding_backend.query(
|
|
136
|
+
corpus, run=embedding_run, query_text=query_text, budget=component_budget
|
|
137
|
+
)
|
|
138
|
+
candidates = _fuse_evidence(
|
|
139
|
+
lexical_result.evidence,
|
|
140
|
+
embedding_result.evidence,
|
|
141
|
+
lexical_weight=recipe_config.lexical_weight,
|
|
142
|
+
embedding_weight=recipe_config.embedding_weight,
|
|
143
|
+
)
|
|
144
|
+
sorted_candidates = sorted(
|
|
145
|
+
candidates,
|
|
146
|
+
key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
|
|
147
|
+
)
|
|
148
|
+
ranked = [
|
|
149
|
+
evidence_item.model_copy(
|
|
150
|
+
update={
|
|
151
|
+
"rank": index,
|
|
152
|
+
"recipe_id": run.recipe.recipe_id,
|
|
153
|
+
"run_id": run.run_id,
|
|
154
|
+
}
|
|
155
|
+
)
|
|
156
|
+
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
157
|
+
]
|
|
158
|
+
evidence = apply_budget(ranked, budget)
|
|
159
|
+
stats = {
|
|
160
|
+
"candidates": len(sorted_candidates),
|
|
161
|
+
"returned": len(evidence),
|
|
162
|
+
"fusion_weights": {
|
|
163
|
+
"lexical": recipe_config.lexical_weight,
|
|
164
|
+
"embedding": recipe_config.embedding_weight,
|
|
165
|
+
},
|
|
166
|
+
}
|
|
167
|
+
return RetrievalResult(
|
|
168
|
+
query_text=query_text,
|
|
169
|
+
budget=budget,
|
|
170
|
+
run_id=run.run_id,
|
|
171
|
+
recipe_id=run.recipe.recipe_id,
|
|
172
|
+
backend_id=self.backend_id,
|
|
173
|
+
generated_at=utc_now_iso(),
|
|
174
|
+
evidence=evidence,
|
|
175
|
+
stats=stats,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _ensure_backend_supported(recipe_config: HybridRecipeConfig) -> None:
|
|
180
|
+
"""
|
|
181
|
+
Validate that hybrid backends do not reference the hybrid backend itself.
|
|
182
|
+
|
|
183
|
+
:param recipe_config: Parsed hybrid recipe configuration.
|
|
184
|
+
:type recipe_config: HybridRecipeConfig
|
|
185
|
+
:return: None.
|
|
186
|
+
:rtype: None
|
|
187
|
+
:raises ValueError: If hybrid is used as a component backend.
|
|
188
|
+
"""
|
|
189
|
+
if recipe_config.lexical_backend == HybridBackend.backend_id:
|
|
190
|
+
raise ValueError("Hybrid backend cannot use itself as the lexical backend")
|
|
191
|
+
if recipe_config.embedding_backend == HybridBackend.backend_id:
|
|
192
|
+
raise ValueError("Hybrid backend cannot use itself as the embedding backend")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _resolve_backend(backend_id: str):
|
|
196
|
+
"""
|
|
197
|
+
Resolve a backend by identifier.
|
|
198
|
+
|
|
199
|
+
:param backend_id: Backend identifier.
|
|
200
|
+
:type backend_id: str
|
|
201
|
+
:return: Backend instance.
|
|
202
|
+
:rtype: object
|
|
203
|
+
"""
|
|
204
|
+
from . import get_backend
|
|
205
|
+
|
|
206
|
+
return get_backend(backend_id)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
|
|
210
|
+
"""
|
|
211
|
+
Expand a final budget to collect more candidates for fusion.
|
|
212
|
+
|
|
213
|
+
:param budget: Final evidence budget.
|
|
214
|
+
:type budget: QueryBudget
|
|
215
|
+
:param multiplier: Candidate expansion multiplier.
|
|
216
|
+
:type multiplier: int
|
|
217
|
+
:return: Expanded budget for component backends.
|
|
218
|
+
:rtype: QueryBudget
|
|
219
|
+
"""
|
|
220
|
+
max_total_characters = budget.max_total_characters
|
|
221
|
+
expanded_characters = (
|
|
222
|
+
max_total_characters * multiplier if max_total_characters is not None else None
|
|
223
|
+
)
|
|
224
|
+
return QueryBudget(
|
|
225
|
+
max_total_items=budget.max_total_items * multiplier,
|
|
226
|
+
max_total_characters=expanded_characters,
|
|
227
|
+
max_items_per_source=budget.max_items_per_source,
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _fuse_evidence(
|
|
232
|
+
lexical: List[Evidence],
|
|
233
|
+
embedding: List[Evidence],
|
|
234
|
+
*,
|
|
235
|
+
lexical_weight: float,
|
|
236
|
+
embedding_weight: float,
|
|
237
|
+
) -> List[Evidence]:
|
|
238
|
+
"""
|
|
239
|
+
Fuse lexical and embedding evidence lists into hybrid candidates.
|
|
240
|
+
|
|
241
|
+
:param lexical: Lexical evidence list.
|
|
242
|
+
:type lexical: list[Evidence]
|
|
243
|
+
:param embedding: Embedding evidence list.
|
|
244
|
+
:type embedding: list[Evidence]
|
|
245
|
+
:param lexical_weight: Lexical score weight.
|
|
246
|
+
:type lexical_weight: float
|
|
247
|
+
:param embedding_weight: Embedding score weight.
|
|
248
|
+
:type embedding_weight: float
|
|
249
|
+
:return: Hybrid evidence list.
|
|
250
|
+
:rtype: list[Evidence]
|
|
251
|
+
"""
|
|
252
|
+
merged: Dict[str, Dict[str, Optional[Evidence]]] = {}
|
|
253
|
+
for evidence_item in lexical:
|
|
254
|
+
merged.setdefault(evidence_item.item_id, {})["lexical"] = evidence_item
|
|
255
|
+
for evidence_item in embedding:
|
|
256
|
+
merged.setdefault(evidence_item.item_id, {})["embedding"] = evidence_item
|
|
257
|
+
|
|
258
|
+
candidates: List[Evidence] = []
|
|
259
|
+
for item_id, sources in merged.items():
|
|
260
|
+
lexical_evidence = sources.get("lexical")
|
|
261
|
+
embedding_evidence = sources.get("embedding")
|
|
262
|
+
lexical_score = lexical_evidence.score if lexical_evidence else 0.0
|
|
263
|
+
embedding_score = embedding_evidence.score if embedding_evidence else 0.0
|
|
264
|
+
combined_score = (lexical_score * lexical_weight) + (embedding_score * embedding_weight)
|
|
265
|
+
base_evidence = lexical_evidence or embedding_evidence
|
|
266
|
+
candidates.append(
|
|
267
|
+
Evidence(
|
|
268
|
+
item_id=item_id,
|
|
269
|
+
source_uri=base_evidence.source_uri,
|
|
270
|
+
media_type=base_evidence.media_type,
|
|
271
|
+
score=combined_score,
|
|
272
|
+
rank=1,
|
|
273
|
+
text=base_evidence.text,
|
|
274
|
+
content_ref=base_evidence.content_ref,
|
|
275
|
+
span_start=base_evidence.span_start,
|
|
276
|
+
span_end=base_evidence.span_end,
|
|
277
|
+
stage="hybrid",
|
|
278
|
+
stage_scores={"lexical": lexical_score, "embedding": embedding_score},
|
|
279
|
+
recipe_id="",
|
|
280
|
+
run_id="",
|
|
281
|
+
hash=base_evidence.hash,
|
|
282
|
+
)
|
|
283
|
+
)
|
|
284
|
+
return candidates
|
|
@@ -6,9 +6,9 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import sqlite3
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Dict, Iterable, List, Optional, Tuple
|
|
9
|
+
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
10
10
|
|
|
11
|
-
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
12
12
|
|
|
13
13
|
from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
|
|
14
14
|
from ..corpus import Corpus
|
|
@@ -35,6 +35,28 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
35
35
|
:vartype chunk_overlap: int
|
|
36
36
|
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
37
37
|
:vartype snippet_characters: int
|
|
38
|
+
:ivar bm25_k1: BM25 k1 tuning parameter.
|
|
39
|
+
:vartype bm25_k1: float
|
|
40
|
+
:ivar bm25_b: BM25 b tuning parameter.
|
|
41
|
+
:vartype bm25_b: float
|
|
42
|
+
:ivar ngram_min: Minimum n-gram size for lexical tuning.
|
|
43
|
+
:vartype ngram_min: int
|
|
44
|
+
:ivar ngram_max: Maximum n-gram size for lexical tuning.
|
|
45
|
+
:vartype ngram_max: int
|
|
46
|
+
:ivar stop_words: Optional stop word policy or list.
|
|
47
|
+
:vartype stop_words: str or list[str] or None
|
|
48
|
+
:ivar field_weight_title: Relative weight for title field matches.
|
|
49
|
+
:vartype field_weight_title: float
|
|
50
|
+
:ivar field_weight_body: Relative weight for body field matches.
|
|
51
|
+
:vartype field_weight_body: float
|
|
52
|
+
:ivar field_weight_tags: Relative weight for tag field matches.
|
|
53
|
+
:vartype field_weight_tags: float
|
|
54
|
+
:ivar rerank_enabled: Whether to apply reranking to retrieved candidates.
|
|
55
|
+
:vartype rerank_enabled: bool
|
|
56
|
+
:ivar rerank_model: Reranker model identifier for metadata.
|
|
57
|
+
:vartype rerank_model: str or None
|
|
58
|
+
:ivar rerank_top_k: Number of candidates to rerank.
|
|
59
|
+
:vartype rerank_top_k: int
|
|
38
60
|
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
39
61
|
:vartype extraction_run: str or None
|
|
40
62
|
"""
|
|
@@ -44,8 +66,81 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
44
66
|
chunk_size: int = Field(default=800, ge=1)
|
|
45
67
|
chunk_overlap: int = Field(default=200, ge=0)
|
|
46
68
|
snippet_characters: int = Field(default=400, ge=1)
|
|
69
|
+
bm25_k1: float = Field(default=1.2, gt=0)
|
|
70
|
+
bm25_b: float = Field(default=0.75, ge=0, le=1)
|
|
71
|
+
ngram_min: int = Field(default=1, ge=1)
|
|
72
|
+
ngram_max: int = Field(default=1, ge=1)
|
|
73
|
+
stop_words: Optional[Union[str, List[str]]] = None
|
|
74
|
+
field_weight_title: float = Field(default=1.0, ge=0)
|
|
75
|
+
field_weight_body: float = Field(default=1.0, ge=0)
|
|
76
|
+
field_weight_tags: float = Field(default=1.0, ge=0)
|
|
77
|
+
rerank_enabled: bool = False
|
|
78
|
+
rerank_model: Optional[str] = None
|
|
79
|
+
rerank_top_k: int = Field(default=10, ge=1)
|
|
47
80
|
extraction_run: Optional[str] = None
|
|
48
81
|
|
|
82
|
+
@field_validator("stop_words")
|
|
83
|
+
@classmethod
|
|
84
|
+
def _validate_stop_words(
|
|
85
|
+
cls, value: Optional[Union[str, List[str]]]
|
|
86
|
+
) -> Optional[Union[str, List[str]]]:
|
|
87
|
+
if value is None:
|
|
88
|
+
return None
|
|
89
|
+
if isinstance(value, str):
|
|
90
|
+
if value.lower() == "english":
|
|
91
|
+
return "english"
|
|
92
|
+
raise ValueError("stop_words must be 'english' or a list of strings")
|
|
93
|
+
if not value:
|
|
94
|
+
raise ValueError("stop_words list must not be empty")
|
|
95
|
+
if any(not isinstance(token, str) or not token.strip() for token in value):
|
|
96
|
+
raise ValueError("stop_words list must contain non-empty strings")
|
|
97
|
+
return value
|
|
98
|
+
|
|
99
|
+
@model_validator(mode="after")
|
|
100
|
+
def _validate_ngram_range(self) -> "SqliteFullTextSearchRecipeConfig":
|
|
101
|
+
if self.ngram_min > self.ngram_max:
|
|
102
|
+
raise ValueError("Invalid ngram range: ngram_min must be <= ngram_max")
|
|
103
|
+
if self.rerank_enabled and not self.rerank_model:
|
|
104
|
+
raise ValueError("Rerank enabled requires rerank_model")
|
|
105
|
+
return self
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
_ENGLISH_STOP_WORDS: Set[str] = {
|
|
109
|
+
"a",
|
|
110
|
+
"an",
|
|
111
|
+
"and",
|
|
112
|
+
"are",
|
|
113
|
+
"as",
|
|
114
|
+
"at",
|
|
115
|
+
"be",
|
|
116
|
+
"but",
|
|
117
|
+
"by",
|
|
118
|
+
"for",
|
|
119
|
+
"if",
|
|
120
|
+
"in",
|
|
121
|
+
"into",
|
|
122
|
+
"is",
|
|
123
|
+
"it",
|
|
124
|
+
"no",
|
|
125
|
+
"not",
|
|
126
|
+
"of",
|
|
127
|
+
"on",
|
|
128
|
+
"or",
|
|
129
|
+
"such",
|
|
130
|
+
"that",
|
|
131
|
+
"the",
|
|
132
|
+
"their",
|
|
133
|
+
"then",
|
|
134
|
+
"there",
|
|
135
|
+
"these",
|
|
136
|
+
"they",
|
|
137
|
+
"this",
|
|
138
|
+
"to",
|
|
139
|
+
"was",
|
|
140
|
+
"will",
|
|
141
|
+
"with",
|
|
142
|
+
}
|
|
143
|
+
|
|
49
144
|
|
|
50
145
|
class SqliteFullTextSearchBackend:
|
|
51
146
|
"""
|
|
@@ -118,29 +213,39 @@ class SqliteFullTextSearchBackend:
|
|
|
118
213
|
:rtype: RetrievalResult
|
|
119
214
|
"""
|
|
120
215
|
recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
|
|
216
|
+
query_tokens = _tokenize_query(query_text)
|
|
217
|
+
stop_words = _resolve_stop_words(recipe_config.stop_words)
|
|
218
|
+
filtered_tokens = _apply_stop_words(query_tokens, stop_words)
|
|
219
|
+
if not filtered_tokens:
|
|
220
|
+
return RetrievalResult(
|
|
221
|
+
query_text=query_text,
|
|
222
|
+
budget=budget,
|
|
223
|
+
run_id=run.run_id,
|
|
224
|
+
recipe_id=run.recipe.recipe_id,
|
|
225
|
+
backend_id=self.backend_id,
|
|
226
|
+
generated_at=utc_now_iso(),
|
|
227
|
+
evidence=[],
|
|
228
|
+
stats={"candidates": 0, "returned": 0},
|
|
229
|
+
)
|
|
121
230
|
db_path = _resolve_run_db_path(corpus, run)
|
|
122
231
|
candidates = _query_full_text_search_index(
|
|
123
232
|
db_path=db_path,
|
|
124
|
-
query_text=
|
|
233
|
+
query_text=" ".join(filtered_tokens),
|
|
125
234
|
limit=_candidate_limit(budget.max_total_items),
|
|
126
235
|
snippet_characters=recipe_config.snippet_characters,
|
|
127
236
|
)
|
|
128
|
-
sorted_candidates =
|
|
129
|
-
|
|
130
|
-
|
|
237
|
+
sorted_candidates = _rank_candidates(candidates)
|
|
238
|
+
evidence = _apply_rerank_if_enabled(
|
|
239
|
+
sorted_candidates,
|
|
240
|
+
query_tokens=filtered_tokens,
|
|
241
|
+
run=run,
|
|
242
|
+
budget=budget,
|
|
243
|
+
rerank_enabled=recipe_config.rerank_enabled,
|
|
244
|
+
rerank_top_k=recipe_config.rerank_top_k,
|
|
131
245
|
)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
"rank": index,
|
|
136
|
-
"recipe_id": run.recipe.recipe_id,
|
|
137
|
-
"run_id": run.run_id,
|
|
138
|
-
}
|
|
139
|
-
)
|
|
140
|
-
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
141
|
-
]
|
|
142
|
-
evidence = apply_budget(ranked, budget)
|
|
143
|
-
stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
|
|
246
|
+
stats: Dict[str, object] = {"candidates": len(sorted_candidates), "returned": len(evidence)}
|
|
247
|
+
if recipe_config.rerank_enabled:
|
|
248
|
+
stats["reranked_candidates"] = min(len(sorted_candidates), recipe_config.rerank_top_k)
|
|
144
249
|
return RetrievalResult(
|
|
145
250
|
query_text=query_text,
|
|
146
251
|
budget=budget,
|
|
@@ -165,6 +270,147 @@ def _candidate_limit(max_total_items: int) -> int:
|
|
|
165
270
|
return max_total_items * 5
|
|
166
271
|
|
|
167
272
|
|
|
273
|
+
def _tokenize_query(query_text: str) -> List[str]:
|
|
274
|
+
"""
|
|
275
|
+
Tokenize a query string into lowercased terms.
|
|
276
|
+
|
|
277
|
+
:param query_text: Raw query text.
|
|
278
|
+
:type query_text: str
|
|
279
|
+
:return: Token list.
|
|
280
|
+
:rtype: list[str]
|
|
281
|
+
"""
|
|
282
|
+
return [token for token in query_text.lower().split() if token]
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _resolve_stop_words(value: Optional[Union[str, List[str]]]) -> Set[str]:
|
|
286
|
+
"""
|
|
287
|
+
Resolve stop words based on a configuration value.
|
|
288
|
+
|
|
289
|
+
:param value: Stop word configuration.
|
|
290
|
+
:type value: str or list[str] or None
|
|
291
|
+
:return: Stop word set.
|
|
292
|
+
:rtype: set[str]
|
|
293
|
+
"""
|
|
294
|
+
if value is None:
|
|
295
|
+
return set()
|
|
296
|
+
if isinstance(value, str):
|
|
297
|
+
return set(_ENGLISH_STOP_WORDS)
|
|
298
|
+
return {token.strip().lower() for token in value if token.strip()}
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _apply_stop_words(tokens: List[str], stop_words: Set[str]) -> List[str]:
|
|
302
|
+
"""
|
|
303
|
+
Filter query tokens by a stop word list.
|
|
304
|
+
|
|
305
|
+
:param tokens: Token list.
|
|
306
|
+
:type tokens: list[str]
|
|
307
|
+
:param stop_words: Stop word set.
|
|
308
|
+
:type stop_words: set[str]
|
|
309
|
+
:return: Filtered token list.
|
|
310
|
+
:rtype: list[str]
|
|
311
|
+
"""
|
|
312
|
+
if not stop_words:
|
|
313
|
+
return tokens
|
|
314
|
+
return [token for token in tokens if token not in stop_words]
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _rank_candidates(candidates: List[Evidence]) -> List[Evidence]:
|
|
318
|
+
"""
|
|
319
|
+
Sort evidence candidates by descending score.
|
|
320
|
+
|
|
321
|
+
:param candidates: Evidence list to sort.
|
|
322
|
+
:type candidates: list[Evidence]
|
|
323
|
+
:return: Sorted evidence list.
|
|
324
|
+
:rtype: list[Evidence]
|
|
325
|
+
"""
|
|
326
|
+
return sorted(
|
|
327
|
+
candidates, key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id)
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _rerank_score(text: str, query_tokens: List[str]) -> float:
|
|
332
|
+
"""
|
|
333
|
+
Compute a simple rerank score using token overlap.
|
|
334
|
+
|
|
335
|
+
:param text: Candidate text.
|
|
336
|
+
:type text: str
|
|
337
|
+
:param query_tokens: Query tokens.
|
|
338
|
+
:type query_tokens: list[str]
|
|
339
|
+
:return: Rerank score.
|
|
340
|
+
:rtype: float
|
|
341
|
+
"""
|
|
342
|
+
lower_text = text.lower() if text else ""
|
|
343
|
+
return float(sum(1 for token in query_tokens if token in lower_text))
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _apply_rerank_if_enabled(
|
|
347
|
+
candidates: List[Evidence],
|
|
348
|
+
*,
|
|
349
|
+
query_tokens: List[str],
|
|
350
|
+
run: RetrievalRun,
|
|
351
|
+
budget: QueryBudget,
|
|
352
|
+
rerank_enabled: bool,
|
|
353
|
+
rerank_top_k: int,
|
|
354
|
+
) -> List[Evidence]:
|
|
355
|
+
"""
|
|
356
|
+
Rerank candidates when enabled, otherwise apply the budget to ranked results.
|
|
357
|
+
|
|
358
|
+
:param candidates: Ranked candidate evidence.
|
|
359
|
+
:type candidates: list[Evidence]
|
|
360
|
+
:param query_tokens: Query tokens used for reranking.
|
|
361
|
+
:type query_tokens: list[str]
|
|
362
|
+
:param run: Retrieval run to annotate evidence with.
|
|
363
|
+
:type run: RetrievalRun
|
|
364
|
+
:param budget: Evidence selection budget.
|
|
365
|
+
:type budget: QueryBudget
|
|
366
|
+
:param rerank_enabled: Whether reranking is enabled.
|
|
367
|
+
:type rerank_enabled: bool
|
|
368
|
+
:param rerank_top_k: Maximum candidates to rerank.
|
|
369
|
+
:type rerank_top_k: int
|
|
370
|
+
:return: Evidence list respecting budget.
|
|
371
|
+
:rtype: list[Evidence]
|
|
372
|
+
"""
|
|
373
|
+
if not rerank_enabled:
|
|
374
|
+
ranked = [
|
|
375
|
+
evidence_item.model_copy(
|
|
376
|
+
update={
|
|
377
|
+
"rank": index,
|
|
378
|
+
"recipe_id": run.recipe.recipe_id,
|
|
379
|
+
"run_id": run.run_id,
|
|
380
|
+
}
|
|
381
|
+
)
|
|
382
|
+
for index, evidence_item in enumerate(candidates, start=1)
|
|
383
|
+
]
|
|
384
|
+
return apply_budget(ranked, budget)
|
|
385
|
+
|
|
386
|
+
rerank_limit = min(len(candidates), rerank_top_k)
|
|
387
|
+
rerank_candidates = candidates[:rerank_limit]
|
|
388
|
+
reranked: List[Evidence] = []
|
|
389
|
+
for evidence_item in rerank_candidates:
|
|
390
|
+
rerank_score = _rerank_score(evidence_item.text or "", query_tokens)
|
|
391
|
+
reranked.append(
|
|
392
|
+
evidence_item.model_copy(
|
|
393
|
+
update={
|
|
394
|
+
"score": rerank_score,
|
|
395
|
+
"stage": "rerank",
|
|
396
|
+
"stage_scores": {"retrieve": evidence_item.score, "rerank": rerank_score},
|
|
397
|
+
}
|
|
398
|
+
)
|
|
399
|
+
)
|
|
400
|
+
reranked_sorted = _rank_candidates(reranked)
|
|
401
|
+
ranked = [
|
|
402
|
+
evidence_item.model_copy(
|
|
403
|
+
update={
|
|
404
|
+
"rank": index,
|
|
405
|
+
"recipe_id": run.recipe.recipe_id,
|
|
406
|
+
"run_id": run.run_id,
|
|
407
|
+
}
|
|
408
|
+
)
|
|
409
|
+
for index, evidence_item in enumerate(reranked_sorted, start=1)
|
|
410
|
+
]
|
|
411
|
+
return apply_budget(ranked, budget)
|
|
412
|
+
|
|
413
|
+
|
|
168
414
|
def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
|
|
169
415
|
"""
|
|
170
416
|
Resolve the SQLite index path for a retrieval run.
|
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic term-frequency vector retrieval backend.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
import re
|
|
9
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
12
|
+
|
|
13
|
+
from ..corpus import Corpus
|
|
14
|
+
from ..frontmatter import parse_front_matter
|
|
15
|
+
from ..models import (
|
|
16
|
+
Evidence,
|
|
17
|
+
ExtractionRunReference,
|
|
18
|
+
QueryBudget,
|
|
19
|
+
RetrievalResult,
|
|
20
|
+
RetrievalRun,
|
|
21
|
+
parse_extraction_run_reference,
|
|
22
|
+
)
|
|
23
|
+
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
24
|
+
from ..time import utc_now_iso
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class VectorRecipeConfig(BaseModel):
|
|
28
|
+
"""
|
|
29
|
+
Configuration for the vector retrieval backend.
|
|
30
|
+
|
|
31
|
+
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
32
|
+
:vartype snippet_characters: int
|
|
33
|
+
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
34
|
+
:vartype extraction_run: str or None
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
model_config = ConfigDict(extra="forbid")
|
|
38
|
+
|
|
39
|
+
snippet_characters: int = Field(default=400, ge=1)
|
|
40
|
+
extraction_run: Optional[str] = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class VectorBackend:
|
|
44
|
+
"""
|
|
45
|
+
Deterministic vector backend using term-frequency cosine similarity.
|
|
46
|
+
|
|
47
|
+
:ivar backend_id: Backend identifier.
|
|
48
|
+
:vartype backend_id: str
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
backend_id = "vector"
|
|
52
|
+
|
|
53
|
+
def build_run(
|
|
54
|
+
self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
|
|
55
|
+
) -> RetrievalRun:
|
|
56
|
+
"""
|
|
57
|
+
Register a vector backend run (no materialization).
|
|
58
|
+
|
|
59
|
+
:param corpus: Corpus to build against.
|
|
60
|
+
:type corpus: Corpus
|
|
61
|
+
:param recipe_name: Human-readable recipe name.
|
|
62
|
+
:type recipe_name: str
|
|
63
|
+
:param config: Backend-specific configuration values.
|
|
64
|
+
:type config: dict[str, object]
|
|
65
|
+
:return: Run manifest describing the build.
|
|
66
|
+
:rtype: RetrievalRun
|
|
67
|
+
"""
|
|
68
|
+
recipe_config = VectorRecipeConfig.model_validate(config)
|
|
69
|
+
catalog = corpus.load_catalog()
|
|
70
|
+
recipe = create_recipe_manifest(
|
|
71
|
+
backend_id=self.backend_id,
|
|
72
|
+
name=recipe_name,
|
|
73
|
+
config=recipe_config.model_dump(),
|
|
74
|
+
)
|
|
75
|
+
stats = {
|
|
76
|
+
"items": len(catalog.items),
|
|
77
|
+
"text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
|
|
78
|
+
}
|
|
79
|
+
run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
|
|
80
|
+
corpus.write_run(run)
|
|
81
|
+
return run
|
|
82
|
+
|
|
83
|
+
def query(
|
|
84
|
+
self,
|
|
85
|
+
corpus: Corpus,
|
|
86
|
+
*,
|
|
87
|
+
run: RetrievalRun,
|
|
88
|
+
query_text: str,
|
|
89
|
+
budget: QueryBudget,
|
|
90
|
+
) -> RetrievalResult:
|
|
91
|
+
"""
|
|
92
|
+
Query the corpus using term-frequency cosine similarity.
|
|
93
|
+
|
|
94
|
+
:param corpus: Corpus associated with the run.
|
|
95
|
+
:type corpus: Corpus
|
|
96
|
+
:param run: Run manifest to use for querying.
|
|
97
|
+
:type run: RetrievalRun
|
|
98
|
+
:param query_text: Query text to execute.
|
|
99
|
+
:type query_text: str
|
|
100
|
+
:param budget: Evidence selection budget.
|
|
101
|
+
:type budget: QueryBudget
|
|
102
|
+
:return: Retrieval results containing evidence.
|
|
103
|
+
:rtype: RetrievalResult
|
|
104
|
+
"""
|
|
105
|
+
recipe_config = VectorRecipeConfig.model_validate(run.recipe.config)
|
|
106
|
+
query_tokens = _tokenize_text(query_text)
|
|
107
|
+
if not query_tokens:
|
|
108
|
+
return RetrievalResult(
|
|
109
|
+
query_text=query_text,
|
|
110
|
+
budget=budget,
|
|
111
|
+
run_id=run.run_id,
|
|
112
|
+
recipe_id=run.recipe.recipe_id,
|
|
113
|
+
backend_id=self.backend_id,
|
|
114
|
+
generated_at=utc_now_iso(),
|
|
115
|
+
evidence=[],
|
|
116
|
+
stats={"candidates": 0, "returned": 0},
|
|
117
|
+
)
|
|
118
|
+
query_vector = _term_frequencies(query_tokens)
|
|
119
|
+
query_norm = _vector_norm(query_vector)
|
|
120
|
+
catalog = corpus.load_catalog()
|
|
121
|
+
extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
|
|
122
|
+
scored_candidates = _score_items(
|
|
123
|
+
corpus,
|
|
124
|
+
catalog.items.values(),
|
|
125
|
+
query_tokens=query_tokens,
|
|
126
|
+
query_vector=query_vector,
|
|
127
|
+
query_norm=query_norm,
|
|
128
|
+
snippet_characters=recipe_config.snippet_characters,
|
|
129
|
+
extraction_reference=extraction_reference,
|
|
130
|
+
)
|
|
131
|
+
sorted_candidates = sorted(
|
|
132
|
+
scored_candidates,
|
|
133
|
+
key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
|
|
134
|
+
)
|
|
135
|
+
ranked = [
|
|
136
|
+
evidence_item.model_copy(
|
|
137
|
+
update={
|
|
138
|
+
"rank": index,
|
|
139
|
+
"recipe_id": run.recipe.recipe_id,
|
|
140
|
+
"run_id": run.run_id,
|
|
141
|
+
}
|
|
142
|
+
)
|
|
143
|
+
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
144
|
+
]
|
|
145
|
+
evidence = apply_budget(ranked, budget)
|
|
146
|
+
stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
|
|
147
|
+
return RetrievalResult(
|
|
148
|
+
query_text=query_text,
|
|
149
|
+
budget=budget,
|
|
150
|
+
run_id=run.run_id,
|
|
151
|
+
recipe_id=run.recipe.recipe_id,
|
|
152
|
+
backend_id=self.backend_id,
|
|
153
|
+
generated_at=utc_now_iso(),
|
|
154
|
+
evidence=evidence,
|
|
155
|
+
stats=stats,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _resolve_extraction_reference(
|
|
160
|
+
corpus: Corpus, recipe_config: VectorRecipeConfig
|
|
161
|
+
) -> Optional[ExtractionRunReference]:
|
|
162
|
+
"""
|
|
163
|
+
Resolve an extraction run reference from a recipe config.
|
|
164
|
+
|
|
165
|
+
:param corpus: Corpus associated with the recipe.
|
|
166
|
+
:type corpus: Corpus
|
|
167
|
+
:param recipe_config: Parsed vector recipe configuration.
|
|
168
|
+
:type recipe_config: VectorRecipeConfig
|
|
169
|
+
:return: Parsed extraction reference or None.
|
|
170
|
+
:rtype: ExtractionRunReference or None
|
|
171
|
+
:raises FileNotFoundError: If an extraction run is referenced but not present.
|
|
172
|
+
"""
|
|
173
|
+
if not recipe_config.extraction_run:
|
|
174
|
+
return None
|
|
175
|
+
extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
|
|
176
|
+
run_dir = corpus.extraction_run_dir(
|
|
177
|
+
extractor_id=extraction_reference.extractor_id,
|
|
178
|
+
run_id=extraction_reference.run_id,
|
|
179
|
+
)
|
|
180
|
+
if not run_dir.is_dir():
|
|
181
|
+
raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
|
|
182
|
+
return extraction_reference
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _count_text_items(
    corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
) -> int:
    """
    Count catalog items that represent text content.

    An item counts as text when the configured extraction run holds a
    non-empty extracted payload for it, or when its media type is a
    ``text/*`` type.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to inspect.
    :type items: Iterable[object]
    :param recipe_config: Parsed vector recipe configuration.
    :type recipe_config: VectorRecipeConfig
    :return: Number of text items.
    :rtype: int
    :raises FileNotFoundError: If the recipe references a missing extraction run.
    """
    text_item_count = 0
    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
    for catalog_item in items:
        item_id = str(getattr(catalog_item, "id", ""))
        if extraction_reference and item_id:
            extracted_text = corpus.read_extracted_text(
                extractor_id=extraction_reference.extractor_id,
                run_id=extraction_reference.run_id,
                item_id=item_id,
            )
            if isinstance(extracted_text, str) and extracted_text.strip():
                text_item_count += 1
                continue
        # "text/markdown" already starts with "text/", so a single prefix
        # check covers both the markdown-specific and general text cases.
        media_type = str(getattr(catalog_item, "media_type", ""))
        if media_type.startswith("text/"):
            text_item_count += 1
    return text_item_count
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _tokenize_text(text: str) -> List[str]:
|
|
220
|
+
"""
|
|
221
|
+
Tokenize text into lowercase word tokens.
|
|
222
|
+
|
|
223
|
+
:param text: Input text.
|
|
224
|
+
:type text: str
|
|
225
|
+
:return: Token list.
|
|
226
|
+
:rtype: list[str]
|
|
227
|
+
"""
|
|
228
|
+
return re.findall(r"[a-z0-9]+", text.lower())
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _term_frequencies(tokens: List[str]) -> Dict[str, float]:
|
|
232
|
+
"""
|
|
233
|
+
Build term frequency weights from tokens.
|
|
234
|
+
|
|
235
|
+
:param tokens: Token list.
|
|
236
|
+
:type tokens: list[str]
|
|
237
|
+
:return: Term frequency mapping.
|
|
238
|
+
:rtype: dict[str, float]
|
|
239
|
+
"""
|
|
240
|
+
frequencies: Dict[str, float] = {}
|
|
241
|
+
for token in tokens:
|
|
242
|
+
frequencies[token] = frequencies.get(token, 0.0) + 1.0
|
|
243
|
+
return frequencies
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _vector_norm(vector: Dict[str, float]) -> float:
|
|
247
|
+
"""
|
|
248
|
+
Compute the Euclidean norm of a term-frequency vector.
|
|
249
|
+
|
|
250
|
+
:param vector: Term frequency mapping.
|
|
251
|
+
:type vector: dict[str, float]
|
|
252
|
+
:return: Vector norm.
|
|
253
|
+
:rtype: float
|
|
254
|
+
"""
|
|
255
|
+
return math.sqrt(sum(value * value for value in vector.values()))
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _cosine_similarity(
|
|
259
|
+
left: Dict[str, float],
|
|
260
|
+
*,
|
|
261
|
+
left_norm: float,
|
|
262
|
+
right: Dict[str, float],
|
|
263
|
+
right_norm: float,
|
|
264
|
+
) -> float:
|
|
265
|
+
"""
|
|
266
|
+
Compute cosine similarity between two term-frequency vectors.
|
|
267
|
+
|
|
268
|
+
:param left: Left term-frequency vector.
|
|
269
|
+
:type left: dict[str, float]
|
|
270
|
+
:param left_norm: Precomputed left vector norm.
|
|
271
|
+
:type left_norm: float
|
|
272
|
+
:param right: Right term-frequency vector.
|
|
273
|
+
:type right: dict[str, float]
|
|
274
|
+
:param right_norm: Precomputed right vector norm.
|
|
275
|
+
:type right_norm: float
|
|
276
|
+
:return: Cosine similarity score.
|
|
277
|
+
:rtype: float
|
|
278
|
+
"""
|
|
279
|
+
dot = 0.0
|
|
280
|
+
if len(left) < len(right):
|
|
281
|
+
for token, value in left.items():
|
|
282
|
+
dot += value * right.get(token, 0.0)
|
|
283
|
+
else:
|
|
284
|
+
for token, value in right.items():
|
|
285
|
+
dot += value * left.get(token, 0.0)
|
|
286
|
+
return dot / (left_norm * right_norm)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _load_text_from_item(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    """
    Load a text payload from a catalog item.

    A non-empty extracted payload from the configured extraction run takes
    precedence; otherwise the stored content is decoded for text media types.

    :param corpus: Corpus containing the item.
    :type corpus: Corpus
    :param item_id: Item identifier.
    :type item_id: str
    :param relpath: Relative path to the stored content.
    :type relpath: str
    :param media_type: Media type for the stored content.
    :type media_type: str
    :param extraction_reference: Optional extraction run reference.
    :type extraction_reference: ExtractionRunReference or None
    :return: Text payload or None if not decodable as text.
    :rtype: str or None
    """
    if extraction_reference:
        extracted_text = corpus.read_extracted_text(
            extractor_id=extraction_reference.extractor_id,
            run_id=extraction_reference.run_id,
            item_id=item_id,
        )
        if isinstance(extracted_text, str) and extracted_text.strip():
            return extracted_text

    payload = (corpus.root / relpath).read_bytes()
    if media_type == "text/markdown":
        # Markdown content is indexed without its front matter metadata.
        return parse_front_matter(payload.decode("utf-8")).body
    if media_type.startswith("text/"):
        return payload.decode("utf-8")
    return None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
|
|
334
|
+
"""
|
|
335
|
+
Locate the earliest token match span in a text payload.
|
|
336
|
+
|
|
337
|
+
:param text: Text to scan.
|
|
338
|
+
:type text: str
|
|
339
|
+
:param tokens: Query tokens.
|
|
340
|
+
:type tokens: list[str]
|
|
341
|
+
:return: Start/end span for the earliest match, or None if no matches.
|
|
342
|
+
:rtype: tuple[int, int] or None
|
|
343
|
+
"""
|
|
344
|
+
lower_text = text.lower()
|
|
345
|
+
best_start: Optional[int] = None
|
|
346
|
+
best_end: Optional[int] = None
|
|
347
|
+
for token in tokens:
|
|
348
|
+
if not token:
|
|
349
|
+
continue
|
|
350
|
+
token_start = lower_text.find(token)
|
|
351
|
+
if token_start == -1:
|
|
352
|
+
continue
|
|
353
|
+
token_end = token_start + len(token)
|
|
354
|
+
if best_start is None or token_start < best_start:
|
|
355
|
+
best_start = token_start
|
|
356
|
+
best_end = token_end
|
|
357
|
+
if best_start is None or best_end is None:
|
|
358
|
+
return None
|
|
359
|
+
return best_start, best_end
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
|
|
363
|
+
"""
|
|
364
|
+
Build a snippet around a match span, constrained by a character budget.
|
|
365
|
+
|
|
366
|
+
:param text: Source text to slice.
|
|
367
|
+
:type text: str
|
|
368
|
+
:param span: Match span to center on.
|
|
369
|
+
:type span: tuple[int, int] or None
|
|
370
|
+
:param max_chars: Maximum snippet length.
|
|
371
|
+
:type max_chars: int
|
|
372
|
+
:return: Snippet text.
|
|
373
|
+
:rtype: str
|
|
374
|
+
"""
|
|
375
|
+
if not text:
|
|
376
|
+
return ""
|
|
377
|
+
if span is None:
|
|
378
|
+
return text[:max_chars]
|
|
379
|
+
span_start, span_end = span
|
|
380
|
+
half_window = max_chars // 2
|
|
381
|
+
snippet_start = max(span_start - half_window, 0)
|
|
382
|
+
snippet_end = min(span_end + half_window, len(text))
|
|
383
|
+
return text[snippet_start:snippet_end]
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _score_items(
    corpus: Corpus,
    items: Iterable[object],
    *,
    query_tokens: List[str],
    query_vector: Dict[str, float],
    query_norm: float,
    snippet_characters: int,
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Score catalog items and return evidence candidates.

    :param corpus: Corpus containing the items.
    :type corpus: Corpus
    :param items: Catalog items to score.
    :type items: Iterable[object]
    :param query_tokens: Tokenized query text.
    :type query_tokens: list[str]
    :param query_vector: Query term-frequency vector.
    :type query_vector: dict[str, float]
    :param query_norm: Query vector norm.
    :type query_norm: float
    :param snippet_characters: Snippet length budget.
    :type snippet_characters: int
    :param extraction_reference: Optional extraction run reference.
    :type extraction_reference: ExtractionRunReference or None
    :return: Evidence candidates with provisional ranks.
    :rtype: list[Evidence]
    """
    evidence_items: List[Evidence] = []
    for catalog_item in items:
        media_type = str(getattr(catalog_item, "media_type", ""))
        relpath = getattr(catalog_item, "relpath", "")
        item_id = str(getattr(catalog_item, "id", ""))
        item_text = _load_text_from_item(
            corpus,
            item_id=item_id,
            relpath=relpath,
            media_type=media_type,
            extraction_reference=extraction_reference,
        )
        if item_text is None:
            continue
        tokens = _tokenize_text(item_text)
        if not tokens:
            continue
        vector = _term_frequencies(tokens)
        similarity = _cosine_similarity(
            query_vector, left_norm=query_norm, right=vector, right_norm=_vector_norm(vector)
        )
        if similarity <= 0:
            # Items sharing no terms with the query carry no evidence.
            continue
        span = _find_first_match(item_text, query_tokens)
        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
        span_start = span[0] if span else None
        span_end = span[1] if span else None
        evidence_items.append(
            Evidence(
                # Reuse the identifier resolved above; the previous code
                # re-read the attribute without a default and raised on
                # id-less items.
                item_id=item_id,
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=media_type,
                score=float(similarity),
                # Provisional rank; the caller reassigns final ranks after
                # sorting all candidates.
                rank=1,
                text=snippet,
                content_ref=None,
                span_start=span_start,
                span_end=span_end,
                stage="vector",
                # Recipe/run identifiers are filled in by the caller.
                recipe_id="",
                run_id="",
                hash=hash_text(snippet),
            )
        )
    return evidence_items
|
biblicus/models.py
CHANGED
|
@@ -263,6 +263,8 @@ class Evidence(BaseModel):
|
|
|
263
263
|
:vartype span_end: int or None
|
|
264
264
|
:ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
|
|
265
265
|
:vartype stage: str
|
|
266
|
+
:ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
|
|
267
|
+
:vartype stage_scores: dict[str, float] or None
|
|
266
268
|
:ivar recipe_id: Recipe identifier used to create the run.
|
|
267
269
|
:vartype recipe_id: str
|
|
268
270
|
:ivar run_id: Retrieval run identifier.
|
|
@@ -283,6 +285,7 @@ class Evidence(BaseModel):
|
|
|
283
285
|
span_start: Optional[int] = None
|
|
284
286
|
span_end: Optional[int] = None
|
|
285
287
|
stage: str
|
|
288
|
+
stage_scores: Optional[Dict[str, float]] = None
|
|
286
289
|
recipe_id: str
|
|
287
290
|
run_id: str
|
|
288
291
|
hash: Optional[str] = None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -493,6 +493,12 @@ Two backends are included.
|
|
|
493
493
|
|
|
494
494
|
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
495
495
|
|
|
496
|
+
## Retrieval documentation
|
|
497
|
+
|
|
498
|
+
For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
|
|
499
|
+
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
|
|
500
|
+
and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
|
|
501
|
+
|
|
496
502
|
## Extraction backends
|
|
497
503
|
|
|
498
504
|
These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
biblicus/__init__.py,sha256=
|
|
1
|
+
biblicus/__init__.py,sha256=sT0PFc3DRGFRcN7Zx4Yooc8OzmLvaj1-ZjbvFHce8lU,496
|
|
2
2
|
biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
|
|
3
3
|
biblicus/cli.py,sha256=aH3plnednnYgcPnSoYQf200nboKc6N-tuc3FuLPQEcU,35132
|
|
4
4
|
biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
|
|
@@ -16,7 +16,7 @@ biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
|
|
|
16
16
|
biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
|
|
17
17
|
biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
|
|
18
18
|
biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
|
|
19
|
-
biblicus/models.py,sha256=
|
|
19
|
+
biblicus/models.py,sha256=r28O6cg3d1bjJnKqpLieVLTgtXTfzb_60wMORvVuDN0,15846
|
|
20
20
|
biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
|
|
21
21
|
biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
|
|
22
22
|
biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
|
|
@@ -30,13 +30,15 @@ biblicus/analysis/__init__.py,sha256=Z4Wb4d-EoUuGHkcfRm9ILuZ8vr9FBqRxC0u1i6Fp_0w
|
|
|
30
30
|
biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
|
|
31
31
|
biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
|
|
32
32
|
biblicus/analysis/models.py,sha256=LuR52w27JRzV-Mr-WAOduZrBOCTrp5uYkMc46QHTRrI,27300
|
|
33
|
-
biblicus/analysis/profiling.py,sha256=
|
|
33
|
+
biblicus/analysis/profiling.py,sha256=v2B4Tn9WiXRRP_wIADBPRQVKkMc92KXCas7OBa7n0LU,10670
|
|
34
34
|
biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
|
|
35
35
|
biblicus/analysis/topic_modeling.py,sha256=ZGXvm2MyU6plxz2FE1RQU-3bra6QZ-t8EJj8kG1TW0M,19438
|
|
36
|
-
biblicus/backends/__init__.py,sha256=
|
|
36
|
+
biblicus/backends/__init__.py,sha256=3HJY0oMm8pFFVGC4Z-dlPRHhIPVDdUzsa4IMjKP_9dI,1378
|
|
37
37
|
biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
|
|
38
|
+
biblicus/backends/hybrid.py,sha256=CXh6QrlE0RsTJjSlZRdtomLlILfkglBDQG3YVa8RpFU,10589
|
|
38
39
|
biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
|
|
39
|
-
biblicus/backends/sqlite_full_text_search.py,sha256=
|
|
40
|
+
biblicus/backends/sqlite_full_text_search.py,sha256=VAn4fDdfiaS1Rn6zHlYz3E10_3vMU9P94QU8cL0l8Mk,24466
|
|
41
|
+
biblicus/backends/vector.py,sha256=3RdxSBPb1kOX4Sfd4d1qXFW9ecuiRvGpOHadLCbeh1g,15183
|
|
40
42
|
biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
|
|
41
43
|
biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
|
|
42
44
|
biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
|
|
@@ -55,9 +57,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
|
|
|
55
57
|
biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
|
|
56
58
|
biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
|
|
57
59
|
biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
|
|
58
|
-
biblicus-0.
|
|
59
|
-
biblicus-0.
|
|
60
|
-
biblicus-0.
|
|
61
|
-
biblicus-0.
|
|
62
|
-
biblicus-0.
|
|
63
|
-
biblicus-0.
|
|
60
|
+
biblicus-0.11.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
|
|
61
|
+
biblicus-0.11.0.dist-info/METADATA,sha256=zrJESYGfGLu7Iq1I--GPIkEY9gXDb9szBIuenlWor7I,27765
|
|
62
|
+
biblicus-0.11.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
63
|
+
biblicus-0.11.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
|
|
64
|
+
biblicus-0.11.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
|
|
65
|
+
biblicus-0.11.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|