biblicus-0.10.0-py3-none-any.whl → biblicus-0.11.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
     "RetrievalRun",
 ]

-__version__ = "0.10.0"
+__version__ = "0.11.0"
biblicus/analysis/profiling.py CHANGED
@@ -266,7 +266,7 @@ def _build_extracted_text_report(
             empty_items += 1
             continue
         nonempty_items += 1
-        text_lengths.append(len(text_value))
+        text_lengths.append(len(stripped))

     sampled_lengths = _apply_sample(text_lengths, config.sample_size)
     characters_distribution = _build_distribution(sampled_lengths, config.percentiles)
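A note on the one-line fix above: the report previously recorded the length of the raw extracted text, whitespace included, and now records the length of the stripped text that appears to drive the empty/nonempty check. A tiny illustration with invented values:

    text_value = "  psalm  "        # raw extracted text, padded with whitespace
    stripped = text_value.strip()   # text actually checked for emptiness
    print(len(text_value), len(stripped))  # 9 5 -- padding no longer inflates the length distribution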
biblicus/backends/__init__.py CHANGED
@@ -7,8 +7,10 @@ from __future__ import annotations
 from typing import Dict, Type

 from .base import RetrievalBackend
+from .hybrid import HybridBackend
 from .scan import ScanBackend
 from .sqlite_full_text_search import SqliteFullTextSearchBackend
+from .vector import VectorBackend


 def available_backends() -> Dict[str, Type[RetrievalBackend]]:
@@ -19,8 +21,10 @@ def available_backends() -> Dict[str, Type[RetrievalBackend]]:
     :rtype: dict[str, Type[RetrievalBackend]]
     """
     return {
+        HybridBackend.backend_id: HybridBackend,
         ScanBackend.backend_id: ScanBackend,
         SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
+        VectorBackend.backend_id: VectorBackend,
     }

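For orientation, a minimal usage sketch of the expanded registry. `available_backends()` is shown above; `get_backend()` is assumed to live in this same module, since the new hybrid backend below imports it from `.` to resolve its component backends.

    from biblicus.backends import available_backends, get_backend

    # The registry now includes the new "hybrid" and "vector" identifiers
    # alongside the existing scan and sqlite-full-text-search backends.
    for backend_id in sorted(available_backends()):
        print(backend_id)

    backend = get_backend("vector")  # per hybrid.py, returns a backend instance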
biblicus/backends/hybrid.py ADDED
@@ -0,0 +1,284 @@
+"""
+Hybrid retrieval backend combining lexical and vector results.
+"""
+
+from __future__ import annotations
+
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+from ..corpus import Corpus
+from ..models import Evidence, QueryBudget, RetrievalResult, RetrievalRun
+from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest
+from ..time import utc_now_iso
+
+
+class HybridRecipeConfig(BaseModel):
+    """
+    Configuration for hybrid retrieval fusion.
+
+    :ivar lexical_backend: Backend identifier for lexical retrieval.
+    :vartype lexical_backend: str
+    :ivar embedding_backend: Backend identifier for embedding retrieval.
+    :vartype embedding_backend: str
+    :ivar lexical_weight: Weight for lexical scores.
+    :vartype lexical_weight: float
+    :ivar embedding_weight: Weight for embedding scores.
+    :vartype embedding_weight: float
+    :ivar lexical_config: Optional lexical backend configuration.
+    :vartype lexical_config: dict[str, object]
+    :ivar embedding_config: Optional embedding backend configuration.
+    :vartype embedding_config: dict[str, object]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
+    embedding_backend: str = Field(default="vector", min_length=1)
+    lexical_weight: float = Field(default=0.5, ge=0, le=1)
+    embedding_weight: float = Field(default=0.5, ge=0, le=1)
+    lexical_config: Dict[str, object] = Field(default_factory=dict)
+    embedding_config: Dict[str, object] = Field(default_factory=dict)
+
+    @model_validator(mode="after")
+    def _validate_weights(self) -> "HybridRecipeConfig":
+        if abs((self.lexical_weight + self.embedding_weight) - 1.0) > 1e-6:
+            raise ValueError("weights must sum to 1")
+        return self
+
+
+class HybridBackend:
+    """
+    Hybrid backend that fuses lexical and embedding retrieval.
+
+    :ivar backend_id: Backend identifier.
+    :vartype backend_id: str
+    """
+
+    backend_id = "hybrid"
+
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
+        """
+        Build or register a hybrid retrieval run.
+
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Backend-specific configuration values.
+        :type config: dict[str, object]
+        :return: Run manifest describing the build.
+        :rtype: RetrievalRun
+        """
+        recipe_config = HybridRecipeConfig.model_validate(config)
+        _ensure_backend_supported(recipe_config)
+        lexical_backend = _resolve_backend(recipe_config.lexical_backend)
+        embedding_backend = _resolve_backend(recipe_config.embedding_backend)
+        lexical_run = lexical_backend.build_run(
+            corpus, recipe_name=f"{recipe_name}-lexical", config=recipe_config.lexical_config
+        )
+        embedding_run = embedding_backend.build_run(
+            corpus, recipe_name=f"{recipe_name}-embedding", config=recipe_config.embedding_config
+        )
+        recipe = create_recipe_manifest(
+            backend_id=self.backend_id,
+            name=recipe_name,
+            config=recipe_config.model_dump(),
+        )
+        stats = {
+            "lexical_run_id": lexical_run.run_id,
+            "embedding_run_id": embedding_run.run_id,
+        }
+        run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
+        corpus.write_run(run)
+        return run
+
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        run: RetrievalRun,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Query using both lexical and embedding backends and fuse scores.
+
+        :param corpus: Corpus associated with the run.
+        :type corpus: Corpus
+        :param run: Run manifest to use for querying.
+        :type run: RetrievalRun
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: RetrievalResult
+        """
+        recipe_config = HybridRecipeConfig.model_validate(run.recipe.config)
+        _ensure_backend_supported(recipe_config)
+        lexical_backend = _resolve_backend(recipe_config.lexical_backend)
+        embedding_backend = _resolve_backend(recipe_config.embedding_backend)
+        lexical_run_id = run.stats.get("lexical_run_id")
+        embedding_run_id = run.stats.get("embedding_run_id")
+        if not lexical_run_id or not embedding_run_id:
+            raise ValueError("Hybrid run missing lexical or embedding run identifiers")
+        lexical_run = corpus.load_run(str(lexical_run_id))
+        embedding_run = corpus.load_run(str(embedding_run_id))
+        component_budget = _expand_component_budget(budget)
+        lexical_result = lexical_backend.query(
+            corpus, run=lexical_run, query_text=query_text, budget=component_budget
+        )
+        embedding_result = embedding_backend.query(
+            corpus, run=embedding_run, query_text=query_text, budget=component_budget
+        )
+        candidates = _fuse_evidence(
+            lexical_result.evidence,
+            embedding_result.evidence,
+            lexical_weight=recipe_config.lexical_weight,
+            embedding_weight=recipe_config.embedding_weight,
+        )
+        sorted_candidates = sorted(
+            candidates,
+            key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
+        )
+        ranked = [
+            evidence_item.model_copy(
+                update={
+                    "rank": index,
+                    "recipe_id": run.recipe.recipe_id,
+                    "run_id": run.run_id,
+                }
+            )
+            for index, evidence_item in enumerate(sorted_candidates, start=1)
+        ]
+        evidence = apply_budget(ranked, budget)
+        stats = {
+            "candidates": len(sorted_candidates),
+            "returned": len(evidence),
+            "fusion_weights": {
+                "lexical": recipe_config.lexical_weight,
+                "embedding": recipe_config.embedding_weight,
+            },
+        }
+        return RetrievalResult(
+            query_text=query_text,
+            budget=budget,
+            run_id=run.run_id,
+            recipe_id=run.recipe.recipe_id,
+            backend_id=self.backend_id,
+            generated_at=utc_now_iso(),
+            evidence=evidence,
+            stats=stats,
+        )
+
+
+def _ensure_backend_supported(recipe_config: HybridRecipeConfig) -> None:
+    """
+    Validate that hybrid backends do not reference the hybrid backend itself.
+
+    :param recipe_config: Parsed hybrid recipe configuration.
+    :type recipe_config: HybridRecipeConfig
+    :return: None.
+    :rtype: None
+    :raises ValueError: If hybrid is used as a component backend.
+    """
+    if recipe_config.lexical_backend == HybridBackend.backend_id:
+        raise ValueError("Hybrid backend cannot use itself as the lexical backend")
+    if recipe_config.embedding_backend == HybridBackend.backend_id:
+        raise ValueError("Hybrid backend cannot use itself as the embedding backend")
+
+
+def _resolve_backend(backend_id: str):
+    """
+    Resolve a backend by identifier.
+
+    :param backend_id: Backend identifier.
+    :type backend_id: str
+    :return: Backend instance.
+    :rtype: object
+    """
+    from . import get_backend
+
+    return get_backend(backend_id)
+
+
+def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> QueryBudget:
+    """
+    Expand a final budget to collect more candidates for fusion.
+
+    :param budget: Final evidence budget.
+    :type budget: QueryBudget
+    :param multiplier: Candidate expansion multiplier.
+    :type multiplier: int
+    :return: Expanded budget for component backends.
+    :rtype: QueryBudget
+    """
+    max_total_characters = budget.max_total_characters
+    expanded_characters = (
+        max_total_characters * multiplier if max_total_characters is not None else None
+    )
+    return QueryBudget(
+        max_total_items=budget.max_total_items * multiplier,
+        max_total_characters=expanded_characters,
+        max_items_per_source=budget.max_items_per_source,
+    )
+
+
+def _fuse_evidence(
+    lexical: List[Evidence],
+    embedding: List[Evidence],
+    *,
+    lexical_weight: float,
+    embedding_weight: float,
+) -> List[Evidence]:
+    """
+    Fuse lexical and embedding evidence lists into hybrid candidates.
+
+    :param lexical: Lexical evidence list.
+    :type lexical: list[Evidence]
+    :param embedding: Embedding evidence list.
+    :type embedding: list[Evidence]
+    :param lexical_weight: Lexical score weight.
+    :type lexical_weight: float
+    :param embedding_weight: Embedding score weight.
+    :type embedding_weight: float
+    :return: Hybrid evidence list.
+    :rtype: list[Evidence]
+    """
+    merged: Dict[str, Dict[str, Optional[Evidence]]] = {}
+    for evidence_item in lexical:
+        merged.setdefault(evidence_item.item_id, {})["lexical"] = evidence_item
+    for evidence_item in embedding:
+        merged.setdefault(evidence_item.item_id, {})["embedding"] = evidence_item
+
+    candidates: List[Evidence] = []
+    for item_id, sources in merged.items():
+        lexical_evidence = sources.get("lexical")
+        embedding_evidence = sources.get("embedding")
+        lexical_score = lexical_evidence.score if lexical_evidence else 0.0
+        embedding_score = embedding_evidence.score if embedding_evidence else 0.0
+        combined_score = (lexical_score * lexical_weight) + (embedding_score * embedding_weight)
+        base_evidence = lexical_evidence or embedding_evidence
+        candidates.append(
+            Evidence(
+                item_id=item_id,
+                source_uri=base_evidence.source_uri,
+                media_type=base_evidence.media_type,
+                score=combined_score,
+                rank=1,
+                text=base_evidence.text,
+                content_ref=base_evidence.content_ref,
+                span_start=base_evidence.span_start,
+                span_end=base_evidence.span_end,
+                stage="hybrid",
+                stage_scores={"lexical": lexical_score, "embedding": embedding_score},
+                recipe_id="",
+                run_id="",
+                hash=base_evidence.hash,
+            )
+        )
+    return candidates
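To make the fusion in `_fuse_evidence` concrete, a small worked sketch with invented scores. With any weight pair summing to 1 (the validator enforces this), an item returned by both backends receives a weighted blend, while an item seen by only one backend is backfilled with 0.0 on the missing side:

    lexical_scores = {"item-a": 0.9, "item-b": 0.2}    # invented lexical results
    embedding_scores = {"item-a": 0.5, "item-c": 0.8}  # invented embedding results
    lexical_weight, embedding_weight = 0.6, 0.4        # must sum to 1

    for item_id in sorted(set(lexical_scores) | set(embedding_scores)):
        lexical_score = lexical_scores.get(item_id, 0.0)
        embedding_score = embedding_scores.get(item_id, 0.0)
        combined = lexical_score * lexical_weight + embedding_score * embedding_weight
        print(item_id, round(combined, 2))
    # item-a 0.74, item-b 0.12, item-c 0.32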
biblicus/backends/sqlite_full_text_search.py CHANGED
@@ -6,9 +6,9 @@ from __future__ import annotations

 import sqlite3
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

 from ..constants import CORPUS_DIR_NAME, RUNS_DIR_NAME
 from ..corpus import Corpus
@@ -35,6 +35,28 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
     :vartype chunk_overlap: int
     :ivar snippet_characters: Maximum characters to include in evidence snippets.
     :vartype snippet_characters: int
+    :ivar bm25_k1: BM25 k1 tuning parameter.
+    :vartype bm25_k1: float
+    :ivar bm25_b: BM25 b tuning parameter.
+    :vartype bm25_b: float
+    :ivar ngram_min: Minimum n-gram size for lexical tuning.
+    :vartype ngram_min: int
+    :ivar ngram_max: Maximum n-gram size for lexical tuning.
+    :vartype ngram_max: int
+    :ivar stop_words: Optional stop word policy or list.
+    :vartype stop_words: str or list[str] or None
+    :ivar field_weight_title: Relative weight for title field matches.
+    :vartype field_weight_title: float
+    :ivar field_weight_body: Relative weight for body field matches.
+    :vartype field_weight_body: float
+    :ivar field_weight_tags: Relative weight for tag field matches.
+    :vartype field_weight_tags: float
+    :ivar rerank_enabled: Whether to apply reranking to retrieved candidates.
+    :vartype rerank_enabled: bool
+    :ivar rerank_model: Reranker model identifier for metadata.
+    :vartype rerank_model: str or None
+    :ivar rerank_top_k: Number of candidates to rerank.
+    :vartype rerank_top_k: int
     :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
     :vartype extraction_run: str or None
     """
@@ -44,8 +66,81 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
     chunk_size: int = Field(default=800, ge=1)
     chunk_overlap: int = Field(default=200, ge=0)
     snippet_characters: int = Field(default=400, ge=1)
+    bm25_k1: float = Field(default=1.2, gt=0)
+    bm25_b: float = Field(default=0.75, ge=0, le=1)
+    ngram_min: int = Field(default=1, ge=1)
+    ngram_max: int = Field(default=1, ge=1)
+    stop_words: Optional[Union[str, List[str]]] = None
+    field_weight_title: float = Field(default=1.0, ge=0)
+    field_weight_body: float = Field(default=1.0, ge=0)
+    field_weight_tags: float = Field(default=1.0, ge=0)
+    rerank_enabled: bool = False
+    rerank_model: Optional[str] = None
+    rerank_top_k: int = Field(default=10, ge=1)
     extraction_run: Optional[str] = None

+    @field_validator("stop_words")
+    @classmethod
+    def _validate_stop_words(
+        cls, value: Optional[Union[str, List[str]]]
+    ) -> Optional[Union[str, List[str]]]:
+        if value is None:
+            return None
+        if isinstance(value, str):
+            if value.lower() == "english":
+                return "english"
+            raise ValueError("stop_words must be 'english' or a list of strings")
+        if not value:
+            raise ValueError("stop_words list must not be empty")
+        if any(not isinstance(token, str) or not token.strip() for token in value):
+            raise ValueError("stop_words list must contain non-empty strings")
+        return value
+
+    @model_validator(mode="after")
+    def _validate_ngram_range(self) -> "SqliteFullTextSearchRecipeConfig":
+        if self.ngram_min > self.ngram_max:
+            raise ValueError("Invalid ngram range: ngram_min must be <= ngram_max")
+        if self.rerank_enabled and not self.rerank_model:
+            raise ValueError("Rerank enabled requires rerank_model")
+        return self
+
+
+_ENGLISH_STOP_WORDS: Set[str] = {
+    "a",
+    "an",
+    "and",
+    "are",
+    "as",
+    "at",
+    "be",
+    "but",
+    "by",
+    "for",
+    "if",
+    "in",
+    "into",
+    "is",
+    "it",
+    "no",
+    "not",
+    "of",
+    "on",
+    "or",
+    "such",
+    "that",
+    "the",
+    "their",
+    "then",
+    "there",
+    "these",
+    "they",
+    "this",
+    "to",
+    "was",
+    "will",
+    "with",
+}
+

 class SqliteFullTextSearchBackend:
     """
@@ -118,29 +213,39 @@ class SqliteFullTextSearchBackend:
         :rtype: RetrievalResult
         """
         recipe_config = SqliteFullTextSearchRecipeConfig.model_validate(run.recipe.config)
+        query_tokens = _tokenize_query(query_text)
+        stop_words = _resolve_stop_words(recipe_config.stop_words)
+        filtered_tokens = _apply_stop_words(query_tokens, stop_words)
+        if not filtered_tokens:
+            return RetrievalResult(
+                query_text=query_text,
+                budget=budget,
+                run_id=run.run_id,
+                recipe_id=run.recipe.recipe_id,
+                backend_id=self.backend_id,
+                generated_at=utc_now_iso(),
+                evidence=[],
+                stats={"candidates": 0, "returned": 0},
+            )
         db_path = _resolve_run_db_path(corpus, run)
         candidates = _query_full_text_search_index(
             db_path=db_path,
-            query_text=query_text,
+            query_text=" ".join(filtered_tokens),
             limit=_candidate_limit(budget.max_total_items),
             snippet_characters=recipe_config.snippet_characters,
         )
-        sorted_candidates = sorted(
-            candidates,
-            key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
+        sorted_candidates = _rank_candidates(candidates)
+        evidence = _apply_rerank_if_enabled(
+            sorted_candidates,
+            query_tokens=filtered_tokens,
+            run=run,
+            budget=budget,
+            rerank_enabled=recipe_config.rerank_enabled,
+            rerank_top_k=recipe_config.rerank_top_k,
         )
-        ranked = [
-            evidence_item.model_copy(
-                update={
-                    "rank": index,
-                    "recipe_id": run.recipe.recipe_id,
-                    "run_id": run.run_id,
-                }
-            )
-            for index, evidence_item in enumerate(sorted_candidates, start=1)
-        ]
-        evidence = apply_budget(ranked, budget)
-        stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
+        stats: Dict[str, object] = {"candidates": len(sorted_candidates), "returned": len(evidence)}
+        if recipe_config.rerank_enabled:
+            stats["reranked_candidates"] = min(len(sorted_candidates), recipe_config.rerank_top_k)
         return RetrievalResult(
             query_text=query_text,
             budget=budget,
@@ -165,6 +270,147 @@ def _candidate_limit(max_total_items: int) -> int:
     return max_total_items * 5


+def _tokenize_query(query_text: str) -> List[str]:
+    """
+    Tokenize a query string into lowercased terms.
+
+    :param query_text: Raw query text.
+    :type query_text: str
+    :return: Token list.
+    :rtype: list[str]
+    """
+    return [token for token in query_text.lower().split() if token]
+
+
+def _resolve_stop_words(value: Optional[Union[str, List[str]]]) -> Set[str]:
+    """
+    Resolve stop words based on a configuration value.
+
+    :param value: Stop word configuration.
+    :type value: str or list[str] or None
+    :return: Stop word set.
+    :rtype: set[str]
+    """
+    if value is None:
+        return set()
+    if isinstance(value, str):
+        return set(_ENGLISH_STOP_WORDS)
+    return {token.strip().lower() for token in value if token.strip()}
+
+
+def _apply_stop_words(tokens: List[str], stop_words: Set[str]) -> List[str]:
+    """
+    Filter query tokens by a stop word list.
+
+    :param tokens: Token list.
+    :type tokens: list[str]
+    :param stop_words: Stop word set.
+    :type stop_words: set[str]
+    :return: Filtered token list.
+    :rtype: list[str]
+    """
+    if not stop_words:
+        return tokens
+    return [token for token in tokens if token not in stop_words]
+
+
+def _rank_candidates(candidates: List[Evidence]) -> List[Evidence]:
+    """
+    Sort evidence candidates by descending score.
+
+    :param candidates: Evidence list to sort.
+    :type candidates: list[Evidence]
+    :return: Sorted evidence list.
+    :rtype: list[Evidence]
+    """
+    return sorted(
+        candidates, key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id)
+    )
+
+
+def _rerank_score(text: str, query_tokens: List[str]) -> float:
+    """
+    Compute a simple rerank score using token overlap.
+
+    :param text: Candidate text.
+    :type text: str
+    :param query_tokens: Query tokens.
+    :type query_tokens: list[str]
+    :return: Rerank score.
+    :rtype: float
+    """
+    lower_text = text.lower() if text else ""
+    return float(sum(1 for token in query_tokens if token in lower_text))
+
+
+def _apply_rerank_if_enabled(
+    candidates: List[Evidence],
+    *,
+    query_tokens: List[str],
+    run: RetrievalRun,
+    budget: QueryBudget,
+    rerank_enabled: bool,
+    rerank_top_k: int,
+) -> List[Evidence]:
+    """
+    Rerank candidates when enabled, otherwise apply the budget to ranked results.
+
+    :param candidates: Ranked candidate evidence.
+    :type candidates: list[Evidence]
+    :param query_tokens: Query tokens used for reranking.
+    :type query_tokens: list[str]
+    :param run: Retrieval run to annotate evidence with.
+    :type run: RetrievalRun
+    :param budget: Evidence selection budget.
+    :type budget: QueryBudget
+    :param rerank_enabled: Whether reranking is enabled.
+    :type rerank_enabled: bool
+    :param rerank_top_k: Maximum candidates to rerank.
+    :type rerank_top_k: int
+    :return: Evidence list respecting budget.
+    :rtype: list[Evidence]
+    """
+    if not rerank_enabled:
+        ranked = [
+            evidence_item.model_copy(
+                update={
+                    "rank": index,
+                    "recipe_id": run.recipe.recipe_id,
+                    "run_id": run.run_id,
+                }
+            )
+            for index, evidence_item in enumerate(candidates, start=1)
+        ]
+        return apply_budget(ranked, budget)
+
+    rerank_limit = min(len(candidates), rerank_top_k)
+    rerank_candidates = candidates[:rerank_limit]
+    reranked: List[Evidence] = []
+    for evidence_item in rerank_candidates:
+        rerank_score = _rerank_score(evidence_item.text or "", query_tokens)
+        reranked.append(
+            evidence_item.model_copy(
+                update={
+                    "score": rerank_score,
+                    "stage": "rerank",
+                    "stage_scores": {"retrieve": evidence_item.score, "rerank": rerank_score},
+                }
+            )
+        )
+    reranked_sorted = _rank_candidates(reranked)
+    ranked = [
+        evidence_item.model_copy(
+            update={
+                "rank": index,
+                "recipe_id": run.recipe.recipe_id,
+                "run_id": run.run_id,
+            }
+        )
+        for index, evidence_item in enumerate(reranked_sorted, start=1)
+    ]
+    return apply_budget(ranked, budget)
+
+
 def _resolve_run_db_path(corpus: Corpus, run: RetrievalRun) -> Path:
     """
     Resolve the SQLite index path for a retrieval run.
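The stop-word filtering and token-overlap reranking above are deliberately simple; here is a standalone sketch of the same two steps on plain strings (a reimplementation for illustration, not an import from the package):

    # Tokenize, drop stop words, then score each candidate by counting query
    # tokens present in its text, as _rerank_score does.
    stop_words = {"the", "of", "a"}  # abbreviated stand-in for _ENGLISH_STOP_WORDS

    query_tokens = [t for t in "the anatomy of a psalm".lower().split() if t not in stop_words]
    # -> ["anatomy", "psalm"]

    for text in ["The anatomy of a psalm", "Genealogy tables"]:
        lower_text = text.lower()
        score = float(sum(1 for token in query_tokens if token in lower_text))
        print(score, text)  # 2.0 for the first candidate, 0.0 for the second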
biblicus/backends/vector.py ADDED
@@ -0,0 +1,460 @@
+"""
+Deterministic term-frequency vector retrieval backend.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from ..corpus import Corpus
+from ..frontmatter import parse_front_matter
+from ..models import (
+    Evidence,
+    ExtractionRunReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalRun,
+    parse_extraction_run_reference,
+)
+from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..time import utc_now_iso
+
+
+class VectorRecipeConfig(BaseModel):
+    """
+    Configuration for the vector retrieval backend.
+
+    :ivar snippet_characters: Maximum characters to include in evidence snippets.
+    :vartype snippet_characters: int
+    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+    :vartype extraction_run: str or None
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    snippet_characters: int = Field(default=400, ge=1)
+    extraction_run: Optional[str] = None
+
+
+class VectorBackend:
+    """
+    Deterministic vector backend using term-frequency cosine similarity.
+
+    :ivar backend_id: Backend identifier.
+    :vartype backend_id: str
+    """
+
+    backend_id = "vector"
+
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
+        """
+        Register a vector backend run (no materialization).
+
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Backend-specific configuration values.
+        :type config: dict[str, object]
+        :return: Run manifest describing the build.
+        :rtype: RetrievalRun
+        """
+        recipe_config = VectorRecipeConfig.model_validate(config)
+        catalog = corpus.load_catalog()
+        recipe = create_recipe_manifest(
+            backend_id=self.backend_id,
+            name=recipe_name,
+            config=recipe_config.model_dump(),
+        )
+        stats = {
+            "items": len(catalog.items),
+            "text_items": _count_text_items(corpus, catalog.items.values(), recipe_config),
+        }
+        run = create_run_manifest(corpus, recipe=recipe, stats=stats, artifact_paths=[])
+        corpus.write_run(run)
+        return run
+
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        run: RetrievalRun,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Query the corpus using term-frequency cosine similarity.
+
+        :param corpus: Corpus associated with the run.
+        :type corpus: Corpus
+        :param run: Run manifest to use for querying.
+        :type run: RetrievalRun
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: RetrievalResult
+        """
+        recipe_config = VectorRecipeConfig.model_validate(run.recipe.config)
+        query_tokens = _tokenize_text(query_text)
+        if not query_tokens:
+            return RetrievalResult(
+                query_text=query_text,
+                budget=budget,
+                run_id=run.run_id,
+                recipe_id=run.recipe.recipe_id,
+                backend_id=self.backend_id,
+                generated_at=utc_now_iso(),
+                evidence=[],
+                stats={"candidates": 0, "returned": 0},
+            )
+        query_vector = _term_frequencies(query_tokens)
+        query_norm = _vector_norm(query_vector)
+        catalog = corpus.load_catalog()
+        extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+        scored_candidates = _score_items(
+            corpus,
+            catalog.items.values(),
+            query_tokens=query_tokens,
+            query_vector=query_vector,
+            query_norm=query_norm,
+            snippet_characters=recipe_config.snippet_characters,
+            extraction_reference=extraction_reference,
+        )
+        sorted_candidates = sorted(
+            scored_candidates,
+            key=lambda evidence_item: (-evidence_item.score, evidence_item.item_id),
+        )
+        ranked = [
+            evidence_item.model_copy(
+                update={
+                    "rank": index,
+                    "recipe_id": run.recipe.recipe_id,
+                    "run_id": run.run_id,
+                }
+            )
+            for index, evidence_item in enumerate(sorted_candidates, start=1)
+        ]
+        evidence = apply_budget(ranked, budget)
+        stats = {"candidates": len(sorted_candidates), "returned": len(evidence)}
+        return RetrievalResult(
+            query_text=query_text,
+            budget=budget,
+            run_id=run.run_id,
+            recipe_id=run.recipe.recipe_id,
+            backend_id=self.backend_id,
+            generated_at=utc_now_iso(),
+            evidence=evidence,
+            stats=stats,
+        )
+
+
+def _resolve_extraction_reference(
+    corpus: Corpus, recipe_config: VectorRecipeConfig
+) -> Optional[ExtractionRunReference]:
+    """
+    Resolve an extraction run reference from a recipe config.
+
+    :param corpus: Corpus associated with the recipe.
+    :type corpus: Corpus
+    :param recipe_config: Parsed vector recipe configuration.
+    :type recipe_config: VectorRecipeConfig
+    :return: Parsed extraction reference or None.
+    :rtype: ExtractionRunReference or None
+    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    """
+    if not recipe_config.extraction_run:
+        return None
+    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+    run_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_reference.extractor_id,
+        run_id=extraction_reference.run_id,
+    )
+    if not run_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    return extraction_reference
+
+
+def _count_text_items(
+    corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
+) -> int:
+    """
+    Count catalog items that represent text content.
+
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
+    :param items: Catalog items to inspect.
+    :type items: Iterable[object]
+    :param recipe_config: Parsed vector recipe configuration.
+    :type recipe_config: VectorRecipeConfig
+    :return: Number of text items.
+    :rtype: int
+    """
+    text_item_count = 0
+    extraction_reference = _resolve_extraction_reference(corpus, recipe_config)
+    for catalog_item in items:
+        item_id = str(getattr(catalog_item, "id", ""))
+        if extraction_reference and item_id:
+            extracted_text = corpus.read_extracted_text(
+                extractor_id=extraction_reference.extractor_id,
+                run_id=extraction_reference.run_id,
+                item_id=item_id,
+            )
+            if isinstance(extracted_text, str) and extracted_text.strip():
+                text_item_count += 1
+                continue
+        media_type = getattr(catalog_item, "media_type", "")
+        if media_type == "text/markdown" or str(media_type).startswith("text/"):
+            text_item_count += 1
+    return text_item_count
+
+
+def _tokenize_text(text: str) -> List[str]:
+    """
+    Tokenize text into lowercase word tokens.
+
+    :param text: Input text.
+    :type text: str
+    :return: Token list.
+    :rtype: list[str]
+    """
+    return re.findall(r"[a-z0-9]+", text.lower())
+
+
+def _term_frequencies(tokens: List[str]) -> Dict[str, float]:
+    """
+    Build term frequency weights from tokens.
+
+    :param tokens: Token list.
+    :type tokens: list[str]
+    :return: Term frequency mapping.
+    :rtype: dict[str, float]
+    """
+    frequencies: Dict[str, float] = {}
+    for token in tokens:
+        frequencies[token] = frequencies.get(token, 0.0) + 1.0
+    return frequencies
+
+
+def _vector_norm(vector: Dict[str, float]) -> float:
+    """
+    Compute the Euclidean norm of a term-frequency vector.
+
+    :param vector: Term frequency mapping.
+    :type vector: dict[str, float]
+    :return: Vector norm.
+    :rtype: float
+    """
+    return math.sqrt(sum(value * value for value in vector.values()))
+
+
+def _cosine_similarity(
+    left: Dict[str, float],
+    *,
+    left_norm: float,
+    right: Dict[str, float],
+    right_norm: float,
+) -> float:
+    """
+    Compute cosine similarity between two term-frequency vectors.
+
+    :param left: Left term-frequency vector.
+    :type left: dict[str, float]
+    :param left_norm: Precomputed left vector norm.
+    :type left_norm: float
+    :param right: Right term-frequency vector.
+    :type right: dict[str, float]
+    :param right_norm: Precomputed right vector norm.
+    :type right_norm: float
+    :return: Cosine similarity score.
+    :rtype: float
+    """
+    dot = 0.0
+    if len(left) < len(right):
+        for token, value in left.items():
+            dot += value * right.get(token, 0.0)
+    else:
+        for token, value in right.items():
+            dot += value * left.get(token, 0.0)
+    return dot / (left_norm * right_norm)
+
+
+def _load_text_from_item(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
+    """
+    Load a text payload from a catalog item.
+
+    :param corpus: Corpus containing the item.
+    :type corpus: Corpus
+    :param item_id: Item identifier.
+    :type item_id: str
+    :param relpath: Relative path to the stored content.
+    :type relpath: str
+    :param media_type: Media type for the stored content.
+    :type media_type: str
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
+    :return: Text payload or None if not decodable as text.
+    :rtype: str or None
+    """
+    if extraction_reference:
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extraction_reference.extractor_id,
+            run_id=extraction_reference.run_id,
+            item_id=item_id,
+        )
+        if isinstance(extracted_text, str) and extracted_text.strip():
+            return extracted_text
+
+    content_path = corpus.root / relpath
+    raw_bytes = content_path.read_bytes()
+    if media_type == "text/markdown":
+        markdown_text = raw_bytes.decode("utf-8")
+        parsed_document = parse_front_matter(markdown_text)
+        return parsed_document.body
+    if media_type.startswith("text/"):
+        return raw_bytes.decode("utf-8")
+    return None
+
+
+def _find_first_match(text: str, tokens: List[str]) -> Optional[Tuple[int, int]]:
+    """
+    Locate the earliest token match span in a text payload.
+
+    :param text: Text to scan.
+    :type text: str
+    :param tokens: Query tokens.
+    :type tokens: list[str]
+    :return: Start/end span for the earliest match, or None if no matches.
+    :rtype: tuple[int, int] or None
+    """
+    lower_text = text.lower()
+    best_start: Optional[int] = None
+    best_end: Optional[int] = None
+    for token in tokens:
+        if not token:
+            continue
+        token_start = lower_text.find(token)
+        if token_start == -1:
+            continue
+        token_end = token_start + len(token)
+        if best_start is None or token_start < best_start:
+            best_start = token_start
+            best_end = token_end
+    if best_start is None or best_end is None:
+        return None
+    return best_start, best_end
+
+
+def _build_snippet(text: str, span: Optional[Tuple[int, int]], *, max_chars: int) -> str:
+    """
+    Build a snippet around a match span, constrained by a character budget.
+
+    :param text: Source text to slice.
+    :type text: str
+    :param span: Match span to center on.
+    :type span: tuple[int, int] or None
+    :param max_chars: Maximum snippet length.
+    :type max_chars: int
+    :return: Snippet text.
+    :rtype: str
+    """
+    if not text:
+        return ""
+    if span is None:
+        return text[:max_chars]
+    span_start, span_end = span
+    half_window = max_chars // 2
+    snippet_start = max(span_start - half_window, 0)
+    snippet_end = min(span_end + half_window, len(text))
+    return text[snippet_start:snippet_end]
+
+
+def _score_items(
+    corpus: Corpus,
+    items: Iterable[object],
+    *,
+    query_tokens: List[str],
+    query_vector: Dict[str, float],
+    query_norm: float,
+    snippet_characters: int,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> List[Evidence]:
+    """
+    Score catalog items and return evidence candidates.
+
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
+    :param items: Catalog items to score.
+    :type items: Iterable[object]
+    :param query_tokens: Tokenized query text.
+    :type query_tokens: list[str]
+    :param query_vector: Query term-frequency vector.
+    :type query_vector: dict[str, float]
+    :param query_norm: Query vector norm.
+    :type query_norm: float
+    :param snippet_characters: Snippet length budget.
+    :type snippet_characters: int
+    :param extraction_reference: Optional extraction run reference.
+    :type extraction_reference: ExtractionRunReference or None
+    :return: Evidence candidates with provisional ranks.
+    :rtype: list[Evidence]
+    """
+    evidence_items: List[Evidence] = []
+    for catalog_item in items:
+        media_type = getattr(catalog_item, "media_type", "")
+        relpath = getattr(catalog_item, "relpath", "")
+        item_id = str(getattr(catalog_item, "id", ""))
+        item_text = _load_text_from_item(
+            corpus,
+            item_id=item_id,
+            relpath=relpath,
+            media_type=str(media_type),
+            extraction_reference=extraction_reference,
+        )
+        if item_text is None:
+            continue
+        tokens = _tokenize_text(item_text)
+        if not tokens:
+            continue
+        vector = _term_frequencies(tokens)
+        similarity = _cosine_similarity(
+            query_vector, left_norm=query_norm, right=vector, right_norm=_vector_norm(vector)
+        )
+        if similarity <= 0:
+            continue
+        span = _find_first_match(item_text, query_tokens)
+        snippet = _build_snippet(item_text, span, max_chars=snippet_characters)
+        span_start = span[0] if span else None
+        span_end = span[1] if span else None
+        evidence_items.append(
+            Evidence(
+                item_id=str(getattr(catalog_item, "id")),
+                source_uri=getattr(catalog_item, "source_uri", None),
+                media_type=str(media_type),
+                score=float(similarity),
+                rank=1,
+                text=snippet,
+                content_ref=None,
+                span_start=span_start,
+                span_end=span_end,
+                stage="vector",
+                recipe_id="",
+                run_id="",
+                hash=hash_text(snippet),
+            )
+        )
+    return evidence_items
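For intuition about the scoring in `_score_items`, a worked numeric example of term-frequency cosine similarity, mirroring `_term_frequencies`, `_vector_norm`, and `_cosine_similarity` (token lists invented):

    import math

    def term_frequencies(tokens):
        # Count raw occurrences of each token, as the backend does.
        frequencies = {}
        for token in tokens:
            frequencies[token] = frequencies.get(token, 0.0) + 1.0
        return frequencies

    query = term_frequencies(["grace", "truth"])            # {"grace": 1.0, "truth": 1.0}
    document = term_frequencies(["grace", "grace", "law"])  # {"grace": 2.0, "law": 1.0}

    dot = sum(value * document.get(token, 0.0) for token, value in query.items())  # 2.0
    norm_product = math.sqrt(sum(v * v for v in query.values())) * math.sqrt(
        sum(v * v for v in document.values())
    )
    print(dot / norm_product)  # 2.0 / (sqrt(2) * sqrt(5)) ~= 0.632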
biblicus/models.py CHANGED
@@ -263,6 +263,8 @@ class Evidence(BaseModel):
     :vartype span_end: int or None
     :ivar stage: Retrieval stage label (for example, scan, full-text search, rerank).
     :vartype stage: str
+    :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
+    :vartype stage_scores: dict[str, float] or None
     :ivar recipe_id: Recipe identifier used to create the run.
     :vartype recipe_id: str
     :ivar run_id: Retrieval run identifier.
@@ -283,6 +285,7 @@ class Evidence(BaseModel):
     span_start: Optional[int] = None
     span_end: Optional[int] = None
     stage: str
+    stage_scores: Optional[Dict[str, float]] = None
     recipe_id: str
    run_id: str
    hash: Optional[str] = None
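A hedged sketch of the new field in use; the keyword arguments mirror the `Evidence(...)` call in hybrid.py above, and all values are invented:

    from biblicus.models import Evidence

    evidence = Evidence(
        item_id="item-a",
        source_uri=None,
        media_type="text/markdown",
        score=0.74,  # fused score
        rank=1,
        text="...",
        content_ref=None,
        span_start=None,
        span_end=None,
        stage="hybrid",
        stage_scores={"lexical": 0.9, "embedding": 0.5},  # raw per-stage scores
        recipe_id="",
        run_id="",
        hash=None,
    )
    print(evidence.stage_scores["embedding"])  # 0.5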
biblicus-0.11.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.10.0
+Version: 0.11.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -493,6 +493,12 @@ Two backends are included.

 For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].

+## Retrieval documentation
+
+For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
+(tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
+and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
+
 ## Extraction backends
 These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
biblicus-0.11.0.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-biblicus/__init__.py,sha256=BejOPHIlCnT74pu9fNuLm14HsmWjGqCIwpfD9hDOqSo,496
+biblicus/__init__.py,sha256=sT0PFc3DRGFRcN7Zx4Yooc8OzmLvaj1-ZjbvFHce8lU,496
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
 biblicus/cli.py,sha256=aH3plnednnYgcPnSoYQf200nboKc6N-tuc3FuLPQEcU,35132
 biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
@@ -16,7 +16,7 @@ biblicus/hooks.py,sha256=OHQOmOi7rUcQqYWVeod4oPe8nVLepD7F_SlN7O_-BsE,7863
 biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
 biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
 biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
-biblicus/models.py,sha256=vlvPP7AOZGtnHSq47-s9YW-fqLwjgYR6NBcSfeC8YKk,15665
+biblicus/models.py,sha256=r28O6cg3d1bjJnKqpLieVLTgtXTfzb_60wMORvVuDN0,15846
 biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
 biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
 biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
@@ -30,13 +30,15 @@ biblicus/analysis/__init__.py,sha256=Z4Wb4d-EoUuGHkcfRm9ILuZ8vr9FBqRxC0u1i6Fp_0w
 biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
 biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
 biblicus/analysis/models.py,sha256=LuR52w27JRzV-Mr-WAOduZrBOCTrp5uYkMc46QHTRrI,27300
-biblicus/analysis/profiling.py,sha256=z4w14LVJrTEXcQ3PBNwwb_61KuuwQgXw4-EiAaxOQ4Y,10672
+biblicus/analysis/profiling.py,sha256=v2B4Tn9WiXRRP_wIADBPRQVKkMc92KXCas7OBa7n0LU,10670
 biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
 biblicus/analysis/topic_modeling.py,sha256=ZGXvm2MyU6plxz2FE1RQU-3bra6QZ-t8EJj8kG1TW0M,19438
-biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
+biblicus/backends/__init__.py,sha256=3HJY0oMm8pFFVGC4Z-dlPRHhIPVDdUzsa4IMjKP_9dI,1378
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
+biblicus/backends/hybrid.py,sha256=CXh6QrlE0RsTJjSlZRdtomLlILfkglBDQG3YVa8RpFU,10589
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
-biblicus/backends/sqlite_full_text_search.py,sha256=XFuIbEHYWMD9JkjgRZcgYH3kP3b4hRnJ3PwP8rSFjUU,16502
+biblicus/backends/sqlite_full_text_search.py,sha256=VAn4fDdfiaS1Rn6zHlYz3E10_3vMU9P94QU8cL0l8Mk,24466
+biblicus/backends/vector.py,sha256=3RdxSBPb1kOX4Sfd4d1qXFW9ecuiRvGpOHadLCbeh1g,15183
 biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
 biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
 biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
@@ -55,9 +57,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.10.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
-biblicus-0.10.0.dist-info/METADATA,sha256=xZ7scJLdlKHRtm0EU5Ravq5ih2mS2KNfMbbLXNqZ8Ek,27455
-biblicus-0.10.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-biblicus-0.10.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
-biblicus-0.10.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
-biblicus-0.10.0.dist-info/RECORD,,
+biblicus-0.11.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.11.0.dist-info/METADATA,sha256=zrJESYGfGLu7Iq1I--GPIkEY9gXDb9szBIuenlWor7I,27765
+biblicus-0.11.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.11.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.11.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.11.0.dist-info/RECORD,,