biblicus 0.15.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. biblicus/__init__.py +21 -1
  2. biblicus/analysis/markov.py +35 -3
  3. biblicus/backends/__init__.py +6 -2
  4. biblicus/backends/embedding_index_common.py +334 -0
  5. biblicus/backends/embedding_index_file.py +272 -0
  6. biblicus/backends/embedding_index_inmemory.py +270 -0
  7. biblicus/backends/hybrid.py +8 -5
  8. biblicus/backends/scan.py +1 -0
  9. biblicus/backends/sqlite_full_text_search.py +1 -1
  10. biblicus/backends/{vector.py → tf_vector.py} +28 -35
  11. biblicus/chunking.py +396 -0
  12. biblicus/cli.py +75 -25
  13. biblicus/context.py +27 -12
  14. biblicus/context_engine/__init__.py +53 -0
  15. biblicus/context_engine/assembler.py +1060 -0
  16. biblicus/context_engine/compaction.py +110 -0
  17. biblicus/context_engine/models.py +423 -0
  18. biblicus/context_engine/retrieval.py +129 -0
  19. biblicus/corpus.py +117 -16
  20. biblicus/embedding_providers.py +122 -0
  21. biblicus/errors.py +24 -0
  22. biblicus/frontmatter.py +2 -0
  23. biblicus/knowledge_base.py +1 -1
  24. biblicus/models.py +15 -3
  25. biblicus/retrieval.py +7 -2
  26. biblicus/sources.py +46 -11
  27. biblicus/text/link.py +6 -0
  28. biblicus/text/prompts.py +2 -0
  29. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/METADATA +4 -3
  30. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/RECORD +34 -24
  31. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/WHEEL +0 -0
  32. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/entry_points.txt +0 -0
  33. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/licenses/LICENSE +0 -0
  34. {biblicus-0.15.1.dist-info → biblicus-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,272 @@
1
+ """
2
+ Embedding-index retrieval backend that reads the embedding matrix via memory mapping.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Dict, List, Optional
9
+
10
+ import numpy as np
11
+
12
+ from ..corpus import Corpus
13
+ from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
14
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
15
+ from ..time import utc_now_iso
16
+ from .embedding_index_common import (
17
+ ChunkRecord,
18
+ EmbeddingIndexRecipeConfig,
19
+ _build_snippet,
20
+ _extract_span_text,
21
+ artifact_paths_for_run,
22
+ chunks_to_records,
23
+ collect_chunks,
24
+ cosine_similarity_scores,
25
+ read_chunks_jsonl,
26
+ read_embeddings,
27
+ resolve_extraction_reference,
28
+ write_chunks_jsonl,
29
+ write_embeddings,
30
+ )
31
+
32
+
33
class EmbeddingIndexFileBackend:
    """
    Embedding retrieval backend that scans a memory-mapped embedding matrix.
    """

    backend_id = "embedding-index-file"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Build an embedding index run by chunking text payloads and materializing embeddings.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: biblicus.models.RetrievalRun
        """
        parsed = EmbeddingIndexRecipeConfig.model_validate(config)
        chunks, text_items = collect_chunks(corpus, recipe_config=parsed)

        provider = parsed.embedding_provider.build_provider()
        chunk_texts = [chunk.text for chunk in chunks]
        matrix = provider.embed_texts(chunk_texts).astype(np.float32)

        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=parsed.model_dump(),
        )
        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
        write_embeddings(corpus.root / paths["embeddings"], matrix)
        write_chunks_jsonl(corpus.root / paths["chunks"], chunks_to_records(chunks))

        if matrix.size:
            dimensions = int(matrix.shape[1])
        else:
            # Empty corpora produce a 0-row matrix; fall back to the configured width.
            dimensions = parsed.embedding_provider.dimensions
        stats = {
            "items": len(corpus.load_catalog().items),
            "text_items": text_items,
            "chunks": len(chunks),
            "dimensions": dimensions,
        }
        run = run.model_copy(
            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
        )
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query an embedding index run and return ranked evidence.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: biblicus.models.RetrievalRun
        :param query_text: Query text to embed.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: biblicus.models.QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: biblicus.models.RetrievalResult
        :raises FileNotFoundError: If the run's on-disk artifacts are missing.
        :raises ValueError: If the artifacts are mutually inconsistent.
        """
        parsed = EmbeddingIndexRecipeConfig.model_validate(run.recipe.config)
        extraction_reference = resolve_extraction_reference(corpus, parsed)

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        if not (embeddings_path.is_file() and chunks_path.is_file()):
            raise FileNotFoundError("Embedding index artifacts are missing for this run")

        # NOTE(review): astype() copies, so the memory map opened here is
        # materialized as float32 — confirm this is intended for large indexes.
        matrix = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
        chunk_records = read_chunks_jsonl(chunks_path)
        if matrix.shape[0] != len(chunk_records):
            raise ValueError(
                "Embedding index artifacts are inconsistent: "
                "embeddings row count does not match chunk record count"
            )

        provider = parsed.embedding_provider.build_provider()
        query_matrix = provider.embed_texts([query_text]).astype(np.float32)
        if query_matrix.shape[0] != 1:
            raise ValueError("Embedding provider returned an invalid query embedding shape")
        query_vector = query_matrix[0]

        # NOTE(review): maximum_cache_total_items doubles as the similarity-scan
        # batch size here; confirm the base recipe config declares this field.
        batch_rows = parsed.maximum_cache_total_items or 4096
        candidates = _top_indices_batched(
            embeddings=matrix,
            query_vector=query_vector,
            limit=_candidate_limit(budget.max_total_items + budget.offset),
            batch_rows=batch_rows,
        )
        evidence_items = _build_evidence(
            corpus,
            run=run,
            recipe_config=parsed,
            candidates=candidates,
            embeddings=matrix,
            query_vector=query_vector,
            chunk_records=chunk_records,
            extraction_reference=extraction_reference,
        )
        ranked = []
        for position, item in enumerate(evidence_items, start=1):
            ranked.append(
                item.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": run.recipe.recipe_id,
                        "run_id": run.run_id,
                    }
                )
            )
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(evidence_items), "returned": len(evidence)},
        )
170
+
171
+
172
+ def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
173
+ return max(1, int(max_total_items) * int(multiplier))
174
+
175
+
176
@dataclass(frozen=True)
class _ScoredIndex:
    """
    Similarity score paired with the global matrix row it belongs to.
    """

    # Cosine similarity of the row against the query vector.
    score: float
    # Global row position within the embedding matrix.
    index: int
180
+
181
+
182
def _top_indices_batched(
    *, embeddings: np.ndarray, query_vector: np.ndarray, limit: int, batch_rows: int = 4096
) -> List[int]:
    """
    Return the indices of the rows most similar to the query vector.

    Rows are scored ``batch_rows`` at a time so that only one slice of the
    (possibly memory-mapped) matrix is resident per step. Ties are broken by
    ascending row index, so the ranking is deterministic.

    :param embeddings: Two-dimensional matrix of row embeddings.
    :param query_vector: One-dimensional query embedding.
    :param limit: Maximum number of indices to return.
    :param batch_rows: Number of rows scored per batch.
    :return: Row indices sorted by descending similarity, then ascending index.
    """
    if embeddings.size == 0:
        return []
    total_rows = int(embeddings.shape[0])
    limit = min(int(limit), total_rows)
    step = int(batch_rows)

    best: List[_ScoredIndex] = []
    for start in range(0, total_rows, step):
        end = min(start + step, total_rows)
        scores = cosine_similarity_scores(embeddings[start:end], query_vector)
        batch_limit = min(limit, int(scores.size))
        if batch_limit <= 0:
            continue
        top_local = np.argpartition(-scores, batch_limit - 1)[:batch_limit]
        for local_index in top_local:
            position = int(local_index)
            best.append(_ScoredIndex(score=float(scores[position]), index=start + position))
        # Prune between batches so the candidate pool stays O(limit) instead of
        # growing with the number of batches; the final top-``limit`` selection
        # is unchanged because pruning keeps every potential global winner.
        best.sort(key=lambda item: (-item.score, item.index))
        del best[limit:]

    return [item.index for item in best]
203
+
204
+
205
def _build_evidence(
    corpus: Corpus,
    *,
    run: RetrievalRun,
    recipe_config: EmbeddingIndexRecipeConfig,
    candidates: List[int],
    embeddings: np.ndarray,
    query_vector: np.ndarray,
    chunk_records: List[ChunkRecord],
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Materialize :class:`Evidence` models for the candidate chunk indices.

    :param corpus: Corpus associated with the run.
    :param run: Run manifest supplying recipe and run identifiers.
    :param recipe_config: Parsed recipe configuration.
    :param candidates: Ranked chunk indices to convert into evidence.
    :param embeddings: Embedding matrix used to (re)compute candidate scores.
    :param query_vector: Query embedding.
    :param chunk_records: Chunk metadata parallel to the embedding rows.
    :param extraction_reference: Optional extraction run used for text loading.
    :return: Evidence items in candidate order, each with a placeholder rank.
    """
    catalog = corpus.load_catalog()
    results: List[Evidence] = []
    for candidate in candidates:
        record = chunk_records[candidate]
        catalog_item = catalog.items[record.item_id]
        span = (record.span_start, record.span_end)
        text = _load_text_for_evidence(
            corpus,
            item_id=record.item_id,
            relpath=str(getattr(catalog_item, "relpath")),
            media_type=str(getattr(catalog_item, "media_type")),
            extraction_reference=extraction_reference,
        )
        snippet = _build_snippet(text, span, recipe_config.snippet_characters)
        if snippet is None:
            snippet = _extract_span_text(text, span)
        # Re-score the single candidate row against the query vector.
        row = embeddings[candidate : candidate + 1]
        score = float(cosine_similarity_scores(row, query_vector)[0])
        results.append(
            Evidence(
                item_id=record.item_id,
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=str(getattr(catalog_item, "media_type")),
                score=score,
                rank=1,
                text=snippet,
                content_ref=None,
                span_start=record.span_start,
                span_end=record.span_end,
                stage=EmbeddingIndexFileBackend.backend_id,
                stage_scores=None,
                recipe_id=run.recipe.recipe_id,
                run_id=run.run_id,
                metadata=getattr(catalog_item, "metadata", {}) or {},
                hash=hash_text(snippet or ""),
            )
        )
    return results
254
+
255
+
256
def _load_text_for_evidence(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    """
    Load the text payload for a catalog item via the shared helper.

    The helper import is deferred inside the function body — presumably to
    avoid a circular import at module load time (NOTE(review): confirm).

    :param corpus: Corpus to read from.
    :param item_id: Catalog item identifier.
    :param relpath: Item path relative to the corpus root.
    :param media_type: Item media type.
    :param extraction_reference: Optional extraction run to prefer for text.
    :return: Loaded text, or None when no text payload is available.
    """
    from .embedding_index_common import _load_text_from_item

    loader_kwargs = {
        "item_id": item_id,
        "relpath": relpath,
        "media_type": media_type,
        "extraction_reference": extraction_reference,
    }
    return _load_text_from_item(corpus, **loader_kwargs)
@@ -0,0 +1,270 @@
1
+ """
2
+ Embedding-index retrieval backend that loads the full embedding matrix into memory at query time.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, List, Optional
8
+
9
+ import numpy as np
10
+ from pydantic import ConfigDict, Field
11
+
12
+ from ..corpus import Corpus
13
+ from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
14
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
15
+ from ..time import utc_now_iso
16
+ from .embedding_index_common import (
17
+ ChunkRecord,
18
+ EmbeddingIndexRecipeConfig,
19
+ _build_snippet,
20
+ _extract_span_text,
21
+ artifact_paths_for_run,
22
+ chunks_to_records,
23
+ collect_chunks,
24
+ cosine_similarity_scores,
25
+ read_chunks_jsonl,
26
+ read_embeddings,
27
+ resolve_extraction_reference,
28
+ write_chunks_jsonl,
29
+ write_embeddings,
30
+ )
31
+
32
+
33
class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
    """
    Configuration for embedding-index-inmemory retrieval.

    :ivar maximum_cache_total_items: Maximum chunks allowed for in-memory query loading.
    :vartype maximum_cache_total_items: int
    """

    # Reject unknown configuration keys so typos fail fast at validation time.
    model_config = ConfigDict(extra="forbid")

    # Hard cap on chunk count; build_run refuses to build a larger index.
    maximum_cache_total_items: int = Field(default=25000, ge=1)
44
+
45
+
46
class EmbeddingIndexInMemoryBackend:
    """
    Embedding retrieval backend that scores the full embedding matrix in memory.
    """

    backend_id = "embedding-index-inmemory"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Build an embedding index run by chunking text payloads and materializing embeddings.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: biblicus.models.RetrievalRun
        :raises ValueError: If the corpus produces more chunks than the cache cap.
        """
        parsed = EmbeddingIndexInMemoryRecipeConfig.model_validate(config)
        chunks, text_items = collect_chunks(corpus, recipe_config=parsed)
        if len(chunks) > parsed.maximum_cache_total_items:
            raise ValueError(
                "embedding-index-inmemory exceeded maximum_cache_total_items. "
                "Use embedding-index-file or increase maximum_cache_total_items."
            )

        provider = parsed.embedding_provider.build_provider()
        matrix = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)

        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=parsed.model_dump(),
        )
        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
        write_embeddings(corpus.root / paths["embeddings"], matrix)
        write_chunks_jsonl(corpus.root / paths["chunks"], chunks_to_records(chunks))

        if matrix.size:
            dimensions = int(matrix.shape[1])
        else:
            # Empty corpora produce a 0-row matrix; fall back to the configured width.
            dimensions = parsed.embedding_provider.dimensions
        stats = {
            "items": len(corpus.load_catalog().items),
            "text_items": text_items,
            "chunks": len(chunks),
            "dimensions": dimensions,
        }
        run = run.model_copy(
            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
        )
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query an embedding index run and return ranked evidence.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: biblicus.models.RetrievalRun
        :param query_text: Query text to embed.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: biblicus.models.QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: biblicus.models.RetrievalResult
        :raises FileNotFoundError: If the run's on-disk artifacts are missing.
        :raises ValueError: If the artifacts are mutually inconsistent.
        """
        parsed = EmbeddingIndexInMemoryRecipeConfig.model_validate(run.recipe.config)
        extraction_reference = resolve_extraction_reference(corpus, parsed)

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        if not (embeddings_path.is_file() and chunks_path.is_file()):
            raise FileNotFoundError("Embedding index artifacts are missing for this run")

        matrix = read_embeddings(embeddings_path, mmap=False).astype(np.float32)
        chunk_records = read_chunks_jsonl(chunks_path)
        if matrix.shape[0] != len(chunk_records):
            raise ValueError(
                "Embedding index artifacts are inconsistent: "
                "embeddings row count does not match chunk record count"
            )

        provider = parsed.embedding_provider.build_provider()
        query_matrix = provider.embed_texts([query_text]).astype(np.float32)
        if query_matrix.shape[0] != 1:
            raise ValueError("Embedding provider returned an invalid query embedding shape")
        # Single vectorized pass over the whole in-memory matrix.
        scores = cosine_similarity_scores(matrix, query_matrix[0])

        candidates = _top_indices(
            scores,
            limit=_candidate_limit(budget.max_total_items + budget.offset),
        )
        evidence_items = _build_evidence(
            corpus,
            run=run,
            recipe_config=parsed,
            candidates=candidates,
            scores=scores,
            chunk_records=chunk_records,
            extraction_reference=extraction_reference,
        )
        ranked = []
        for position, item in enumerate(evidence_items, start=1):
            ranked.append(
                item.model_copy(
                    update={
                        "rank": position,
                        "recipe_id": run.recipe.recipe_id,
                        "run_id": run.run_id,
                    }
                )
            )
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(evidence_items), "returned": len(evidence)},
        )
+ )
187
+
188
+
189
+ def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
190
+ return max(1, int(max_total_items) * int(multiplier))
191
+
192
+
193
+ def _top_indices(scores: np.ndarray, *, limit: int) -> List[int]:
194
+ if scores.size == 0:
195
+ return []
196
+ limit = min(int(limit), int(scores.size))
197
+ indices = np.argpartition(-scores, limit - 1)[:limit]
198
+ sorted_indices = indices[np.argsort(-scores[indices])]
199
+ return [int(index) for index in sorted_indices]
200
+
201
+
202
def _build_evidence(
    corpus: Corpus,
    *,
    run: RetrievalRun,
    recipe_config: EmbeddingIndexInMemoryRecipeConfig,
    candidates: List[int],
    scores: np.ndarray,
    chunk_records: List[ChunkRecord],
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Materialize :class:`Evidence` models for the candidate chunk indices.

    :param corpus: Corpus associated with the run.
    :param run: Run manifest supplying recipe and run identifiers.
    :param recipe_config: Parsed recipe configuration.
    :param candidates: Ranked chunk indices to convert into evidence.
    :param scores: Precomputed similarity scores parallel to the chunk records.
    :param chunk_records: Chunk metadata parallel to the embedding rows.
    :param extraction_reference: Optional extraction run used for text loading.
    :return: Evidence items in candidate order, each with a placeholder rank.
    """
    catalog = corpus.load_catalog()
    results: List[Evidence] = []
    for candidate in candidates:
        record = chunk_records[candidate]
        catalog_item = catalog.items[record.item_id]
        media_type = str(getattr(catalog_item, "media_type"))
        span = (record.span_start, record.span_end)
        text = _load_text_for_evidence(
            corpus,
            item_id=record.item_id,
            relpath=str(getattr(catalog_item, "relpath")),
            media_type=media_type,
            extraction_reference=extraction_reference,
        )
        snippet = _build_snippet(text, span, recipe_config.snippet_characters)
        if snippet is None:
            snippet = _extract_span_text(text, span)
        results.append(
            Evidence(
                item_id=record.item_id,
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=media_type,
                score=float(scores[candidate]),
                rank=1,
                text=snippet,
                content_ref=None,
                span_start=record.span_start,
                span_end=record.span_end,
                stage=EmbeddingIndexInMemoryBackend.backend_id,
                stage_scores=None,
                recipe_id=run.recipe.recipe_id,
                run_id=run.run_id,
                metadata=getattr(catalog_item, "metadata", {}) or {},
                hash=hash_text(snippet or ""),
            )
        )
    return results
252
+
253
+
254
def _load_text_for_evidence(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    """
    Load the text payload for a catalog item via the shared helper.

    The helper import is deferred inside the function body — presumably to
    avoid a circular import at module load time (NOTE(review): confirm).

    :param corpus: Corpus to read from.
    :param item_id: Catalog item identifier.
    :param relpath: Item path relative to the corpus root.
    :param media_type: Item media type.
    :param extraction_reference: Optional extraction run to prefer for text.
    :return: Loaded text, or None when no text payload is available.
    """
    from .embedding_index_common import _load_text_from_item

    loader_kwargs = {
        "item_id": item_id,
        "relpath": relpath,
        "media_type": media_type,
        "extraction_reference": extraction_reference,
    }
    return _load_text_from_item(corpus, **loader_kwargs)
@@ -35,7 +35,7 @@ class HybridRecipeConfig(BaseModel):
35
35
  model_config = ConfigDict(extra="forbid")
36
36
 
37
37
  lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
38
- embedding_backend: str = Field(default="vector", min_length=1)
38
+ embedding_backend: str = Field(default="tf-vector", min_length=1)
39
39
  lexical_weight: float = Field(default=0.5, ge=0, le=1)
40
40
  embedding_weight: float = Field(default=0.5, ge=0, le=1)
41
41
  lexical_config: Dict[str, object] = Field(default_factory=dict)
@@ -217,18 +217,20 @@ def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> Que
217
217
  :return: Expanded budget for component backends.
218
218
  :rtype: QueryBudget
219
219
  """
220
- max_total_characters = budget.max_total_characters
220
+ maximum_total_characters = budget.maximum_total_characters
221
221
  expanded_characters = (
222
- max_total_characters * multiplier if max_total_characters is not None else None
222
+ maximum_total_characters * multiplier if maximum_total_characters is not None else None
223
223
  )
224
224
  expanded_max_items_per_source = (
225
225
  budget.max_items_per_source * multiplier
226
226
  if budget.max_items_per_source is not None
227
227
  else None
228
228
  )
229
+ requested_items = budget.max_total_items + budget.offset
229
230
  return QueryBudget(
230
- max_total_items=budget.max_total_items * multiplier,
231
- max_total_characters=expanded_characters,
231
+ max_total_items=requested_items * multiplier,
232
+ offset=0,
233
+ maximum_total_characters=expanded_characters,
232
234
  max_items_per_source=expanded_max_items_per_source,
233
235
  )
234
236
 
@@ -283,6 +285,7 @@ def _fuse_evidence(
283
285
  stage_scores={"lexical": lexical_score, "embedding": embedding_score},
284
286
  recipe_id="",
285
287
  run_id="",
288
+ metadata=base_evidence.metadata,
286
289
  hash=base_evidence.hash,
287
290
  )
288
291
  )
biblicus/backends/scan.py CHANGED
@@ -368,6 +368,7 @@ def _score_items(
368
368
  stage="scan",
369
369
  recipe_id="",
370
370
  run_id="",
371
+ metadata=getattr(catalog_item, "metadata", {}) or {},
371
372
  hash=hash_text(snippet),
372
373
  )
373
374
  )
@@ -231,7 +231,7 @@ class SqliteFullTextSearchBackend:
231
231
  candidates = _query_full_text_search_index(
232
232
  db_path=db_path,
233
233
  query_text=" ".join(filtered_tokens),
234
- limit=_candidate_limit(budget.max_total_items),
234
+ limit=_candidate_limit(budget.max_total_items + budget.offset),
235
235
  snippet_characters=recipe_config.snippet_characters,
236
236
  )
237
237
  sorted_candidates = _rank_candidates(candidates)