biblicus 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ """
2
+ Embedding-index retrieval backend that loads the full embedding matrix into memory at query time.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, List, Optional
8
+
9
+ import numpy as np
10
+ from pydantic import ConfigDict, Field
11
+
12
+ from ..corpus import Corpus
13
+ from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
14
+ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
15
+ from ..time import utc_now_iso
16
+ from .embedding_index_common import (
17
+ ChunkRecord,
18
+ EmbeddingIndexRecipeConfig,
19
+ artifact_paths_for_run,
20
+ chunks_to_records,
21
+ collect_chunks,
22
+ cosine_similarity_scores,
23
+ read_chunks_jsonl,
24
+ read_embeddings,
25
+ resolve_extraction_reference,
26
+ write_chunks_jsonl,
27
+ write_embeddings,
28
+ )
29
+ from .scan import _build_snippet
30
+
31
+
32
class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
    """
    Recipe settings for the ``embedding-index-inmemory`` retrieval backend.

    Inherits the shared embedding-index recipe fields and adds a ceiling on the
    number of chunks an index may hold. Keys that are not declared on the model
    are rejected during validation (``extra="forbid"``).

    :ivar max_chunks: Largest number of chunks permitted for an in-memory index.
        Must be at least 1; defaults to 25000.
    :vartype max_chunks: int
    """

    # Reject unknown configuration keys instead of silently ignoring them.
    model_config = ConfigDict(extra="forbid")

    max_chunks: int = Field(default=25000, ge=1)
43
+
44
+
45
class EmbeddingIndexInMemoryBackend:
    """
    Embedding retrieval backend using an in-memory similarity scan.

    :ivar backend_id: Identifier used for recipe manifests, artifact paths, and results.
    :vartype backend_id: str
    """

    backend_id = "embedding-index-inmemory"

    def build_run(
        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
    ) -> RetrievalRun:
        """
        Build an embedding index run by chunking text payloads and materializing embeddings.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Backend-specific configuration values.
        :type config: dict[str, object]
        :return: Run manifest describing the build.
        :rtype: biblicus.models.RetrievalRun
        :raises ValueError: If the corpus yields more chunks than ``max_chunks``, or if the
            embedding provider does not return exactly one embedding row per chunk.
        """
        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(config)
        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
        if len(chunks) > recipe_config.max_chunks:
            raise ValueError(
                "embedding-index-inmemory exceeded max_chunks. "
                "Use embedding-index-file or increase max_chunks."
            )

        provider = recipe_config.embedding_provider.build_provider()
        chunk_texts = [chunk.text for chunk in chunks]
        embeddings = provider.embed_texts(chunk_texts)
        embeddings = embeddings.astype(np.float32)
        # Fail before anything is written: query() rejects artifacts whose row count
        # differs from the chunk record count, so persisting them would only defer
        # the error to query time.
        if embeddings.shape[0] != len(chunks):
            raise ValueError(
                "Embedding provider returned an embedding count that does not match "
                "the chunk count"
            )

        recipe = create_recipe_manifest(
            backend_id=self.backend_id,
            name=recipe_name,
            config=recipe_config.model_dump(),
        )
        # The manifest is created first with empty stats/artifacts because its run_id
        # is needed to derive the artifact paths; it is completed further down.
        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        corpus.runs_dir.mkdir(parents=True, exist_ok=True)

        write_embeddings(embeddings_path, embeddings)
        write_chunks_jsonl(chunks_path, chunks_to_records(chunks))

        stats = {
            "items": len(corpus.load_catalog().items),
            "text_items": text_items,
            "chunks": len(chunks),
            # An empty matrix has no usable second axis, so fall back to the
            # dimensionality declared on the provider configuration.
            "dimensions": (
                int(embeddings.shape[1])
                if embeddings.size
                else recipe_config.embedding_provider.dimensions
            ),
        }
        run = run.model_copy(
            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
        )
        corpus.write_run(run)
        return run

    def query(
        self,
        corpus: Corpus,
        *,
        run: RetrievalRun,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Query an embedding index run and return ranked evidence.

        :param corpus: Corpus associated with the run.
        :type corpus: Corpus
        :param run: Run manifest to use for querying.
        :type run: biblicus.models.RetrievalRun
        :param query_text: Query text to embed.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: biblicus.models.QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: biblicus.models.RetrievalResult
        :raises FileNotFoundError: If the embeddings or chunks artifact is missing.
        :raises ValueError: If the artifacts disagree on row count, or the provider does
            not return exactly one embedding for the query.
        """
        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(run.recipe.config)
        extraction_reference = resolve_extraction_reference(corpus, recipe_config)

        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
        embeddings_path = corpus.root / paths["embeddings"]
        chunks_path = corpus.root / paths["chunks"]
        if not embeddings_path.is_file() or not chunks_path.is_file():
            raise FileNotFoundError("Embedding index artifacts are missing for this run")

        # mmap=False: this backend deliberately loads the full matrix into memory.
        embeddings = read_embeddings(embeddings_path, mmap=False).astype(np.float32)
        chunk_records = read_chunks_jsonl(chunks_path)
        if embeddings.shape[0] != len(chunk_records):
            raise ValueError(
                "Embedding index artifacts are inconsistent: "
                "embeddings row count does not match chunk record count"
            )

        provider = recipe_config.embedding_provider.build_provider()
        query_embedding = provider.embed_texts([query_text]).astype(np.float32)
        if query_embedding.shape[0] != 1:
            raise ValueError("Embedding provider returned an invalid query embedding shape")
        scores = cosine_similarity_scores(embeddings, query_embedding[0])

        # Over-fetch (offset included) so apply_budget still has enough candidates
        # to fill the requested page after it applies the budget.
        candidates = _top_indices(
            scores,
            limit=_candidate_limit(budget.max_total_items + budget.offset),
        )
        evidence_items = _build_evidence(
            corpus,
            run=run,
            recipe_config=recipe_config,
            candidates=candidates,
            scores=scores,
            chunk_records=chunk_records,
            extraction_reference=extraction_reference,
        )
        # Candidates arrive best-first, so the 1-based position is the rank.
        ranked = [
            item.model_copy(
                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
            )
            for index, item in enumerate(evidence_items, start=1)
        ]
        evidence = apply_budget(ranked, budget)
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            run_id=run.run_id,
            recipe_id=run.recipe.recipe_id,
            backend_id=self.backend_id,
            generated_at=utc_now_iso(),
            evidence=evidence,
            stats={"candidates": len(evidence_items), "returned": len(evidence)},
        )
186
+
187
+
188
+ def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
189
+ return max(1, int(max_total_items) * int(multiplier))
190
+
191
+
192
+ def _top_indices(scores: np.ndarray, *, limit: int) -> List[int]:
193
+ if scores.size == 0:
194
+ return []
195
+ limit = min(int(limit), int(scores.size))
196
+ indices = np.argpartition(-scores, limit - 1)[:limit]
197
+ sorted_indices = indices[np.argsort(-scores[indices])]
198
+ return [int(index) for index in sorted_indices]
199
+
200
+
201
def _build_evidence(
    corpus: Corpus,
    *,
    run: RetrievalRun,
    recipe_config: EmbeddingIndexInMemoryRecipeConfig,
    candidates: List[int],
    scores: np.ndarray,
    chunk_records: List[ChunkRecord],
    extraction_reference: Optional[ExtractionRunReference],
) -> List[Evidence]:
    """
    Turn candidate chunk indices into evidence objects with snippets.

    :param corpus: Corpus that owns the catalog and item content.
    :type corpus: Corpus
    :param run: Run manifest supplying ``run_id`` and ``recipe_id`` for the evidence.
    :type run: biblicus.models.RetrievalRun
    :param recipe_config: Parsed recipe configuration (provides ``snippet_characters``).
    :type recipe_config: EmbeddingIndexInMemoryRecipeConfig
    :param candidates: Indices into ``chunk_records`` and ``scores``, in output order.
    :type candidates: list[int]
    :param scores: Similarity score for every chunk.
    :type scores: numpy.ndarray
    :param chunk_records: Chunk metadata aligned with ``scores``.
    :type chunk_records: list[ChunkRecord]
    :param extraction_reference: Optional extraction run to load text from.
    :type extraction_reference: ExtractionRunReference or None
    :return: One evidence item per candidate, in the order of ``candidates``. Every item
        carries the placeholder ``rank=1``.
    :rtype: list[biblicus.models.Evidence]
    """
    catalog = corpus.load_catalog()
    # Several candidate chunks commonly come from the same item; load each item's text
    # once per call instead of once per chunk. Membership is tested explicitly because
    # a loaded text may legitimately be None.
    text_by_item: Dict[str, Optional[str]] = {}
    evidence_items: List[Evidence] = []
    for idx in candidates:
        record = chunk_records[idx]
        item_id = record.item_id
        span_start = record.span_start
        span_end = record.span_end
        catalog_item = catalog.items[item_id]
        media_type = str(catalog_item.media_type)
        if item_id not in text_by_item:
            text_by_item[item_id] = _load_text_for_evidence(
                corpus,
                item_id=item_id,
                relpath=str(catalog_item.relpath),
                media_type=media_type,
                extraction_reference=extraction_reference,
            )
        snippet = _build_snippet(
            text_by_item[item_id],
            (span_start, span_end),
            max_chars=recipe_config.snippet_characters,
        )
        evidence_items.append(
            Evidence(
                item_id=item_id,
                source_uri=getattr(catalog_item, "source_uri", None),
                media_type=media_type,
                score=float(scores[idx]),
                rank=1,
                text=snippet,
                content_ref=None,
                span_start=span_start,
                span_end=span_end,
                stage=EmbeddingIndexInMemoryBackend.backend_id,
                stage_scores=None,
                recipe_id=run.recipe.recipe_id,
                run_id=run.run_id,
                hash=hash_text(snippet),
            )
        )
    return evidence_items
250
+
251
+
252
def _load_text_for_evidence(
    corpus: Corpus,
    *,
    item_id: str,
    relpath: str,
    media_type: str,
    extraction_reference: Optional[ExtractionRunReference],
) -> Optional[str]:
    """
    Load the text used to build an evidence snippet for one catalog item.

    Delegates to ``embedding_index_common._load_text_from_item`` with the same arguments.

    :param corpus: Corpus that owns the item.
    :type corpus: Corpus
    :param item_id: Identifier of the catalog item.
    :type item_id: str
    :param relpath: Item path as recorded in the catalog.
    :type relpath: str
    :param media_type: Media type of the item.
    :type media_type: str
    :param extraction_reference: Optional extraction run to read text from.
    :type extraction_reference: ExtractionRunReference or None
    :return: Whatever the shared loader returns: the item text, or None.
    :rtype: str or None
    """
    # Imported at call time rather than at module level, as in the original;
    # NOTE(review): the reason for the deferred import is not visible here.
    from .embedding_index_common import _load_text_from_item as load_item_text

    loaded_text = load_item_text(
        corpus,
        item_id=item_id,
        relpath=relpath,
        media_type=media_type,
        extraction_reference=extraction_reference,
    )
    return loaded_text
@@ -35,7 +35,7 @@ class HybridRecipeConfig(BaseModel):
35
35
  model_config = ConfigDict(extra="forbid")
36
36
 
37
37
  lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
38
- embedding_backend: str = Field(default="vector", min_length=1)
38
+ embedding_backend: str = Field(default="tf-vector", min_length=1)
39
39
  lexical_weight: float = Field(default=0.5, ge=0, le=1)
40
40
  embedding_weight: float = Field(default=0.5, ge=0, le=1)
41
41
  lexical_config: Dict[str, object] = Field(default_factory=dict)
@@ -226,8 +226,10 @@ def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> Que
226
226
  if budget.max_items_per_source is not None
227
227
  else None
228
228
  )
229
+ requested_items = budget.max_total_items + budget.offset
229
230
  return QueryBudget(
230
- max_total_items=budget.max_total_items * multiplier,
231
+ max_total_items=requested_items * multiplier,
232
+ offset=0,
231
233
  max_total_characters=expanded_characters,
232
234
  max_items_per_source=expanded_max_items_per_source,
233
235
  )
@@ -231,7 +231,7 @@ class SqliteFullTextSearchBackend:
231
231
  candidates = _query_full_text_search_index(
232
232
  db_path=db_path,
233
233
  query_text=" ".join(filtered_tokens),
234
- limit=_candidate_limit(budget.max_total_items),
234
+ limit=_candidate_limit(budget.max_total_items + budget.offset),
235
235
  snippet_characters=recipe_config.snippet_characters,
236
236
  )
237
237
  sorted_candidates = _rank_candidates(candidates)
@@ -24,9 +24,9 @@ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifes
24
24
  from ..time import utc_now_iso
25
25
 
26
26
 
27
- class VectorRecipeConfig(BaseModel):
27
+ class TfVectorRecipeConfig(BaseModel):
28
28
  """
29
- Configuration for the vector retrieval backend.
29
+ Configuration for the term-frequency vector retrieval backend.
30
30
 
31
31
  :ivar snippet_characters: Maximum characters to include in evidence snippets.
32
32
  :vartype snippet_characters: int
@@ -40,7 +40,7 @@ class VectorRecipeConfig(BaseModel):
40
40
  extraction_run: Optional[str] = None
41
41
 
42
42
 
43
- class VectorBackend:
43
+ class TfVectorBackend:
44
44
  """
45
45
  Deterministic vector backend using term-frequency cosine similarity.
46
46
 
@@ -48,7 +48,7 @@ class VectorBackend:
48
48
  :vartype backend_id: str
49
49
  """
50
50
 
51
- backend_id = "vector"
51
+ backend_id = "tf-vector"
52
52
 
53
53
  def build_run(
54
54
  self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
@@ -65,7 +65,7 @@ class VectorBackend:
65
65
  :return: Run manifest describing the build.
66
66
  :rtype: RetrievalRun
67
67
  """
68
- recipe_config = VectorRecipeConfig.model_validate(config)
68
+ recipe_config = TfVectorRecipeConfig.model_validate(config)
69
69
  catalog = corpus.load_catalog()
70
70
  recipe = create_recipe_manifest(
71
71
  backend_id=self.backend_id,
@@ -102,7 +102,7 @@ class VectorBackend:
102
102
  :return: Retrieval results containing evidence.
103
103
  :rtype: RetrievalResult
104
104
  """
105
- recipe_config = VectorRecipeConfig.model_validate(run.recipe.config)
105
+ recipe_config = TfVectorRecipeConfig.model_validate(run.recipe.config)
106
106
  query_tokens = _tokenize_text(query_text)
107
107
  if not query_tokens:
108
108
  return RetrievalResult(
@@ -157,7 +157,7 @@ class VectorBackend:
157
157
 
158
158
 
159
159
  def _resolve_extraction_reference(
160
- corpus: Corpus, recipe_config: VectorRecipeConfig
160
+ corpus: Corpus, recipe_config: TfVectorRecipeConfig
161
161
  ) -> Optional[ExtractionRunReference]:
162
162
  """
163
163
  Resolve an extraction run reference from a recipe config.
@@ -165,7 +165,7 @@ def _resolve_extraction_reference(
165
165
  :param corpus: Corpus associated with the recipe.
166
166
  :type corpus: Corpus
167
167
  :param recipe_config: Parsed vector recipe configuration.
168
- :type recipe_config: VectorRecipeConfig
168
+ :type recipe_config: TfVectorRecipeConfig
169
169
  :return: Parsed extraction reference or None.
170
170
  :rtype: ExtractionRunReference or None
171
171
  :raises FileNotFoundError: If an extraction run is referenced but not present.
@@ -183,7 +183,7 @@ def _resolve_extraction_reference(
183
183
 
184
184
 
185
185
  def _count_text_items(
186
- corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
186
+ corpus: Corpus, items: Iterable[object], recipe_config: TfVectorRecipeConfig
187
187
  ) -> int:
188
188
  """
189
189
  Count catalog items that represent text content.
@@ -193,7 +193,7 @@ def _count_text_items(
193
193
  :param items: Catalog items to inspect.
194
194
  :type items: Iterable[object]
195
195
  :param recipe_config: Parsed vector recipe configuration.
196
- :type recipe_config: VectorRecipeConfig
196
+ :type recipe_config: TfVectorRecipeConfig
197
197
  :return: Number of text items.
198
198
  :rtype: int
199
199
  """
@@ -451,7 +451,7 @@ def _score_items(
451
451
  content_ref=None,
452
452
  span_start=span_start,
453
453
  span_end=span_end,
454
- stage="vector",
454
+ stage="tf-vector",
455
455
  recipe_id="",
456
456
  run_id="",
457
457
  hash=hash_text(snippet),