biblicus-0.15.1-py3-none-any.whl → biblicus-0.16.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
     "RetrievalRun",
 ]
 
-__version__ = "0.15.1"
+__version__ = "0.16.0"
@@ -686,6 +686,15 @@ def _build_observations(
         llm = config.llm_observations
         assert llm.client is not None and llm.prompt_template is not None
         for index, observation in enumerate(observations):
+            if observation.segment_text in {"START", "END"}:
+                observations[index] = observation.model_copy(
+                    update={
+                        "llm_label": observation.segment_text,
+                        "llm_label_confidence": 1.0,
+                        "llm_summary": observation.segment_text,
+                    }
+                )
+                continue
             prompt = llm.prompt_template.format(segment=observation.segment_text)
             response_text = generate_completion(
                 client=llm.client,
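
The short-circuit above leans on pydantic's model_copy(update=...), which returns a new model instance with only the named fields replaced, leaving the original untouched. A minimal illustration of that semantics (the Observation model here is invented for the example, not the package's class):

from typing import Optional

from pydantic import BaseModel


class Observation(BaseModel):
    segment_text: str
    llm_label: Optional[str] = None
    llm_label_confidence: Optional[float] = None


obs = Observation(segment_text="START")
copy = obs.model_copy(update={"llm_label": "START", "llm_label_confidence": 1.0})
assert obs.llm_label is None  # the original is unchanged
assert copy.llm_label == "START"  # the copy carries the update
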
@@ -707,8 +716,12 @@ def _build_observations(
     if config.embeddings.enabled:
         embedding_config = config.embeddings
         assert embedding_config.client is not None
+        embed_indices: List[int] = []
         embed_texts: List[str] = []
-        for observation in observations:
+        for index, observation in enumerate(observations):
+            if observation.segment_text in {"START", "END"}:
+                continue
+            embed_indices.append(index)
             if embedding_config.text_source == "segment_text":
                 embed_texts.append(observation.segment_text)
             else:
@@ -717,10 +730,29 @@ def _build_observations(
                         "embeddings.text_source is 'llm_summary' but llm_summary is missing"
                     )
                 embed_texts.append(observation.llm_summary)
+
+        if not embed_indices:
+            raise ValueError("Embeddings require at least one non-boundary segment")
+
         vectors = generate_embeddings_batch(client=embedding_config.client, texts=embed_texts)
+        if len(vectors) != len(embed_indices):
+            raise ValueError(
+                "Embedding provider returned unexpected vector count: "
+                f"expected {len(embed_indices)} but got {len(vectors)}"
+            )
+
+        vector_by_observation_index: Dict[int, List[float]] = {}
+        for observation_index, vector in zip(embed_indices, vectors):
+            vector_by_observation_index[observation_index] = list(vector)
+
+        embedding_dimension = len(next(iter(vector_by_observation_index.values())))
+        boundary_embedding = [0.0 for _ in range(embedding_dimension)]
         updated: List[MarkovAnalysisObservation] = []
-        for observation, vector in zip(observations, vectors):
-            updated.append(observation.model_copy(update={"embedding": vector}))
+        for index, observation in enumerate(observations):
+            vector = vector_by_observation_index.get(index)
+            updated.append(
+                observation.model_copy(update={"embedding": vector or boundary_embedding})
+            )
         observations = updated
 
     return observations
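
Taken together, these hunks implement a skip-and-realign pattern: boundary sentinels ("START"/"END") never reach the LLM or the embedding provider, and the provider's output is mapped back to the original observation indices, with sentinels padded by a zero vector of matching dimension. A standalone sketch of that realignment step (names here are illustrative, not the package's):

from typing import Dict, List, Set


def align_vectors(
    texts: List[str], vectors: List[List[float]], skip: Set[str]
) -> List[List[float]]:
    """Re-align provider vectors to original positions, zero-padding skipped texts."""
    kept = [i for i, t in enumerate(texts) if t not in skip]
    if not kept:
        raise ValueError("need at least one non-skipped text")
    if len(vectors) != len(kept):
        raise ValueError(f"expected {len(kept)} vectors, got {len(vectors)}")
    by_index: Dict[int, List[float]] = dict(zip(kept, vectors))
    zero = [0.0] * len(vectors[0])
    return [by_index.get(i, zero) for i in range(len(texts))]


print(align_vectors(["START", "a", "END"], [[0.1, 0.2]], {"START", "END"}))
# [[0.0, 0.0], [0.1, 0.2], [0.0, 0.0]]
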
@@ -7,10 +7,12 @@ from __future__ import annotations
 from typing import Dict, Type
 
 from .base import RetrievalBackend
+from .embedding_index_file import EmbeddingIndexFileBackend
+from .embedding_index_inmemory import EmbeddingIndexInMemoryBackend
 from .hybrid import HybridBackend
 from .scan import ScanBackend
 from .sqlite_full_text_search import SqliteFullTextSearchBackend
-from .vector import VectorBackend
+from .tf_vector import TfVectorBackend
 
 
 def available_backends() -> Dict[str, Type[RetrievalBackend]]:
@@ -21,10 +23,12 @@ def available_backends() -> Dict[str, Type[RetrievalBackend]]:
     :rtype: dict[str, Type[RetrievalBackend]]
     """
     return {
+        EmbeddingIndexFileBackend.backend_id: EmbeddingIndexFileBackend,
+        EmbeddingIndexInMemoryBackend.backend_id: EmbeddingIndexInMemoryBackend,
         HybridBackend.backend_id: HybridBackend,
         ScanBackend.backend_id: ScanBackend,
         SqliteFullTextSearchBackend.backend_id: SqliteFullTextSearchBackend,
-        VectorBackend.backend_id: VectorBackend,
+        TfVectorBackend.backend_id: TfVectorBackend,
     }
 
 
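With the registry extended, callers resolve a backend class by its backend_id string and instantiate it. A hedged usage sketch against available_backends() (the lookup and error handling are illustrative; the package may expose its own resolver):

backends = available_backends()
backend_cls = backends.get("embedding-index-file")
if backend_cls is None:
    raise KeyError(f"unknown backend; known ids: {sorted(backends)}")
backend = backend_cls()

Note the rename: VectorBackend is gone from the registry, so recipes pinned to its old backend_id will presumably need to move to TfVectorBackend.backend_id or one of the new embedding-index backends after upgrading.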
embedding_index_common.py ADDED

@@ -0,0 +1,301 @@
+"""
+Shared primitives for embedding-index retrieval backends.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple
+
+import numpy as np
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+from ..chunking import ChunkerConfig, TextChunk, TokenizerConfig
+from ..corpus import CORPUS_DIR_NAME, RUNS_DIR_NAME, Corpus
+from ..embedding_providers import EmbeddingProviderConfig, _l2_normalize_rows
+from ..frontmatter import parse_front_matter
+from ..models import ExtractionRunReference, parse_extraction_run_reference
+
+
+class ChunkRecord(BaseModel):
+    """
+    Minimal persisted representation of a chunk.
+
+    :ivar item_id: Item identifier that produced the chunk.
+    :vartype item_id: str
+    :ivar span_start: Inclusive start character offset.
+    :vartype span_start: int
+    :ivar span_end: Exclusive end character offset.
+    :vartype span_end: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    item_id: str = Field(min_length=1)
+    span_start: int = Field(ge=0)
+    span_end: int = Field(ge=0)
+
+    @model_validator(mode="after")
+    def _validate_span(self) -> "ChunkRecord":
+        if self.span_end <= self.span_start:
+            raise ValueError("chunk span_end must be greater than span_start")
+        return self
+
+
+class EmbeddingIndexRecipeConfig(BaseModel):
+    """
+    Configuration for embedding-index retrieval backends.
+
+    :ivar snippet_characters: Maximum characters to include in evidence snippets.
+    :vartype snippet_characters: int
+    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
+    :vartype extraction_run: str or None
+    :ivar chunker: Chunker configuration.
+    :vartype chunker: biblicus.chunking.ChunkerConfig
+    :ivar tokenizer: Optional tokenizer configuration.
+    :vartype tokenizer: biblicus.chunking.TokenizerConfig or None
+    :ivar embedding_provider: Embedding provider configuration.
+    :vartype embedding_provider: biblicus.embedding_providers.EmbeddingProviderConfig
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    snippet_characters: int = Field(default=400, ge=1)
+    extraction_run: Optional[str] = None
+    chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
+    tokenizer: Optional[TokenizerConfig] = None
+    embedding_provider: EmbeddingProviderConfig
+
+
+def resolve_extraction_reference(
+    corpus: Corpus, recipe_config: EmbeddingIndexRecipeConfig
+) -> Optional[ExtractionRunReference]:
+    """
+    Resolve an extraction run reference from an embedding-index recipe config.
+
+    :param corpus: Corpus associated with the recipe.
+    :type corpus: Corpus
+    :param recipe_config: Parsed embedding-index recipe configuration.
+    :type recipe_config: EmbeddingIndexRecipeConfig
+    :return: Parsed extraction reference or None.
+    :rtype: ExtractionRunReference or None
+    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    """
+    if not recipe_config.extraction_run:
+        return None
+    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
+    run_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_reference.extractor_id,
+        run_id=extraction_reference.run_id,
+    )
+    if not run_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    return extraction_reference
+
+
+def _load_text_from_item(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
+    if extraction_reference:
+        extracted_text = corpus.read_extracted_text(
+            extractor_id=extraction_reference.extractor_id,
+            run_id=extraction_reference.run_id,
+            item_id=item_id,
+        )
+        if isinstance(extracted_text, str):
+            return extracted_text
+
+    if media_type == "text/markdown":
+        raw = (corpus.root / relpath).read_text(encoding="utf-8")
+        return parse_front_matter(raw).body
+    if media_type.startswith("text/"):
+        return (corpus.root / relpath).read_text(encoding="utf-8")
+    return None
+
+
+def iter_text_payloads(
+    corpus: Corpus, *, extraction_reference: Optional[ExtractionRunReference]
+) -> Iterator[Tuple[object, str]]:
+    """
+    Yield catalog items and their text payloads.
+
+    :param corpus: Corpus containing the items.
+    :type corpus: Corpus
+    :param extraction_reference: Optional extraction reference.
+    :type extraction_reference: ExtractionRunReference or None
+    :yield: (catalog_item, text) pairs.
+    :rtype: Iterator[tuple[object, str]]
+    """
+    catalog = corpus.load_catalog()
+    for catalog_item in catalog.items.values():
+        item_id = str(getattr(catalog_item, "id", ""))
+        relpath = str(getattr(catalog_item, "relpath", ""))
+        media_type = str(getattr(catalog_item, "media_type", ""))
+        if not item_id or not relpath or not media_type:
+            continue
+        text = _load_text_from_item(
+            corpus,
+            item_id=item_id,
+            relpath=relpath,
+            media_type=media_type,
+            extraction_reference=extraction_reference,
+        )
+        if not isinstance(text, str) or not text.strip():
+            continue
+        yield catalog_item, text
+
+
+def collect_chunks(
+    corpus: Corpus, *, recipe_config: EmbeddingIndexRecipeConfig
+) -> Tuple[List[TextChunk], int]:
+    """
+    Collect chunks from text payloads in a corpus.
+
+    :param corpus: Corpus to chunk.
+    :type corpus: Corpus
+    :param recipe_config: Parsed embedding-index recipe configuration.
+    :type recipe_config: EmbeddingIndexRecipeConfig
+    :return: (chunks, text_item_count)
+    :rtype: tuple[list[TextChunk], int]
+    """
+    tokenizer = recipe_config.tokenizer.build_tokenizer() if recipe_config.tokenizer else None
+    chunker = recipe_config.chunker.build_chunker(tokenizer=tokenizer)
+    extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+
+    chunks: List[TextChunk] = []
+    next_chunk_id = 0
+    text_items = 0
+    for catalog_item, text in iter_text_payloads(corpus, extraction_reference=extraction_reference):
+        text_items += 1
+        item_id = str(getattr(catalog_item, "id"))
+        item_chunks = chunker.chunk_text(
+            item_id=item_id, text=text, starting_chunk_id=next_chunk_id
+        )
+        if item_chunks:
+            next_chunk_id = item_chunks[-1].chunk_id + 1
+        chunks.extend(item_chunks)
+    return chunks, text_items
+
+
+def chunks_to_records(chunks: Iterable[TextChunk]) -> List[ChunkRecord]:
+    """
+    Convert chunk objects to persisted chunk records.
+
+    :param chunks: Chunk list.
+    :type chunks: Iterable[TextChunk]
+    :return: Chunk record list.
+    :rtype: list[ChunkRecord]
+    """
+    records: List[ChunkRecord] = []
+    for chunk in chunks:
+        records.append(
+            ChunkRecord(
+                item_id=chunk.item_id,
+                span_start=chunk.span_start,
+                span_end=chunk.span_end,
+            )
+        )
+    return records
+
+
+def write_chunks_jsonl(path: Path, records: Iterable[ChunkRecord]) -> None:
+    """
+    Write chunk records as newline-delimited JSON.
+
+    :param path: Destination path.
+    :type path: pathlib.Path
+    :param records: Chunk records.
+    :type records: Iterable[ChunkRecord]
+    :return: None.
+    :rtype: None
+    """
+    with path.open("w", encoding="utf-8") as handle:
+        for record in records:
+            handle.write(json.dumps(record.model_dump(), separators=(",", ":")) + "\n")
+
+
+def read_chunks_jsonl(path: Path) -> List[ChunkRecord]:
+    """
+    Read chunk records from a JSON Lines file.
+
+    :param path: Source path.
+    :type path: pathlib.Path
+    :return: Chunk record list.
+    :rtype: list[ChunkRecord]
+    """
+    records: List[ChunkRecord] = []
+    for line in path.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        records.append(ChunkRecord.model_validate(json.loads(line)))
+    return records
+
+
+def write_embeddings(path: Path, embeddings: np.ndarray) -> None:
+    """
+    Write embeddings to disk.
+
+    :param path: Destination path.
+    :type path: pathlib.Path
+    :param embeddings: Embedding matrix.
+    :type embeddings: numpy.ndarray
+    :return: None.
+    :rtype: None
+    """
+    np.save(path, embeddings.astype(np.float32))
+
+
+def read_embeddings(path: Path, *, mmap: bool) -> np.ndarray:
+    """
+    Read embeddings from disk.
+
+    :param path: Source path.
+    :type path: pathlib.Path
+    :param mmap: Whether to memory-map the file.
+    :type mmap: bool
+    :return: Embedding matrix.
+    :rtype: numpy.ndarray
+    """
+    mode = "r" if mmap else None
+    return np.load(path, mmap_mode=mode)
+
+
+def cosine_similarity_scores(embeddings: np.ndarray, query_vector: np.ndarray) -> np.ndarray:
+    """
+    Compute cosine similarity scores for a query vector.
+
+    The embedding matrix must already be L2-normalized.
+
+    :param embeddings: Embedding matrix of shape (n, d).
+    :type embeddings: numpy.ndarray
+    :param query_vector: Query vector of shape (d,).
+    :type query_vector: numpy.ndarray
+    :return: Score vector of shape (n,).
+    :rtype: numpy.ndarray
+    """
+    query_vector = query_vector.astype(np.float32).reshape(-1)
+    query_vector = _l2_normalize_rows(query_vector.reshape(1, -1)).reshape(-1)
+    return embeddings @ query_vector
+
+
+def artifact_paths_for_run(*, run_id: str, backend_id: str) -> Dict[str, str]:
+    """
+    Build deterministic artifact relative paths for an embedding index run.
+
+    :param run_id: Run identifier.
+    :type run_id: str
+    :param backend_id: Backend identifier.
+    :type backend_id: str
+    :return: Mapping with keys embeddings and chunks.
+    :rtype: dict[str, str]
+    """
+    prefix = f"{run_id}.{backend_id}"
+    embeddings_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.embeddings.npy")
+    chunks_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.chunks.jsonl")
+    return {"embeddings": embeddings_relpath, "chunks": chunks_relpath}
embedding_index_file.py ADDED

@@ -0,0 +1,266 @@
+"""
+Embedding-index retrieval backend that reads the embedding matrix via memory mapping.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+import numpy as np
+
+from ..corpus import Corpus
+from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
+from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..time import utc_now_iso
+from .embedding_index_common import (
+    ChunkRecord,
+    EmbeddingIndexRecipeConfig,
+    artifact_paths_for_run,
+    chunks_to_records,
+    collect_chunks,
+    cosine_similarity_scores,
+    read_chunks_jsonl,
+    read_embeddings,
+    resolve_extraction_reference,
+    write_chunks_jsonl,
+    write_embeddings,
+)
+from .scan import _build_snippet
+
+
+class EmbeddingIndexFileBackend:
+    """
+    Embedding retrieval backend using memory-mapped similarity scanning.
+    """
+
+    backend_id = "embedding-index-file"
+
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
+        """
+        Build an embedding index run by chunking text payloads and materializing embeddings.
+
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Backend-specific configuration values.
+        :type config: dict[str, object]
+        :return: Run manifest describing the build.
+        :rtype: biblicus.models.RetrievalRun
+        """
+        recipe_config = EmbeddingIndexRecipeConfig.model_validate(config)
+        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
+
+        provider = recipe_config.embedding_provider.build_provider()
+        embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)
+
+        recipe = create_recipe_manifest(
+            backend_id=self.backend_id,
+            name=recipe_name,
+            config=recipe_config.model_dump(),
+        )
+        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
+
+        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        embeddings_path = corpus.root / paths["embeddings"]
+        chunks_path = corpus.root / paths["chunks"]
+        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+
+        write_embeddings(embeddings_path, embeddings)
+        write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
+
+        stats = {
+            "items": len(corpus.load_catalog().items),
+            "text_items": text_items,
+            "chunks": len(chunks),
+            "dimensions": (
+                int(embeddings.shape[1])
+                if embeddings.size
+                else recipe_config.embedding_provider.dimensions
+            ),
+        }
+        run = run.model_copy(
+            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
+        )
+        corpus.write_run(run)
+        return run
+
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        run: RetrievalRun,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Query an embedding index run and return ranked evidence.
+
+        :param corpus: Corpus associated with the run.
+        :type corpus: Corpus
+        :param run: Run manifest to use for querying.
+        :type run: biblicus.models.RetrievalRun
+        :param query_text: Query text to embed.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: biblicus.models.QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: biblicus.models.RetrievalResult
+        """
+        recipe_config = EmbeddingIndexRecipeConfig.model_validate(run.recipe.config)
+        extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+
+        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        embeddings_path = corpus.root / paths["embeddings"]
+        chunks_path = corpus.root / paths["chunks"]
+        if not embeddings_path.is_file() or not chunks_path.is_file():
+            raise FileNotFoundError("Embedding index artifacts are missing for this run")
+
+        embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
+        chunk_records = read_chunks_jsonl(chunks_path)
+        if embeddings.shape[0] != len(chunk_records):
+            raise ValueError(
+                "Embedding index artifacts are inconsistent: "
+                "embeddings row count does not match chunk record count"
+            )
+
+        provider = recipe_config.embedding_provider.build_provider()
+        query_embedding = provider.embed_texts([query_text]).astype(np.float32)
+        if query_embedding.shape[0] != 1:
+            raise ValueError("Embedding provider returned an invalid query embedding shape")
+
+        candidates = _top_indices_batched(
+            embeddings=embeddings,
+            query_vector=query_embedding[0],
+            limit=_candidate_limit(budget.max_total_items + budget.offset),
+        )
+        evidence_items = _build_evidence(
+            corpus,
+            run=run,
+            recipe_config=recipe_config,
+            candidates=candidates,
+            embeddings=embeddings,
+            query_vector=query_embedding[0],
+            chunk_records=chunk_records,
+            extraction_reference=extraction_reference,
+        )
+        ranked = [
+            item.model_copy(
+                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
+            )
+            for index, item in enumerate(evidence_items, start=1)
+        ]
+        evidence = apply_budget(ranked, budget)
+        return RetrievalResult(
+            query_text=query_text,
+            budget=budget,
+            run_id=run.run_id,
+            recipe_id=run.recipe.recipe_id,
+            backend_id=self.backend_id,
+            generated_at=utc_now_iso(),
+            evidence=evidence,
+            stats={"candidates": len(evidence_items), "returned": len(evidence)},
+        )
+
+
+def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
+    return max(1, int(max_total_items) * int(multiplier))
+
+
+@dataclass(frozen=True)
+class _ScoredIndex:
+    score: float
+    index: int
+
+
+def _top_indices_batched(
+    *, embeddings: np.ndarray, query_vector: np.ndarray, limit: int, batch_rows: int = 4096
+) -> List[int]:
+    if embeddings.size == 0:
+        return []
+    limit = min(int(limit), int(embeddings.shape[0]))
+
+    best: List[_ScoredIndex] = []
+    for start in range(0, embeddings.shape[0], int(batch_rows)):
+        end = min(start + int(batch_rows), embeddings.shape[0])
+        scores = cosine_similarity_scores(embeddings[start:end], query_vector)
+        batch_limit = min(limit, int(scores.size))
+        if batch_limit <= 0:
+            continue
+        indices = np.argpartition(-scores, batch_limit - 1)[:batch_limit]
+        for local_index in indices:
+            global_index = int(start + int(local_index))
+            best.append(_ScoredIndex(score=float(scores[int(local_index)]), index=global_index))
+
+    best.sort(key=lambda item: (-item.score, item.index))
+    return [int(item.index) for item in best[:limit]]
+
+
+def _build_evidence(
+    corpus: Corpus,
+    *,
+    run: RetrievalRun,
+    recipe_config: EmbeddingIndexRecipeConfig,
+    candidates: List[int],
+    embeddings: np.ndarray,
+    query_vector: np.ndarray,
+    chunk_records: List[ChunkRecord],
+    extraction_reference: Optional[ExtractionRunReference],
+) -> List[Evidence]:
+    catalog = corpus.load_catalog()
+    evidence_items: List[Evidence] = []
+    for idx in candidates:
+        record = chunk_records[idx]
+        catalog_item = catalog.items[record.item_id]
+        text = _load_text_for_evidence(
+            corpus,
+            item_id=record.item_id,
+            relpath=str(getattr(catalog_item, "relpath")),
+            media_type=str(getattr(catalog_item, "media_type")),
+            extraction_reference=extraction_reference,
+        )
+        snippet = _build_snippet(
+            text, (record.span_start, record.span_end), max_chars=recipe_config.snippet_characters
+        )
+        score = float(cosine_similarity_scores(embeddings[idx : idx + 1], query_vector)[0])
+        evidence_items.append(
+            Evidence(
+                item_id=record.item_id,
+                source_uri=getattr(catalog_item, "source_uri", None),
+                media_type=str(getattr(catalog_item, "media_type")),
+                score=score,
+                rank=1,
+                text=snippet,
+                content_ref=None,
+                span_start=record.span_start,
+                span_end=record.span_end,
+                stage=EmbeddingIndexFileBackend.backend_id,
+                stage_scores=None,
+                recipe_id=run.recipe.recipe_id,
+                run_id=run.run_id,
+                hash=hash_text(snippet),
+            )
+        )
+    return evidence_items
+
+
+def _load_text_for_evidence(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
+    from .embedding_index_common import _load_text_from_item
+
+    return _load_text_from_item(
+        corpus,
+        item_id=item_id,
+        relpath=relpath,
+        media_type=media_type,
+        extraction_reference=extraction_reference,
+    )
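
_top_indices_batched bounds memory by scoring the memory-mapped matrix one row-batch at a time, keeping only each batch's top candidates via np.argpartition, then sorting the merged pool once. The same pattern in isolation (a standalone sketch, not the package's function):

import numpy as np


def top_k_batched(matrix, query, k, batch_rows=4096):
    """Indices of the k highest dot-product scores, scanned in row batches."""
    k = min(int(k), matrix.shape[0])
    if k <= 0:
        return []
    candidates = []
    for start in range(0, matrix.shape[0], batch_rows):
        scores = matrix[start : start + batch_rows] @ query
        take = min(k, scores.size)
        # argpartition selects the top `take` scores without a full sort.
        local = np.argpartition(-scores, take - 1)[:take]
        candidates.extend((float(scores[i]), start + int(i)) for i in local)
    candidates.sort(key=lambda pair: (-pair[0], pair[1]))
    return [index for _, index in candidates[:k]]


# Agrees with a full sort on small data.
m = np.arange(20, dtype=np.float32).reshape(10, 2)
q = np.array([1.0, 0.5], dtype=np.float32)
assert top_k_batched(m, q, 3, batch_rows=4) == list(np.argsort(-(m @ q))[:3])
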