biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
  43. biblicus-1.1.0.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
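Taken together, the moves above amount to a vocabulary change: retrieval "backends" become "retrievers", "recipes" become "configurations", and build "runs" become "snapshots" (biblicus/backends/* is removed in favor of biblicus/retrievers/*, and recipes.py becomes configuration.py). As a minimal sketch of what that means at a call site, assuming biblicus 1.1.0 is installed and that the corpus, query budget, and embedding-provider settings are created elsewhere (the function name build_and_query and the configuration name "default" are illustrative, not part of the package):

from biblicus.corpus import Corpus
from biblicus.models import QueryBudget, RetrievalResult
from biblicus.retrievers.embedding_index_file import EmbeddingIndexFileRetriever
# 1.0.0: from biblicus.backends.embedding_index_file import EmbeddingIndexFileBackend


def build_and_query(corpus: Corpus, budget: QueryBudget, provider_settings: dict) -> RetrievalResult:
    # 1.0.0: EmbeddingIndexFileBackend().build_run(corpus, recipe_name=..., config=...) -> RetrievalRun
    retriever = EmbeddingIndexFileRetriever()
    snapshot = retriever.build_snapshot(
        corpus,
        configuration_name="default",
        configuration={"embedding_provider": provider_settings},
    )
    # 1.0.0: backend.query(corpus, run=run, ...) -> RetrievalResult
    return retriever.query(corpus, snapshot=snapshot, query_text="example query", budget=budget)

The hunks below show the two embedding-index modules in full.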
biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
@@ -1,5 +1,5 @@
 """
-Shared primitives for embedding-index retrieval backends.
+Shared primitives for embedding-index retrievers.
 """

 from __future__ import annotations
@@ -12,10 +12,11 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field, model_validator

 from ..chunking import ChunkerConfig, TextChunk, TokenizerConfig
-from ..corpus import CORPUS_DIR_NAME, RUNS_DIR_NAME, Corpus
+from ..constants import CORPUS_DIR_NAME, SNAPSHOTS_DIR_NAME
+from ..corpus import Corpus
 from ..embedding_providers import EmbeddingProviderConfig, _l2_normalize_rows
 from ..frontmatter import parse_front_matter
-from ..models import ExtractionRunReference, parse_extraction_run_reference
+from ..models import ExtractionSnapshotReference, parse_extraction_snapshot_reference


 class ChunkRecord(BaseModel):
@@ -43,12 +44,12 @@ class ChunkRecord(BaseModel):
         return self


-class EmbeddingIndexRecipeConfig(BaseModel):
+class EmbeddingIndexConfiguration(BaseModel):
     """
-    Configuration for embedding-index retrieval backends.
+    Configuration for embedding-index retrievers.

-    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
-    :vartype extraction_run: str or None
+    :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
+    :vartype extraction_snapshot: str or None
     :ivar chunker: Chunker configuration.
     :vartype chunker: biblicus.chunking.ChunkerConfig
     :ivar tokenizer: Optional tokenizer configuration.
@@ -68,7 +69,7 @@ class EmbeddingIndexRecipeConfig(BaseModel):
     snippet_characters: Optional[int] = Field(default=None, ge=1)
     maximum_cache_total_items: Optional[int] = Field(default=None, ge=1)
     maximum_cache_total_characters: Optional[int] = Field(default=None, ge=1)
-    extraction_run: Optional[str] = None
+    extraction_snapshot: Optional[str] = None
     chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
     tokenizer: Optional[TokenizerConfig] = None
     embedding_provider: EmbeddingProviderConfig
@@ -102,28 +103,28 @@ def _build_snippet(


 def resolve_extraction_reference(
-    corpus: Corpus, recipe_config: EmbeddingIndexRecipeConfig
-) -> Optional[ExtractionRunReference]:
+    corpus: Corpus, configuration: EmbeddingIndexConfiguration
+) -> Optional[ExtractionSnapshotReference]:
     """
-    Resolve an extraction run reference from an embedding-index recipe config.
+    Resolve an extraction snapshot reference from an embedding-index configuration.

-    :param corpus: Corpus associated with the recipe.
+    :param corpus: Corpus associated with the configuration.
     :type corpus: Corpus
-    :param recipe_config: Parsed embedding-index recipe configuration.
-    :type recipe_config: EmbeddingIndexRecipeConfig
+    :param configuration: Parsed embedding-index configuration.
+    :type configuration: EmbeddingIndexConfiguration
     :return: Parsed extraction reference or None.
-    :rtype: ExtractionRunReference or None
-    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    :rtype: ExtractionSnapshotReference or None
+    :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
     """
-    if not recipe_config.extraction_run:
+    if not configuration.extraction_snapshot:
         return None
-    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
-    run_dir = corpus.extraction_run_dir(
+    extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
+    snapshot_dir = corpus.extraction_snapshot_dir(
         extractor_id=extraction_reference.extractor_id,
-        run_id=extraction_reference.run_id,
+        snapshot_id=extraction_reference.snapshot_id,
     )
-    if not run_dir.is_dir():
-        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    if not snapshot_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
     return extraction_reference


@@ -133,12 +134,12 @@ def _load_text_from_item(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     if extraction_reference:
         extracted_text = corpus.read_extracted_text(
             extractor_id=extraction_reference.extractor_id,
-            run_id=extraction_reference.run_id,
+            snapshot_id=extraction_reference.snapshot_id,
             item_id=item_id,
         )
         if isinstance(extracted_text, str):
@@ -153,7 +154,7 @@ def _load_text_from_item(


 def iter_text_payloads(
-    corpus: Corpus, *, extraction_reference: Optional[ExtractionRunReference]
+    corpus: Corpus, *, extraction_reference: Optional[ExtractionSnapshotReference]
 ) -> Iterator[Tuple[object, str]]:
     """
     Yield catalog items and their text payloads.
@@ -161,7 +162,7 @@ def iter_text_payloads(
     :param corpus: Corpus containing the items.
     :type corpus: Corpus
     :param extraction_reference: Optional extraction reference.
-    :type extraction_reference: ExtractionRunReference or None
+    :type extraction_reference: ExtractionSnapshotReference or None
     :yield: (catalog_item, text) pairs.
     :rtype: Iterator[tuple[object, str]]
     """
@@ -185,21 +186,21 @@ def iter_text_payloads(


 def collect_chunks(
-    corpus: Corpus, *, recipe_config: EmbeddingIndexRecipeConfig
+    corpus: Corpus, *, configuration: EmbeddingIndexConfiguration
 ) -> Tuple[List[TextChunk], int]:
     """
     Collect chunks from text payloads in a corpus.

     :param corpus: Corpus to chunk.
     :type corpus: Corpus
-    :param recipe_config: Parsed embedding-index recipe configuration.
-    :type recipe_config: EmbeddingIndexRecipeConfig
+    :param configuration: Parsed embedding-index configuration.
+    :type configuration: EmbeddingIndexConfiguration
     :return: (chunks, text_item_count)
     :rtype: tuple[list[TextChunk], int]
     """
-    tokenizer = recipe_config.tokenizer.build_tokenizer() if recipe_config.tokenizer else None
-    chunker = recipe_config.chunker.build_chunker(tokenizer=tokenizer)
-    extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+    tokenizer = configuration.tokenizer.build_tokenizer() if configuration.tokenizer else None
+    chunker = configuration.chunker.build_chunker(tokenizer=tokenizer)
+    extraction_reference = resolve_extraction_reference(corpus, configuration)

     chunks: List[TextChunk] = []
     next_chunk_id = 0
@@ -317,18 +318,20 @@ def cosine_similarity_scores(embeddings: np.ndarray, query_vector: np.ndarray) -
     return embeddings @ query_vector


-def artifact_paths_for_run(*, run_id: str, backend_id: str) -> Dict[str, str]:
+def artifact_paths_for_snapshot(*, snapshot_id: str, retriever_id: str) -> Dict[str, str]:
     """
-    Build deterministic artifact relative paths for an embedding index run.
+    Build deterministic artifact relative paths for an embedding index snapshot.

-    :param run_id: Run identifier.
-    :type run_id: str
-    :param backend_id: Backend identifier.
-    :type backend_id: str
+    :param snapshot_id: Snapshot identifier.
+    :type snapshot_id: str
+    :param retriever_id: Retriever identifier.
+    :type retriever_id: str
     :return: Mapping with keys embeddings and chunks.
     :rtype: dict[str, str]
     """
-    prefix = f"{run_id}.{backend_id}"
-    embeddings_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.embeddings.npy")
-    chunks_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.chunks.jsonl")
+    prefix = f"{snapshot_id}.{retriever_id}"
+    embeddings_relpath = str(
+        Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.embeddings.npy"
+    )
+    chunks_relpath = str(Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.chunks.jsonl")
     return {"embeddings": embeddings_relpath, "chunks": chunks_relpath}
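The renamed artifact_paths_for_snapshot helper keeps the deterministic "{snapshot_id}.{retriever_id}" prefix but now writes under the snapshots directory rather than the runs directory. A quick, hedged usage sketch (the snapshot identifier below is made up for illustration; the directory names come from CORPUS_DIR_NAME and SNAPSHOTS_DIR_NAME in biblicus.constants and are shown only as placeholders in the comments):

from biblicus.retrievers.embedding_index_common import artifact_paths_for_snapshot

# Both artifacts share the same prefix, so a snapshot can always locate its own files.
paths = artifact_paths_for_snapshot(
    snapshot_id="snap-0001",              # illustrative identifier, not a real snapshot
    retriever_id="embedding-index-file",  # retriever_id defined in the diff below
)
# paths["embeddings"] -> "<CORPUS_DIR_NAME>/<SNAPSHOTS_DIR_NAME>/snap-0001.embedding-index-file.embeddings.npy"
# paths["chunks"]     -> "<CORPUS_DIR_NAME>/<SNAPSHOTS_DIR_NAME>/snap-0001.embedding-index-file.chunks.jsonl"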
biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
@@ -1,5 +1,5 @@
 """
-Embedding-index retrieval backend that reads the embedding matrix via memory mapping.
+Embedding-index retriever that reads the embedding matrix via memory mapping.
 """

 from __future__ import annotations
@@ -10,15 +10,26 @@ from typing import Dict, List, Optional
 import numpy as np

 from ..corpus import Corpus
-from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
-from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..models import (
+    Evidence,
+    ExtractionSnapshotReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalSnapshot,
+)
+from ..retrieval import (
+    apply_budget,
+    create_configuration_manifest,
+    create_snapshot_manifest,
+    hash_text,
+)
 from ..time import utc_now_iso
 from .embedding_index_common import (
     ChunkRecord,
-    EmbeddingIndexRecipeConfig,
+    EmbeddingIndexConfiguration,
     _build_snippet,
     _extract_span_text,
-    artifact_paths_for_run,
+    artifact_paths_for_snapshot,
     chunks_to_records,
     collect_chunks,
     cosine_similarity_scores,
@@ -30,45 +41,52 @@ from .embedding_index_common import (
 )


-class EmbeddingIndexFileBackend:
+class EmbeddingIndexFileRetriever:
     """
-    Embedding retrieval backend using memory-mapped similarity scanning.
+    Embedding retrieval retriever using memory-mapped similarity scanning.
     """

-    backend_id = "embedding-index-file"
+    retriever_id = "embedding-index-file"

-    def build_run(
-        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
-    ) -> RetrievalRun:
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
         """
-        Build an embedding index run by chunking text payloads and materializing embeddings.
+        Build an embedding index snapshot by chunking text payloads and materializing embeddings.

         :param corpus: Corpus to build against.
         :type corpus: Corpus
-        :param recipe_name: Human-readable recipe name.
-        :type recipe_name: str
-        :param config: Backend-specific configuration values.
-        :type config: dict[str, object]
-        :return: Run manifest describing the build.
-        :rtype: biblicus.models.RetrievalRun
+        :param configuration_name: Human-readable configuration name.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: biblicus.models.RetrievalSnapshot
         """
-        recipe_config = EmbeddingIndexRecipeConfig.model_validate(config)
-        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
+        parsed_config = EmbeddingIndexConfiguration.model_validate(configuration)
+        chunks, text_items = collect_chunks(corpus, configuration=parsed_config)

-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)

-        recipe = create_recipe_manifest(
-            backend_id=self.backend_id,
-            name=recipe_name,
-            config=recipe_config.model_dump(),
+        configuration_manifest = create_configuration_manifest(
+            retriever_id=self.retriever_id,
+            name=configuration_name,
+            configuration=parsed_config.model_dump(),
+        )
+        snapshot = create_snapshot_manifest(
+            corpus,
+            configuration=configuration_manifest,
+            stats={},
+            snapshot_artifacts=[],
         )
-        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])

-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
-        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+        corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)

         write_embeddings(embeddings_path, embeddings)
         write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
@@ -80,30 +98,33 @@ class EmbeddingIndexFileBackend:
             "dimensions": (
                 int(embeddings.shape[1])
                 if embeddings.size
-                else recipe_config.embedding_provider.dimensions
+                else parsed_config.embedding_provider.dimensions
             ),
         }
-        run = run.model_copy(
-            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
+        snapshot = snapshot.model_copy(
+            update={
+                "snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
+                "stats": stats,
+            }
         )
-        corpus.write_run(run)
-        return run
+        corpus.write_snapshot(snapshot)
+        return snapshot

     def query(
         self,
         corpus: Corpus,
         *,
-        run: RetrievalRun,
+        snapshot: RetrievalSnapshot,
         query_text: str,
         budget: QueryBudget,
     ) -> RetrievalResult:
         """
-        Query an embedding index run and return ranked evidence.
+        Query an embedding index snapshot and return ranked evidence.

-        :param corpus: Corpus associated with the run.
+        :param corpus: Corpus associated with the snapshot.
         :type corpus: Corpus
-        :param run: Run manifest to use for querying.
-        :type run: biblicus.models.RetrievalRun
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: biblicus.models.RetrievalSnapshot
         :param query_text: Query text to embed.
         :type query_text: str
         :param budget: Evidence selection budget.
@@ -111,14 +132,18 @@ class EmbeddingIndexFileBackend:
         :return: Retrieval results containing evidence.
         :rtype: biblicus.models.RetrievalResult
         """
-        recipe_config = EmbeddingIndexRecipeConfig.model_validate(run.recipe.config)
-        extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+        parsed_config = EmbeddingIndexConfiguration.model_validate(
+            snapshot.configuration.configuration
+        )
+        extraction_reference = resolve_extraction_reference(corpus, parsed_config)

-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
         if not embeddings_path.is_file() or not chunks_path.is_file():
-            raise FileNotFoundError("Embedding index artifacts are missing for this run")
+            raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")

         embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
         chunk_records = read_chunks_jsonl(chunks_path)
@@ -128,12 +153,12 @@ class EmbeddingIndexFileBackend:
                 "embeddings row count does not match chunk record count"
             )

-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         query_embedding = provider.embed_texts([query_text]).astype(np.float32)
         if query_embedding.shape[0] != 1:
             raise ValueError("Embedding provider returned an invalid query embedding shape")

-        batch_rows = recipe_config.maximum_cache_total_items or 4096
+        batch_rows = parsed_config.maximum_cache_total_items or 4096
         candidates = _top_indices_batched(
             embeddings=embeddings,
             query_vector=query_embedding[0],
@@ -142,8 +167,8 @@
         )
         evidence_items = _build_evidence(
             corpus,
-            run=run,
-            recipe_config=recipe_config,
+            snapshot=snapshot,
+            configuration=parsed_config,
             candidates=candidates,
             embeddings=embeddings,
             query_vector=query_embedding[0],
@@ -152,7 +177,11 @@
         )
         ranked = [
             item.model_copy(
-                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
+                update={
+                    "rank": index,
+                    "configuration_id": snapshot.configuration.configuration_id,
+                    "snapshot_id": snapshot.snapshot_id,
+                }
             )
             for index, item in enumerate(evidence_items, start=1)
         ]
@@ -160,9 +189,9 @@ class EmbeddingIndexFileBackend:
         return RetrievalResult(
             query_text=query_text,
             budget=budget,
-            run_id=run.run_id,
-            recipe_id=run.recipe.recipe_id,
-            backend_id=self.backend_id,
+            snapshot_id=snapshot.snapshot_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            retriever_id=snapshot.configuration.retriever_id,
             generated_at=utc_now_iso(),
             evidence=evidence,
             stats={"candidates": len(evidence_items), "returned": len(evidence)},
@@ -205,13 +234,13 @@
 def _build_evidence(
     corpus: Corpus,
     *,
-    run: RetrievalRun,
-    recipe_config: EmbeddingIndexRecipeConfig,
+    snapshot: RetrievalSnapshot,
+    configuration: EmbeddingIndexConfiguration,
     candidates: List[int],
     embeddings: np.ndarray,
     query_vector: np.ndarray,
     chunk_records: List[ChunkRecord],
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> List[Evidence]:
     catalog = corpus.load_catalog()
     evidence_items: List[Evidence] = []
@@ -226,7 +255,7 @@ def _build_evidence(
             extraction_reference=extraction_reference,
         )
         span_text = _build_snippet(
-            text, (record.span_start, record.span_end), recipe_config.snippet_characters
+            text, (record.span_start, record.span_end), configuration.snippet_characters
         )
         if span_text is None:
             span_text = _extract_span_text(text, (record.span_start, record.span_end))
@@ -242,10 +271,10 @@
             content_ref=None,
             span_start=record.span_start,
             span_end=record.span_end,
-            stage=EmbeddingIndexFileBackend.backend_id,
+            stage=EmbeddingIndexFileRetriever.retriever_id,
             stage_scores=None,
-            recipe_id=run.recipe.recipe_id,
-            run_id=run.run_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            snapshot_id=snapshot.snapshot_id,
             metadata=getattr(catalog_item, "metadata", {}) or {},
             hash=hash_text(span_text or ""),
         )
@@ -259,7 +288,7 @@ def _load_text_for_evidence(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     from .embedding_index_common import _load_text_from_item
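The query path above is what the module docstring calls memory-mapped similarity scanning: the embedding matrix is read with mmap, scored in batches of maximum_cache_total_items rows (falling back to 4096), and cosine similarity reduces to a dot product once the rows are L2-normalized (the common module imports _l2_normalize_rows for that purpose). A standalone illustration of the pattern in NumPy, not the package's own _top_indices_batched (the shapes and the random seed are arbitrary):

import numpy as np


def top_k_batched(embeddings: np.ndarray, query: np.ndarray, k: int, batch_rows: int = 4096) -> list:
    """Scan the matrix batch_rows rows at a time and return the indices of the k best scores."""
    scores = np.empty(embeddings.shape[0], dtype=np.float32)
    for start in range(0, embeddings.shape[0], batch_rows):
        stop = min(start + batch_rows, embeddings.shape[0])
        scores[start:stop] = embeddings[start:stop] @ query  # dot product == cosine for unit-norm rows
    return np.argsort(-scores)[:k].tolist()


rng = np.random.default_rng(0)
matrix = rng.normal(size=(10_000, 8)).astype(np.float32)
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)  # L2-normalize rows so scores are cosines
print(top_k_batched(matrix, matrix[123], k=5))  # row 123 ranks first as a sanity check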