biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (57)
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/biblicus/retrievers/base.py
@@ -0,0 +1,65 @@
+"""
+Retriever interface for Biblicus retrieval engines.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Dict
+
+from ..corpus import Corpus
+from ..models import QueryBudget, RetrievalResult, RetrievalSnapshot
+
+
+class Retriever(ABC):
+    """
+    Abstract interface for retrievers.
+
+    :ivar retriever_id: Identifier string for the retriever.
+    :vartype retriever_id: str
+    """
+
+    retriever_id: str
+
+    @abstractmethod
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
+        """
+        Build or register a retrieval snapshot for the retriever.
+
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param configuration_name: Human name for the configuration.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: RetrievalSnapshot
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        snapshot: RetrievalSnapshot,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Run a retrieval query against a retriever.
+
+        :param corpus: Corpus associated with the snapshot.
+        :type corpus: Corpus
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: RetrievalSnapshot
+        :param query_text: Query text to execute.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: RetrievalResult
+        """
+        raise NotImplementedError
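
Note (not part of the diff): the new abstract base above is the contract every retrieval engine implements in 1.1.0. The sketch below shows a minimal do-nothing retriever written against it; the helper calls (create_configuration_manifest, create_snapshot_manifest, corpus.write_snapshot) and the RetrievalResult keyword arguments are read off the embedding_index_file.py hunks later in this diff, not from documentation, so treat their exact signatures as assumptions.

# Sketch only: a trivial retriever against the new Retriever ABC.
# Helper signatures are assumed to match their use in embedding_index_file.py below.
from typing import Dict

from biblicus.corpus import Corpus
from biblicus.models import QueryBudget, RetrievalResult, RetrievalSnapshot
from biblicus.retrieval import create_configuration_manifest, create_snapshot_manifest
from biblicus.retrievers.base import Retriever
from biblicus.time import utc_now_iso


class NullRetriever(Retriever):
    """Registers an empty snapshot and answers every query with zero evidence."""

    retriever_id = "null"

    def build_snapshot(
        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
    ) -> RetrievalSnapshot:
        manifest = create_configuration_manifest(
            retriever_id=self.retriever_id,
            name=configuration_name,
            configuration=dict(configuration),
        )
        snapshot = create_snapshot_manifest(
            corpus, configuration=manifest, stats={}, snapshot_artifacts=[]
        )
        corpus.write_snapshot(snapshot)
        return snapshot

    def query(
        self,
        corpus: Corpus,
        *,
        snapshot: RetrievalSnapshot,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        # A real retriever would score chunks here and apply the budget before returning.
        return RetrievalResult(
            query_text=query_text,
            budget=budget,
            snapshot_id=snapshot.snapshot_id,
            configuration_id=snapshot.configuration.configuration_id,
            retriever_id=self.retriever_id,
            generated_at=utc_now_iso(),
            evidence=[],
            stats={"candidates": 0, "returned": 0},
        )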
--- a/biblicus/backends/embedding_index_common.py
+++ b/biblicus/retrievers/embedding_index_common.py
@@ -1,5 +1,5 @@
 """
-Shared primitives for embedding-index retrieval backends.
+Shared primitives for embedding-index retrievers.
 """
 
 from __future__ import annotations
@@ -12,10 +12,11 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 from ..chunking import ChunkerConfig, TextChunk, TokenizerConfig
-from ..corpus import CORPUS_DIR_NAME, RUNS_DIR_NAME, Corpus
+from ..constants import CORPUS_DIR_NAME, SNAPSHOTS_DIR_NAME
+from ..corpus import Corpus
 from ..embedding_providers import EmbeddingProviderConfig, _l2_normalize_rows
 from ..frontmatter import parse_front_matter
-from ..models import ExtractionRunReference, parse_extraction_run_reference
+from ..models import ExtractionSnapshotReference, parse_extraction_snapshot_reference
 
 
 class ChunkRecord(BaseModel):
@@ -43,54 +44,87 @@ class ChunkRecord(BaseModel):
         return self
 
 
-class EmbeddingIndexRecipeConfig(BaseModel):
+class EmbeddingIndexConfiguration(BaseModel):
     """
-    Configuration for embedding-index retrieval backends.
+    Configuration for embedding-index retrievers.
 
-    :ivar snippet_characters: Maximum characters to include in evidence snippets.
-    :vartype snippet_characters: int
-    :ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
-    :vartype extraction_run: str or None
+    :ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
+    :vartype extraction_snapshot: str or None
     :ivar chunker: Chunker configuration.
     :vartype chunker: biblicus.chunking.ChunkerConfig
     :ivar tokenizer: Optional tokenizer configuration.
     :vartype tokenizer: biblicus.chunking.TokenizerConfig or None
     :ivar embedding_provider: Embedding provider configuration.
     :vartype embedding_provider: biblicus.embedding_providers.EmbeddingProviderConfig
+    :ivar snippet_characters: Optional maximum character count for returned evidence text.
+    :vartype snippet_characters: int or None
+    :ivar maximum_cache_total_items: Optional maximum number of vectors cached per scan batch.
+    :vartype maximum_cache_total_items: int or None
+    :ivar maximum_cache_total_characters: Optional maximum characters cached per scan batch.
+    :vartype maximum_cache_total_characters: int or None
     """
 
     model_config = ConfigDict(extra="forbid")
 
-    snippet_characters: int = Field(default=400, ge=1)
-    extraction_run: Optional[str] = None
+    snippet_characters: Optional[int] = Field(default=None, ge=1)
+    maximum_cache_total_items: Optional[int] = Field(default=None, ge=1)
+    maximum_cache_total_characters: Optional[int] = Field(default=None, ge=1)
+    extraction_snapshot: Optional[str] = None
     chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
     tokenizer: Optional[TokenizerConfig] = None
     embedding_provider: EmbeddingProviderConfig
 
 
+def _extract_span_text(text: Optional[str], span: Tuple[int, int]) -> Optional[str]:
+    if not isinstance(text, str):
+        return None
+    span_start, span_end = span
+    if span_start < 0 or span_end <= span_start:
+        return text
+    return text[span_start:span_end]
+
+
+def _build_snippet(
+    text: Optional[str], span: Tuple[int, int], max_chars: Optional[int]
+) -> Optional[str]:
+    if not isinstance(text, str):
+        return None
+    if max_chars is None:
+        return _extract_span_text(text, span)
+    if max_chars <= 0:
+        return ""
+    span_start, span_end = span
+    if span_start < 0 or span_end <= span_start:
+        return text[:max_chars]
+    half_window = max_chars // 2
+    snippet_start = max(span_start - half_window, 0)
+    snippet_end = min(span_end + half_window, len(text))
+    return text[snippet_start:snippet_end]
+
+
 def resolve_extraction_reference(
-    corpus: Corpus, recipe_config: EmbeddingIndexRecipeConfig
-) -> Optional[ExtractionRunReference]:
+    corpus: Corpus, configuration: EmbeddingIndexConfiguration
+) -> Optional[ExtractionSnapshotReference]:
     """
-    Resolve an extraction run reference from an embedding-index recipe config.
+    Resolve an extraction snapshot reference from an embedding-index configuration.
 
-    :param corpus: Corpus associated with the recipe.
+    :param corpus: Corpus associated with the configuration.
     :type corpus: Corpus
-    :param recipe_config: Parsed embedding-index recipe configuration.
-    :type recipe_config: EmbeddingIndexRecipeConfig
+    :param configuration: Parsed embedding-index configuration.
+    :type configuration: EmbeddingIndexConfiguration
     :return: Parsed extraction reference or None.
-    :rtype: ExtractionRunReference or None
-    :raises FileNotFoundError: If an extraction run is referenced but not present.
+    :rtype: ExtractionSnapshotReference or None
+    :raises FileNotFoundError: If an extraction snapshot is referenced but not present.
     """
-    if not recipe_config.extraction_run:
+    if not configuration.extraction_snapshot:
         return None
-    extraction_reference = parse_extraction_run_reference(recipe_config.extraction_run)
-    run_dir = corpus.extraction_run_dir(
+    extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
+    snapshot_dir = corpus.extraction_snapshot_dir(
         extractor_id=extraction_reference.extractor_id,
-        run_id=extraction_reference.run_id,
+        snapshot_id=extraction_reference.snapshot_id,
     )
-    if not run_dir.is_dir():
-        raise FileNotFoundError(f"Missing extraction run: {extraction_reference.as_string()}")
+    if not snapshot_dir.is_dir():
+        raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
     return extraction_reference
 
 
@@ -100,12 +134,12 @@ def _load_text_from_item(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     if extraction_reference:
         extracted_text = corpus.read_extracted_text(
             extractor_id=extraction_reference.extractor_id,
-            run_id=extraction_reference.run_id,
+            snapshot_id=extraction_reference.snapshot_id,
             item_id=item_id,
         )
         if isinstance(extracted_text, str):
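
Note (not part of the diff): the _build_snippet helper introduced a few hunks above centers the evidence snippet on the chunk span instead of truncating from the start of the text, keeping roughly max_chars // 2 characters of context on each side; the result can therefore run a little longer than max_chars when the span itself is wide. A small illustration of that behaviour, calling the internal helpers exactly as defined above:

# Illustration only: exercising the private helpers defined in the hunks above.
from biblicus.retrievers.embedding_index_common import _build_snippet, _extract_span_text

text = "".join(f"[{i:02d}]" for i in range(20))  # 80 characters: "[00][01]...[19]"
span = (40, 44)                                   # the chunk "[10]"

print(_extract_span_text(text, span))   # "[10]"
print(_build_snippet(text, span, None)) # no length cap: just the span text, "[10]"
print(_build_snippet(text, span, 16))   # half_window = 8 -> text[32:52] = "[08][09][10][11][12]"
assert _build_snippet(text, span, 16) == text[32:52]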
@@ -120,7 +154,7 @@ def _load_text_from_item(
 
 
 def iter_text_payloads(
-    corpus: Corpus, *, extraction_reference: Optional[ExtractionRunReference]
+    corpus: Corpus, *, extraction_reference: Optional[ExtractionSnapshotReference]
 ) -> Iterator[Tuple[object, str]]:
     """
     Yield catalog items and their text payloads.
@@ -128,7 +162,7 @@
     :param corpus: Corpus containing the items.
     :type corpus: Corpus
     :param extraction_reference: Optional extraction reference.
-    :type extraction_reference: ExtractionRunReference or None
+    :type extraction_reference: ExtractionSnapshotReference or None
     :yield: (catalog_item, text) pairs.
     :rtype: Iterator[tuple[object, str]]
     """
@@ -152,21 +186,21 @@
 
 
 def collect_chunks(
-    corpus: Corpus, *, recipe_config: EmbeddingIndexRecipeConfig
+    corpus: Corpus, *, configuration: EmbeddingIndexConfiguration
 ) -> Tuple[List[TextChunk], int]:
     """
     Collect chunks from text payloads in a corpus.
 
     :param corpus: Corpus to chunk.
     :type corpus: Corpus
-    :param recipe_config: Parsed embedding-index recipe configuration.
-    :type recipe_config: EmbeddingIndexRecipeConfig
+    :param configuration: Parsed embedding-index configuration.
+    :type configuration: EmbeddingIndexConfiguration
     :return: (chunks, text_item_count)
     :rtype: tuple[list[TextChunk], int]
     """
-    tokenizer = recipe_config.tokenizer.build_tokenizer() if recipe_config.tokenizer else None
-    chunker = recipe_config.chunker.build_chunker(tokenizer=tokenizer)
-    extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+    tokenizer = configuration.tokenizer.build_tokenizer() if configuration.tokenizer else None
+    chunker = configuration.chunker.build_chunker(tokenizer=tokenizer)
+    extraction_reference = resolve_extraction_reference(corpus, configuration)
 
     chunks: List[TextChunk] = []
     next_chunk_id = 0
@@ -284,18 +318,20 @@ def cosine_similarity_scores(embeddings: np.ndarray, query_vector: np.ndarray) -
     return embeddings @ query_vector
 
 
-def artifact_paths_for_run(*, run_id: str, backend_id: str) -> Dict[str, str]:
+def artifact_paths_for_snapshot(*, snapshot_id: str, retriever_id: str) -> Dict[str, str]:
     """
-    Build deterministic artifact relative paths for an embedding index run.
+    Build deterministic artifact relative paths for an embedding index snapshot.
 
-    :param run_id: Run identifier.
-    :type run_id: str
-    :param backend_id: Backend identifier.
-    :type backend_id: str
+    :param snapshot_id: Snapshot identifier.
+    :type snapshot_id: str
+    :param retriever_id: Retriever identifier.
+    :type retriever_id: str
     :return: Mapping with keys embeddings and chunks.
     :rtype: dict[str, str]
     """
-    prefix = f"{run_id}.{backend_id}"
-    embeddings_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.embeddings.npy")
-    chunks_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{prefix}.chunks.jsonl")
+    prefix = f"{snapshot_id}.{retriever_id}"
+    embeddings_relpath = str(
+        Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.embeddings.npy"
+    )
+    chunks_relpath = str(Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.chunks.jsonl")
     return {"embeddings": embeddings_relpath, "chunks": chunks_relpath}
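
Note (not part of the diff): artifact_paths_for_snapshot now keys index artifacts by snapshot and retriever rather than run and backend. A short usage illustration; the snapshot identifier below is hypothetical, and the directory prefix comes from CORPUS_DIR_NAME / SNAPSHOTS_DIR_NAME in biblicus.constants, whose literal values are not shown in this diff:

# Illustration only: the naming scheme produced by artifact_paths_for_snapshot.
from biblicus.retrievers.embedding_index_common import artifact_paths_for_snapshot

paths = artifact_paths_for_snapshot(
    snapshot_id="abc123",                # hypothetical snapshot identifier
    retriever_id="embedding-index-file",
)
assert paths["embeddings"].endswith("abc123.embedding-index-file.embeddings.npy")
assert paths["chunks"].endswith("abc123.embedding-index-file.chunks.jsonl")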
--- a/biblicus/backends/embedding_index_file.py
+++ b/biblicus/retrievers/embedding_index_file.py
@@ -1,5 +1,5 @@
 """
-Embedding-index retrieval backend that reads the embedding matrix via memory mapping.
+Embedding-index retriever that reads the embedding matrix via memory mapping.
 """
 
 from __future__ import annotations
@@ -10,13 +10,26 @@ from typing import Dict, List, Optional
 import numpy as np
 
 from ..corpus import Corpus
-from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
-from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..models import (
+    Evidence,
+    ExtractionSnapshotReference,
+    QueryBudget,
+    RetrievalResult,
+    RetrievalSnapshot,
+)
+from ..retrieval import (
+    apply_budget,
+    create_configuration_manifest,
+    create_snapshot_manifest,
+    hash_text,
+)
 from ..time import utc_now_iso
 from .embedding_index_common import (
     ChunkRecord,
-    EmbeddingIndexRecipeConfig,
-    artifact_paths_for_run,
+    EmbeddingIndexConfiguration,
+    _build_snippet,
+    _extract_span_text,
+    artifact_paths_for_snapshot,
     chunks_to_records,
     collect_chunks,
     cosine_similarity_scores,
@@ -26,48 +39,54 @@ from .embedding_index_common import (
     write_chunks_jsonl,
     write_embeddings,
 )
-from .scan import _build_snippet
 
 
-class EmbeddingIndexFileBackend:
+class EmbeddingIndexFileRetriever:
     """
-    Embedding retrieval backend using memory-mapped similarity scanning.
+    Embedding retrieval retriever using memory-mapped similarity scanning.
     """
 
-    backend_id = "embedding-index-file"
+    retriever_id = "embedding-index-file"
 
-    def build_run(
-        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
-    ) -> RetrievalRun:
+    def build_snapshot(
+        self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
+    ) -> RetrievalSnapshot:
         """
-        Build an embedding index run by chunking text payloads and materializing embeddings.
+        Build an embedding index snapshot by chunking text payloads and materializing embeddings.
 
         :param corpus: Corpus to build against.
         :type corpus: Corpus
-        :param recipe_name: Human-readable recipe name.
-        :type recipe_name: str
-        :param config: Backend-specific configuration values.
-        :type config: dict[str, object]
-        :return: Run manifest describing the build.
-        :rtype: biblicus.models.RetrievalRun
+        :param configuration_name: Human-readable configuration name.
+        :type configuration_name: str
+        :param configuration: Retriever-specific configuration values.
+        :type configuration: dict[str, object]
+        :return: Snapshot manifest describing the build.
+        :rtype: biblicus.models.RetrievalSnapshot
         """
-        recipe_config = EmbeddingIndexRecipeConfig.model_validate(config)
-        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
+        parsed_config = EmbeddingIndexConfiguration.model_validate(configuration)
+        chunks, text_items = collect_chunks(corpus, configuration=parsed_config)
 
-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)
 
-        recipe = create_recipe_manifest(
-            backend_id=self.backend_id,
-            name=recipe_name,
-            config=recipe_config.model_dump(),
+        configuration_manifest = create_configuration_manifest(
+            retriever_id=self.retriever_id,
+            name=configuration_name,
+            configuration=parsed_config.model_dump(),
+        )
+        snapshot = create_snapshot_manifest(
+            corpus,
+            configuration=configuration_manifest,
+            stats={},
+            snapshot_artifacts=[],
         )
-        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
 
-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
-        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+        corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
 
         write_embeddings(embeddings_path, embeddings)
         write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
@@ -79,30 +98,33 @@ class EmbeddingIndexFileBackend:
             "dimensions": (
                 int(embeddings.shape[1])
                 if embeddings.size
-                else recipe_config.embedding_provider.dimensions
+                else parsed_config.embedding_provider.dimensions
             ),
         }
-        run = run.model_copy(
-            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
+        snapshot = snapshot.model_copy(
+            update={
+                "snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
+                "stats": stats,
+            }
         )
-        corpus.write_run(run)
-        return run
+        corpus.write_snapshot(snapshot)
+        return snapshot
 
     def query(
         self,
         corpus: Corpus,
         *,
-        run: RetrievalRun,
+        snapshot: RetrievalSnapshot,
         query_text: str,
         budget: QueryBudget,
     ) -> RetrievalResult:
         """
-        Query an embedding index run and return ranked evidence.
+        Query an embedding index snapshot and return ranked evidence.
 
-        :param corpus: Corpus associated with the run.
+        :param corpus: Corpus associated with the snapshot.
         :type corpus: Corpus
-        :param run: Run manifest to use for querying.
-        :type run: biblicus.models.RetrievalRun
+        :param snapshot: Snapshot manifest to use for querying.
+        :type snapshot: biblicus.models.RetrievalSnapshot
         :param query_text: Query text to embed.
        :type query_text: str
         :param budget: Evidence selection budget.
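
Note (not part of the diff): the snapshot manifest is filled in two steps above — first written with empty stats and artifact lists, then replaced via snapshot.model_copy(update={...}) once the artifact paths and stats are known. model_copy(update=...) is standard pydantic v2 behaviour (a copy with selected fields overridden); a sketch on a stand-in model, since the real RetrievalSnapshot fields are only partially visible in this diff:

# Stand-in model (not the real RetrievalSnapshot) showing the model_copy(update=...) pattern.
from typing import Dict, List

from pydantic import BaseModel


class SnapshotStub(BaseModel):
    snapshot_id: str
    snapshot_artifacts: List[str] = []
    stats: Dict[str, object] = {}


draft = SnapshotStub(snapshot_id="abc123")
final = draft.model_copy(
    update={"snapshot_artifacts": ["x.embeddings.npy", "x.chunks.jsonl"], "stats": {"chunks": 42}}
)
assert draft.snapshot_artifacts == []        # the original instance is untouched
assert final.snapshot_artifacts[0].endswith(".npy")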
@@ -110,14 +132,18 @@ class EmbeddingIndexFileBackend:
         :return: Retrieval results containing evidence.
         :rtype: biblicus.models.RetrievalResult
         """
-        recipe_config = EmbeddingIndexRecipeConfig.model_validate(run.recipe.config)
-        extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+        parsed_config = EmbeddingIndexConfiguration.model_validate(
+            snapshot.configuration.configuration
+        )
+        extraction_reference = resolve_extraction_reference(corpus, parsed_config)
 
-        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        paths = artifact_paths_for_snapshot(
+            snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
+        )
         embeddings_path = corpus.root / paths["embeddings"]
         chunks_path = corpus.root / paths["chunks"]
         if not embeddings_path.is_file() or not chunks_path.is_file():
-            raise FileNotFoundError("Embedding index artifacts are missing for this run")
+            raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")
 
         embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
         chunk_records = read_chunks_jsonl(chunks_path)
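
Note (not part of the diff): read_embeddings(..., mmap=True) is the project's own helper; the underlying idea it wraps is numpy memory mapping, where the .npy file is not loaded into RAM up front and only the row blocks touched during the batched scan get paged in. A generic numpy sketch of that idea (plain numpy, not the biblicus helper, and the 4096-row batch size is just the fallback used above):

# Generic numpy sketch of a memory-mapped, batched similarity scan.
import os
import tempfile

import numpy as np

rng = np.random.default_rng(0)
rows = rng.standard_normal((10_000, 64)).astype(np.float32)

path = os.path.join(tempfile.mkdtemp(), "embeddings.npy")
np.save(path, rows)

mapped = np.load(path, mmap_mode="r")          # mapped, not read into RAM up front
query = rows[123] / np.linalg.norm(rows[123])  # unit-length query vector

best = -np.inf
for start in range(0, mapped.shape[0], 4096):  # batched scan, in the spirit of _top_indices_batched
    block = np.asarray(mapped[start : start + 4096], dtype=np.float32)
    if block.size:
        best = max(best, float((block @ query).max()))
print(best)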
@@ -127,20 +153,22 @@ class EmbeddingIndexFileBackend:
                 "embeddings row count does not match chunk record count"
             )
 
-        provider = recipe_config.embedding_provider.build_provider()
+        provider = parsed_config.embedding_provider.build_provider()
         query_embedding = provider.embed_texts([query_text]).astype(np.float32)
         if query_embedding.shape[0] != 1:
             raise ValueError("Embedding provider returned an invalid query embedding shape")
 
+        batch_rows = parsed_config.maximum_cache_total_items or 4096
         candidates = _top_indices_batched(
             embeddings=embeddings,
             query_vector=query_embedding[0],
             limit=_candidate_limit(budget.max_total_items + budget.offset),
+            batch_rows=batch_rows,
         )
         evidence_items = _build_evidence(
             corpus,
-            run=run,
-            recipe_config=recipe_config,
+            snapshot=snapshot,
+            configuration=parsed_config,
             candidates=candidates,
             embeddings=embeddings,
             query_vector=query_embedding[0],
@@ -149,7 +177,11 @@
         )
         ranked = [
             item.model_copy(
-                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
+                update={
+                    "rank": index,
+                    "configuration_id": snapshot.configuration.configuration_id,
+                    "snapshot_id": snapshot.snapshot_id,
+                }
             )
             for index, item in enumerate(evidence_items, start=1)
         ]
@@ -157,9 +189,9 @@
         return RetrievalResult(
             query_text=query_text,
             budget=budget,
-            run_id=run.run_id,
-            recipe_id=run.recipe.recipe_id,
-            backend_id=self.backend_id,
+            snapshot_id=snapshot.snapshot_id,
+            configuration_id=snapshot.configuration.configuration_id,
+            retriever_id=snapshot.configuration.retriever_id,
             generated_at=utc_now_iso(),
             evidence=evidence,
             stats={"candidates": len(evidence_items), "returned": len(evidence)},
@@ -202,13 +234,13 @@
 def _build_evidence(
     corpus: Corpus,
     *,
-    run: RetrievalRun,
-    recipe_config: EmbeddingIndexRecipeConfig,
+    snapshot: RetrievalSnapshot,
+    configuration: EmbeddingIndexConfiguration,
     candidates: List[int],
     embeddings: np.ndarray,
     query_vector: np.ndarray,
     chunk_records: List[ChunkRecord],
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> List[Evidence]:
     catalog = corpus.load_catalog()
     evidence_items: List[Evidence] = []
@@ -222,9 +254,11 @@
             media_type=str(getattr(catalog_item, "media_type")),
             extraction_reference=extraction_reference,
         )
-        snippet = _build_snippet(
-            text, (record.span_start, record.span_end), max_chars=recipe_config.snippet_characters
+        span_text = _build_snippet(
+            text, (record.span_start, record.span_end), configuration.snippet_characters
         )
+        if span_text is None:
+            span_text = _extract_span_text(text, (record.span_start, record.span_end))
         score = float(cosine_similarity_scores(embeddings[idx : idx + 1], query_vector)[0])
         evidence_items.append(
             Evidence(
@@ -233,15 +267,16 @@
                 media_type=str(getattr(catalog_item, "media_type")),
                 score=score,
                 rank=1,
-                text=snippet,
+                text=span_text,
                 content_ref=None,
                 span_start=record.span_start,
                 span_end=record.span_end,
-                stage=EmbeddingIndexFileBackend.backend_id,
+                stage=EmbeddingIndexFileRetriever.retriever_id,
                 stage_scores=None,
-                recipe_id=run.recipe.recipe_id,
-                run_id=run.run_id,
-                hash=hash_text(snippet),
+                configuration_id=snapshot.configuration.configuration_id,
+                snapshot_id=snapshot.snapshot_id,
+                metadata=getattr(catalog_item, "metadata", {}) or {},
+                hash=hash_text(span_text or ""),
             )
         )
     return evidence_items
@@ -253,7 +288,7 @@ def _load_text_for_evidence(
     item_id: str,
     relpath: str,
     media_type: str,
-    extraction_reference: Optional[ExtractionRunReference],
+    extraction_reference: Optional[ExtractionSnapshotReference],
 ) -> Optional[str]:
     from .embedding_index_common import _load_text_from_item
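
Note (not part of the diff): taken together, the rename composes into a build-then-query flow against the new retriever API. A sketch of that flow; the Corpus constructor, the embedding_provider settings, and the QueryBudget fields are assumptions inferred from how they are used in the hunks above, not documented interfaces:

# Sketch only: end-to-end flow implied by the hunks above.
from biblicus.corpus import Corpus
from biblicus.models import QueryBudget
from biblicus.retrievers.embedding_index_file import EmbeddingIndexFileRetriever

corpus = Corpus("path/to/corpus")  # hypothetical constructor
retriever = EmbeddingIndexFileRetriever()

snapshot = retriever.build_snapshot(
    corpus,
    configuration_name="default-embeddings",
    configuration={
        "chunker": {"chunker_id": "paragraph"},          # default shown in the config hunk
        "embedding_provider": {"provider_id": "dummy"},  # hypothetical provider settings
        "snippet_characters": 400,
    },
)

result = retriever.query(
    corpus,
    snapshot=snapshot,
    query_text="what changed between 0.16.0 and 1.1.0?",
    budget=QueryBudget(max_total_items=5, offset=0),  # fields inferred from budget.max_total_items / budget.offset
)
for evidence in result.evidence:
    print(evidence.rank, evidence.score, (evidence.text or "")[:80])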