biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,6 @@ from typing import List, Optional, Sequence
11
11
 
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
- from .backends import get_backend
15
14
  from .context import (
16
15
  ContextPack,
17
16
  ContextPackPolicy,
@@ -20,17 +19,18 @@ from .context import (
20
19
  fit_context_pack_to_token_budget,
21
20
  )
22
21
  from .corpus import Corpus
23
- from .models import QueryBudget, RetrievalResult, RetrievalRun
22
+ from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
23
+ from .retrievers import get_retriever
24
24
 
25
25
 
26
26
  class KnowledgeBaseDefaults(BaseModel):
27
27
  """
28
28
  Default configuration for a knowledge base workflow.
29
29
 
30
- :ivar backend_id: Backend identifier to use for retrieval.
31
- :vartype backend_id: str
32
- :ivar recipe_name: Human-readable retrieval recipe name.
33
- :vartype recipe_name: str
30
+ :ivar retriever_id: Retriever identifier to use for retrieval.
31
+ :vartype retriever_id: str
32
+ :ivar configuration_name: Human-readable retrieval configuration name.
33
+ :vartype configuration_name: str
34
34
  :ivar query_budget: Default query budget to apply to retrieval.
35
35
  :vartype query_budget: QueryBudget
36
36
  :ivar tags: Tags to apply when importing the folder.
@@ -39,12 +39,12 @@ class KnowledgeBaseDefaults(BaseModel):
39
39
 
40
40
  model_config = ConfigDict(extra="forbid")
41
41
 
42
- backend_id: str = Field(default="scan", min_length=1)
43
- recipe_name: str = Field(default="Knowledge base", min_length=1)
42
+ retriever_id: str = Field(default="scan", min_length=1)
43
+ configuration_name: str = Field(default="Knowledge base", min_length=1)
44
44
  query_budget: QueryBudget = Field(
45
45
  default_factory=lambda: QueryBudget(
46
46
  max_total_items=5,
47
- max_total_characters=2000,
47
+ maximum_total_characters=2000,
48
48
  max_items_per_source=None,
49
49
  )
50
50
  )
@@ -58,17 +58,17 @@ class KnowledgeBase:
58
58
 
59
59
  :ivar corpus: Corpus instance that stores the ingested items.
60
60
  :vartype corpus: Corpus
61
- :ivar backend_id: Backend identifier used for retrieval.
62
- :vartype backend_id: str
63
- :ivar run: Retrieval run manifest associated with the knowledge base.
64
- :vartype run: RetrievalRun
61
+ :ivar retriever_id: Retriever identifier used for retrieval.
62
+ :vartype retriever_id: str
63
+ :ivar snapshot: Retrieval snapshot manifest associated with the knowledge base.
64
+ :vartype snapshot: RetrievalSnapshot
65
65
  :ivar defaults: Default configuration used for this knowledge base.
66
66
  :vartype defaults: KnowledgeBaseDefaults
67
67
  """
68
68
 
69
69
  corpus: Corpus
70
- backend_id: str
71
- run: RetrievalRun
70
+ retriever_id: str
71
+ snapshot: RetrievalSnapshot
72
72
  defaults: KnowledgeBaseDefaults
73
73
  _temp_dir: Optional[TemporaryDirectory]
74
74
 
@@ -77,8 +77,8 @@ class KnowledgeBase:
77
77
  cls,
78
78
  folder: str | Path,
79
79
  *,
80
- backend_id: Optional[str] = None,
81
- recipe_name: Optional[str] = None,
80
+ retriever_id: Optional[str] = None,
81
+ configuration_name: Optional[str] = None,
82
82
  query_budget: Optional[QueryBudget] = None,
83
83
  tags: Optional[Sequence[str]] = None,
84
84
  corpus_root: Optional[str | Path] = None,
@@ -88,10 +88,10 @@ class KnowledgeBase:
88
88
 
89
89
  :param folder: Folder containing source files.
90
90
  :type folder: str or Path
91
- :param backend_id: Optional backend identifier override.
92
- :type backend_id: str or None
93
- :param recipe_name: Optional recipe name override.
94
- :type recipe_name: str or None
91
+ :param retriever_id: Optional retriever identifier override.
92
+ :type retriever_id: str or None
93
+ :param configuration_name: Optional configuration name override.
94
+ :type configuration_name: str or None
95
95
  :param query_budget: Optional query budget override.
96
96
  :type query_budget: QueryBudget or None
97
97
  :param tags: Optional tags to apply during import.
@@ -110,8 +110,8 @@ class KnowledgeBase:
110
110
  raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")
111
111
 
112
112
  defaults = KnowledgeBaseDefaults()
113
- resolved_backend_id = backend_id or defaults.backend_id
114
- resolved_recipe_name = recipe_name or defaults.recipe_name
113
+ resolved_retriever_id = retriever_id or defaults.retriever_id
114
+ resolved_configuration_name = configuration_name or defaults.configuration_name
115
115
  resolved_query_budget = query_budget or defaults.query_budget
116
116
  resolved_tags = list(tags) if tags is not None else defaults.tags
117
117
 
@@ -125,16 +125,18 @@ class KnowledgeBase:
125
125
  corpus = Corpus.init(corpus_root_path)
126
126
  corpus.import_tree(source_root, tags=resolved_tags)
127
127
 
128
- backend = get_backend(resolved_backend_id)
129
- run = backend.build_run(corpus, recipe_name=resolved_recipe_name, config={})
128
+ retriever = get_retriever(resolved_retriever_id)
129
+ snapshot = retriever.build_snapshot(
130
+ corpus, configuration_name=resolved_configuration_name, configuration={}
131
+ )
130
132
 
131
133
  return cls(
132
134
  corpus=corpus,
133
- backend_id=resolved_backend_id,
134
- run=run,
135
+ retriever_id=resolved_retriever_id,
136
+ snapshot=snapshot,
135
137
  defaults=KnowledgeBaseDefaults(
136
- backend_id=resolved_backend_id,
137
- recipe_name=resolved_recipe_name,
138
+ retriever_id=resolved_retriever_id,
139
+ configuration_name=resolved_configuration_name,
138
140
  query_budget=resolved_query_budget,
139
141
  tags=resolved_tags,
140
142
  ),
@@ -152,11 +154,11 @@ class KnowledgeBase:
152
154
  :return: Retrieval result containing evidence.
153
155
  :rtype: RetrievalResult
154
156
  """
155
- backend = get_backend(self.backend_id)
157
+ retriever = get_retriever(self.retriever_id)
156
158
  resolved_budget = budget or self.defaults.query_budget
157
- return backend.query(
159
+ return retriever.query(
158
160
  self.corpus,
159
- run=self.run,
161
+ snapshot=self.snapshot,
160
162
  query_text=query_text,
161
163
  budget=resolved_budget,
162
164
  )
biblicus/models.py CHANGED
@@ -117,8 +117,8 @@ class CorpusCatalog(BaseModel):
117
117
  :vartype corpus_uri: str
118
118
  :ivar raw_dir: Relative path to the raw items folder.
119
119
  :vartype raw_dir: str
120
- :ivar latest_run_id: Latest retrieval run identifier, if any.
121
- :vartype latest_run_id: str or None
120
+ :ivar latest_snapshot_id: Latest retrieval snapshot identifier, if any.
121
+ :vartype latest_snapshot_id: str or None
122
122
  :ivar items: Mapping of item IDs to catalog entries.
123
123
  :vartype items: dict[str, CatalogItem]
124
124
  :ivar order: Display order of item IDs (most recent first).
@@ -131,7 +131,7 @@ class CorpusCatalog(BaseModel):
131
131
  generated_at: str
132
132
  corpus_uri: str
133
133
  raw_dir: str = "raw"
134
- latest_run_id: Optional[str] = None
134
+ latest_snapshot_id: Optional[str] = None
135
135
  items: Dict[str, CatalogItem] = Field(default_factory=dict)
136
136
  order: List[str] = Field(default_factory=list)
137
137
 
@@ -142,79 +142,79 @@ class CorpusCatalog(BaseModel):
142
142
  return self
143
143
 
144
144
 
145
- class ExtractionRunReference(BaseModel):
145
+ class ExtractionSnapshotReference(BaseModel):
146
146
  """
147
- Reference to an extraction run.
147
+ Reference to an extraction snapshot.
148
148
 
149
149
  :ivar extractor_id: Extractor plugin identifier.
150
150
  :vartype extractor_id: str
151
- :ivar run_id: Extraction run identifier.
152
- :vartype run_id: str
151
+ :ivar snapshot_id: Extraction snapshot identifier.
152
+ :vartype snapshot_id: str
153
153
  """
154
154
 
155
155
  model_config = ConfigDict(extra="forbid")
156
156
 
157
157
  extractor_id: str = Field(min_length=1)
158
- run_id: str = Field(min_length=1)
158
+ snapshot_id: str = Field(min_length=1)
159
159
 
160
160
  def as_string(self) -> str:
161
161
  """
162
162
  Serialize the reference as a single string.
163
163
 
164
- :return: Reference in the form extractor_id:run_id.
164
+ :return: Reference in the form extractor_id:snapshot_id.
165
165
  :rtype: str
166
166
  """
167
- return f"{self.extractor_id}:{self.run_id}"
167
+ return f"{self.extractor_id}:{self.snapshot_id}"
168
168
 
169
169
 
170
- def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
170
+ def parse_extraction_snapshot_reference(value: str) -> ExtractionSnapshotReference:
171
171
  """
172
- Parse an extraction run reference in the form extractor_id:run_id.
172
+ Parse an extraction snapshot reference in the form extractor_id:snapshot_id.
173
173
 
174
174
  :param value: Raw reference string.
175
175
  :type value: str
176
- :return: Parsed extraction run reference.
177
- :rtype: ExtractionRunReference
176
+ :return: Parsed extraction snapshot reference.
177
+ :rtype: ExtractionSnapshotReference
178
178
  :raises ValueError: If the reference is not well formed.
179
179
  """
180
180
  if ":" not in value:
181
- raise ValueError("Extraction run reference must be extractor_id:run_id")
182
- extractor_id, run_id = value.split(":", 1)
181
+ raise ValueError("Extraction snapshot reference must be extractor_id:snapshot_id")
182
+ extractor_id, snapshot_id = value.split(":", 1)
183
183
  extractor_id = extractor_id.strip()
184
- run_id = run_id.strip()
185
- if not extractor_id or not run_id:
184
+ snapshot_id = snapshot_id.strip()
185
+ if not extractor_id or not snapshot_id:
186
186
  raise ValueError(
187
- "Extraction run reference must be extractor_id:run_id with non-empty parts"
187
+ "Extraction snapshot reference must be extractor_id:snapshot_id with non-empty parts"
188
188
  )
189
- return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
189
+ return ExtractionSnapshotReference(extractor_id=extractor_id, snapshot_id=snapshot_id)
190
190
 
191
191
 
192
- class ExtractionRunListEntry(BaseModel):
192
+ class ExtractionSnapshotListEntry(BaseModel):
193
193
  """
194
- Summary entry for an extraction run stored in a corpus.
194
+ Summary entry for an extraction snapshot stored in a corpus.
195
195
 
196
196
  :ivar extractor_id: Extractor plugin identifier.
197
197
  :vartype extractor_id: str
198
- :ivar run_id: Extraction run identifier.
199
- :vartype run_id: str
200
- :ivar recipe_id: Deterministic recipe identifier.
201
- :vartype recipe_id: str
202
- :ivar recipe_name: Human-readable recipe name.
203
- :vartype recipe_name: str
204
- :ivar catalog_generated_at: Catalog timestamp used for the run.
198
+ :ivar snapshot_id: Extraction snapshot identifier.
199
+ :vartype snapshot_id: str
200
+ :ivar configuration_id: Deterministic configuration identifier.
201
+ :vartype configuration_id: str
202
+ :ivar configuration_name: Human-readable configuration name.
203
+ :vartype configuration_name: str
204
+ :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
205
205
  :vartype catalog_generated_at: str
206
- :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
206
+ :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
207
207
  :vartype created_at: str
208
- :ivar stats: Run statistics.
208
+ :ivar stats: Snapshot statistics.
209
209
  :vartype stats: dict[str, object]
210
210
  """
211
211
 
212
212
  model_config = ConfigDict(extra="forbid")
213
213
 
214
214
  extractor_id: str = Field(min_length=1)
215
- run_id: str = Field(min_length=1)
216
- recipe_id: str = Field(min_length=1)
217
- recipe_name: str = Field(min_length=1)
215
+ snapshot_id: str = Field(min_length=1)
216
+ configuration_id: str = Field(min_length=1)
217
+ configuration_name: str = Field(min_length=1)
218
218
  catalog_generated_at: str = Field(min_length=1)
219
219
  created_at: str = Field(min_length=1)
220
220
  stats: Dict[str, object] = Field(default_factory=dict)
@@ -234,8 +234,8 @@ class QueryBudget(BaseModel):
234
234
  This enables simple pagination by re-running the same query with a
235
235
  higher offset.
236
236
  :vartype offset: int
237
- :ivar max_total_characters: Optional maximum total characters across evidence text.
238
- :vartype max_total_characters: int or None
237
+ :ivar maximum_total_characters: Optional maximum total characters across evidence text.
238
+ :vartype maximum_total_characters: int or None
239
239
  :ivar max_items_per_source: Optional cap per source uniform resource identifier.
240
240
  :vartype max_items_per_source: int or None
241
241
  """
@@ -244,13 +244,13 @@ class QueryBudget(BaseModel):
244
244
 
245
245
  max_total_items: int = Field(ge=1)
246
246
  offset: int = Field(default=0, ge=0)
247
- max_total_characters: Optional[int] = Field(default=None, ge=1)
247
+ maximum_total_characters: Optional[int] = Field(default=None, ge=1)
248
248
  max_items_per_source: Optional[int] = Field(default=None, ge=1)
249
249
 
250
250
 
251
251
  class Evidence(BaseModel):
252
252
  """
253
- Structured retrieval evidence returned from a backend.
253
+ Structured retrieval evidence returned from a retriever.
254
254
 
255
255
  :ivar item_id: Item identifier that produced the evidence.
256
256
  :vartype item_id: str
@@ -274,10 +274,12 @@ class Evidence(BaseModel):
274
274
  :vartype stage: str
275
275
  :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
276
276
  :vartype stage_scores: dict[str, float] or None
277
- :ivar recipe_id: Recipe identifier used to create the run.
278
- :vartype recipe_id: str
279
- :ivar run_id: Retrieval run identifier.
280
- :vartype run_id: str
277
+ :ivar configuration_id: Configuration identifier used to create the snapshot.
278
+ :vartype configuration_id: str
279
+ :ivar snapshot_id: Retrieval snapshot identifier.
280
+ :vartype snapshot_id: str
281
+ :ivar metadata: Optional metadata payload from the catalog item.
282
+ :vartype metadata: dict[str, Any]
281
283
  :ivar hash: Optional content hash for provenance.
282
284
  :vartype hash: str or None
283
285
  """
@@ -295,8 +297,9 @@ class Evidence(BaseModel):
295
297
  span_end: Optional[int] = None
296
298
  stage: str
297
299
  stage_scores: Optional[Dict[str, float]] = None
298
- recipe_id: str
299
- run_id: str
300
+ configuration_id: str
301
+ snapshot_id: str
302
+ metadata: Dict[str, Any] = Field(default_factory=dict)
300
303
  hash: Optional[str] = None
301
304
 
302
305
  @model_validator(mode="after")
@@ -308,79 +311,79 @@ class Evidence(BaseModel):
308
311
  return self
309
312
 
310
313
 
311
- class RecipeManifest(BaseModel):
314
+ class ConfigurationManifest(BaseModel):
312
315
  """
313
- Reproducible configuration for a retrieval backend.
316
+ Reproducible configuration for a retriever.
314
317
 
315
- :ivar recipe_id: Deterministic recipe identifier.
316
- :vartype recipe_id: str
317
- :ivar backend_id: Backend identifier for the recipe.
318
- :vartype backend_id: str
319
- :ivar name: Human-readable name for the recipe.
318
+ :ivar configuration_id: Deterministic configuration identifier.
319
+ :vartype configuration_id: str
320
+ :ivar retriever_id: Retriever identifier for the configuration.
321
+ :vartype retriever_id: str
322
+ :ivar name: Human-readable name for the configuration.
320
323
  :vartype name: str
321
- :ivar created_at: International Organization for Standardization 8601 timestamp for recipe creation.
324
+ :ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
322
325
  :vartype created_at: str
323
- :ivar config: Backend-specific configuration values.
324
- :vartype config: dict[str, Any]
326
+ :ivar configuration: Retriever-specific configuration values.
327
+ :vartype configuration: dict[str, Any]
325
328
  :ivar description: Optional human description.
326
329
  :vartype description: str or None
327
330
  """
328
331
 
329
332
  model_config = ConfigDict(extra="forbid")
330
333
 
331
- recipe_id: str
332
- backend_id: str
334
+ configuration_id: str
335
+ retriever_id: str
333
336
  name: str
334
337
  created_at: str
335
- config: Dict[str, Any] = Field(default_factory=dict)
338
+ configuration: Dict[str, Any] = Field(default_factory=dict)
336
339
  description: Optional[str] = None
337
340
 
338
341
 
339
- class RetrievalRun(BaseModel):
342
+ class RetrievalSnapshot(BaseModel):
340
343
  """
341
- Immutable record of a retrieval materialization or on-demand run.
344
+ Immutable record of a retrieval snapshot.
342
345
 
343
- :ivar run_id: Unique run identifier.
344
- :vartype run_id: str
345
- :ivar recipe: Recipe manifest for this run.
346
- :vartype recipe: RecipeManifest
346
+ :ivar snapshot_id: Unique snapshot identifier.
347
+ :vartype snapshot_id: str
348
+ :ivar configuration: Configuration manifest for this snapshot.
349
+ :vartype configuration: ConfigurationManifest
347
350
  :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
348
351
  :vartype corpus_uri: str
349
- :ivar catalog_generated_at: Catalog timestamp used for the run.
352
+ :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
350
353
  :vartype catalog_generated_at: str
351
- :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
354
+ :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
352
355
  :vartype created_at: str
353
- :ivar artifact_paths: Relative paths to materialized artifacts.
354
- :vartype artifact_paths: list[str]
355
- :ivar stats: Backend-specific run statistics.
356
+ :ivar snapshot_artifacts: Relative paths to materialized artifacts.
357
+ :vartype snapshot_artifacts: list[str]
358
+ :ivar stats: Retriever-specific snapshot statistics.
356
359
  :vartype stats: dict[str, Any]
357
360
  """
358
361
 
359
362
  model_config = ConfigDict(extra="forbid")
360
363
 
361
- run_id: str
362
- recipe: RecipeManifest
364
+ snapshot_id: str
365
+ configuration: ConfigurationManifest
363
366
  corpus_uri: str
364
367
  catalog_generated_at: str
365
368
  created_at: str
366
- artifact_paths: List[str] = Field(default_factory=list)
369
+ snapshot_artifacts: List[str] = Field(default_factory=list)
367
370
  stats: Dict[str, Any] = Field(default_factory=dict)
368
371
 
369
372
 
370
373
  class RetrievalResult(BaseModel):
371
374
  """
372
- Retrieval result bundle returned from a backend query.
375
+ Retrieval result bundle returned from a retriever query.
373
376
 
374
377
  :ivar query_text: Query text issued against the backend.
375
378
  :vartype query_text: str
376
379
  :ivar budget: Evidence selection budget applied to results.
377
380
  :vartype budget: QueryBudget
378
- :ivar run_id: Retrieval run identifier.
379
- :vartype run_id: str
380
- :ivar recipe_id: Recipe identifier used for this query.
381
- :vartype recipe_id: str
382
- :ivar backend_id: Backend identifier used for this query.
383
- :vartype backend_id: str
381
+ :ivar snapshot_id: Retrieval snapshot identifier.
382
+ :vartype snapshot_id: str
383
+ :ivar configuration_id: Configuration identifier used for this query.
384
+ :vartype configuration_id: str
385
+ :ivar retriever_id: Retriever identifier used for this query.
386
+ :vartype retriever_id: str
384
387
  :ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
385
388
  :vartype generated_at: str
386
389
  :ivar evidence: Evidence objects selected under the budget.
@@ -393,9 +396,9 @@ class RetrievalResult(BaseModel):
393
396
 
394
397
  query_text: str
395
398
  budget: QueryBudget
396
- run_id: str
397
- recipe_id: str
398
- backend_id: str
399
+ snapshot_id: str
400
+ configuration_id: str
401
+ retriever_id: str
399
402
  generated_at: str
400
403
  evidence: List[Evidence] = Field(default_factory=list)
401
404
  stats: Dict[str, Any] = Field(default_factory=dict)
biblicus/retrieval.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """
2
- Shared retrieval helpers for Biblicus backends.
2
+ Shared retrieval helpers for Biblicus retrievers.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -9,75 +9,82 @@ import json
9
9
  from typing import Any, Dict, Iterable, List, Optional
10
10
 
11
11
  from .corpus import Corpus
12
- from .models import Evidence, QueryBudget, RecipeManifest, RetrievalRun
12
+ from .models import (
13
+ ConfigurationManifest,
14
+ Evidence,
15
+ QueryBudget,
16
+ RetrievalSnapshot,
17
+ )
13
18
  from .time import utc_now_iso
14
19
 
15
20
 
16
- def create_recipe_manifest(
21
+ def create_configuration_manifest(
17
22
  *,
18
- backend_id: str,
23
+ retriever_id: str,
19
24
  name: str,
20
- config: Dict[str, Any],
25
+ configuration: Dict[str, Any],
21
26
  description: Optional[str] = None,
22
- ) -> RecipeManifest:
27
+ ) -> ConfigurationManifest:
23
28
  """
24
- Create a deterministic recipe manifest from a backend configuration.
29
+ Create a deterministic configuration manifest from a retriever configuration.
25
30
 
26
- :param backend_id: Backend identifier for the recipe.
27
- :type backend_id: str
28
- :param name: Human-readable recipe name.
31
+ :param retriever_id: Retriever identifier for the configuration.
32
+ :type retriever_id: str
33
+ :param name: Human-readable configuration name.
29
34
  :type name: str
30
- :param config: Backend-specific configuration values.
31
- :type config: dict[str, Any]
32
- :param description: Optional recipe description.
35
+ :param configuration: Retriever-specific configuration values.
36
+ :type configuration: dict[str, Any]
37
+ :param description: Optional configuration description.
33
38
  :type description: str or None
34
- :return: Deterministic recipe manifest.
35
- :rtype: RecipeManifest
39
+ :return: Deterministic configuration manifest.
40
+ :rtype: ConfigurationManifest
36
41
  """
37
- config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
38
- recipe_seed = f"{backend_id}:{config_json}"
39
- recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
40
- return RecipeManifest(
41
- recipe_id=recipe_id,
42
- backend_id=backend_id,
42
+ config_json = json.dumps(configuration, sort_keys=True, separators=(",", ":"))
43
+ configuration_seed = f"{retriever_id}:{config_json}"
44
+ configuration_id = hashlib.sha256(configuration_seed.encode("utf-8")).hexdigest()
45
+ return ConfigurationManifest(
46
+ configuration_id=configuration_id,
47
+ retriever_id=retriever_id,
43
48
  name=name,
44
49
  created_at=utc_now_iso(),
45
- config=config,
50
+ configuration=configuration,
46
51
  description=description,
47
52
  )
48
53
 
49
54
 
50
- def create_run_manifest(
55
+ def create_snapshot_manifest(
51
56
  corpus: Corpus,
52
57
  *,
53
- recipe: RecipeManifest,
58
+ configuration: ConfigurationManifest,
54
59
  stats: Dict[str, Any],
55
- artifact_paths: Optional[List[str]] = None,
56
- ) -> RetrievalRun:
60
+ snapshot_artifacts: Optional[List[str]] = None,
61
+ ) -> RetrievalSnapshot:
57
62
  """
58
- Create a retrieval run manifest tied to the current catalog snapshot.
63
+ Create a retrieval snapshot manifest tied to the current catalog snapshot.
59
64
 
60
- :param corpus: Corpus used to generate the run.
65
+ :param corpus: Corpus used to generate the snapshot.
61
66
  :type corpus: Corpus
62
- :param recipe: Recipe manifest for the run.
63
- :type recipe: RecipeManifest
64
- :param stats: Backend-specific run statistics.
67
+ :param configuration: Configuration manifest for the snapshot.
68
+ :type configuration: ConfigurationManifest
69
+ :param stats: Retriever-specific snapshot statistics.
65
70
  :type stats: dict[str, Any]
66
- :param artifact_paths: Optional relative paths to materialized artifacts.
67
- :type artifact_paths: list[str] or None
68
- :return: Run manifest.
69
- :rtype: RetrievalRun
71
+ :param snapshot_artifacts: Optional relative paths to materialized artifacts.
72
+ :type snapshot_artifacts: list[str] or None
73
+ :return: Snapshot manifest.
74
+ :rtype: RetrievalSnapshot
70
75
  """
71
76
  catalog = corpus.load_catalog()
72
77
  created_at = utc_now_iso()
73
- run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
74
- return RetrievalRun(
75
- run_id=run_id,
76
- recipe=recipe,
78
+ snapshot_id = hashlib.sha256(
79
+ f"{configuration.configuration_id}:{created_at}".encode("utf-8")
80
+ ).hexdigest()
81
+ return RetrievalSnapshot(
82
+ snapshot_id=snapshot_id,
83
+ configuration=configuration,
77
84
  corpus_uri=catalog.corpus_uri,
78
85
  catalog_generated_at=catalog.generated_at,
79
86
  created_at=created_at,
80
- artifact_paths=list(artifact_paths or []),
87
+ snapshot_artifacts=list(snapshot_artifacts or []),
81
88
  stats=stats,
82
89
  )
83
90
 
@@ -124,8 +131,8 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
124
131
  continue
125
132
 
126
133
  text_character_count = len(candidate_evidence.text or "")
127
- if budget.max_total_characters is not None:
128
- if total_characters + text_character_count > budget.max_total_characters:
134
+ if budget.maximum_total_characters is not None:
135
+ if total_characters + text_character_count > budget.maximum_total_characters:
129
136
  continue
130
137
 
131
138
  selected_evidence.append(candidate_evidence)
@@ -0,0 +1,50 @@
1
+ """
2
+ Retriever registry for Biblicus retrieval engines.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, Type
8
+
9
+ from .base import Retriever
10
+ from .embedding_index_file import EmbeddingIndexFileRetriever
11
+ from .embedding_index_inmemory import EmbeddingIndexInMemoryRetriever
12
+ from .hybrid import HybridRetriever
13
+ from .scan import ScanRetriever
14
+ from .sqlite_full_text_search import SqliteFullTextSearchRetriever
15
+ from .tf_vector import TfVectorRetriever
16
+
17
+
18
+ def available_retrievers() -> Dict[str, Type[Retriever]]:
19
+ """
20
+ Return the registered retrievers.
21
+
22
+ :return: Mapping of retriever identifiers to retriever classes.
23
+ :rtype: dict[str, Type[Retriever]]
24
+ """
25
+ return {
26
+ EmbeddingIndexFileRetriever.retriever_id: EmbeddingIndexFileRetriever,
27
+ EmbeddingIndexInMemoryRetriever.retriever_id: EmbeddingIndexInMemoryRetriever,
28
+ HybridRetriever.retriever_id: HybridRetriever,
29
+ ScanRetriever.retriever_id: ScanRetriever,
30
+ SqliteFullTextSearchRetriever.retriever_id: SqliteFullTextSearchRetriever,
31
+ TfVectorRetriever.retriever_id: TfVectorRetriever,
32
+ }
33
+
34
+
35
+ def get_retriever(retriever_id: str) -> Retriever:
36
+ """
37
+ Instantiate a retriever by identifier.
38
+
39
+ :param retriever_id: Retriever identifier.
40
+ :type retriever_id: str
41
+ :return: Retriever instance.
42
+ :rtype: Retriever
43
+ :raises KeyError: If the retriever identifier is unknown.
44
+ """
45
+ registry = available_retrievers()
46
+ retriever_class = registry.get(retriever_id)
47
+ if retriever_class is None:
48
+ known = ", ".join(sorted(registry))
49
+ raise KeyError(f"Unknown retriever '{retriever_id}'. Known retrievers: {known}")
50
+ return retriever_class()