biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
  43. biblicus-1.1.1.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
biblicus/models.py CHANGED
@@ -117,8 +117,8 @@ class CorpusCatalog(BaseModel):
117
117
  :vartype corpus_uri: str
118
118
  :ivar raw_dir: Relative path to the raw items folder.
119
119
  :vartype raw_dir: str
120
- :ivar latest_run_id: Latest retrieval run identifier, if any.
121
- :vartype latest_run_id: str or None
120
+ :ivar latest_snapshot_id: Latest retrieval snapshot identifier, if any.
121
+ :vartype latest_snapshot_id: str or None
122
122
  :ivar items: Mapping of item IDs to catalog entries.
123
123
  :vartype items: dict[str, CatalogItem]
124
124
  :ivar order: Display order of item IDs (most recent first).
@@ -131,7 +131,7 @@ class CorpusCatalog(BaseModel):
131
131
  generated_at: str
132
132
  corpus_uri: str
133
133
  raw_dir: str = "raw"
134
- latest_run_id: Optional[str] = None
134
+ latest_snapshot_id: Optional[str] = None
135
135
  items: Dict[str, CatalogItem] = Field(default_factory=dict)
136
136
  order: List[str] = Field(default_factory=list)
137
137
 
@@ -142,79 +142,79 @@ class CorpusCatalog(BaseModel):
142
142
  return self
143
143
 
144
144
 
145
- class ExtractionRunReference(BaseModel):
145
+ class ExtractionSnapshotReference(BaseModel):
146
146
  """
147
- Reference to an extraction run.
147
+ Reference to an extraction snapshot.
148
148
 
149
149
  :ivar extractor_id: Extractor plugin identifier.
150
150
  :vartype extractor_id: str
151
- :ivar run_id: Extraction run identifier.
152
- :vartype run_id: str
151
+ :ivar snapshot_id: Extraction snapshot identifier.
152
+ :vartype snapshot_id: str
153
153
  """
154
154
 
155
155
  model_config = ConfigDict(extra="forbid")
156
156
 
157
157
  extractor_id: str = Field(min_length=1)
158
- run_id: str = Field(min_length=1)
158
+ snapshot_id: str = Field(min_length=1)
159
159
 
160
160
  def as_string(self) -> str:
161
161
  """
162
162
  Serialize the reference as a single string.
163
163
 
164
- :return: Reference in the form extractor_id:run_id.
164
+ :return: Reference in the form extractor_id:snapshot_id.
165
165
  :rtype: str
166
166
  """
167
- return f"{self.extractor_id}:{self.run_id}"
167
+ return f"{self.extractor_id}:{self.snapshot_id}"
168
168
 
169
169
 
170
- def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
170
+ def parse_extraction_snapshot_reference(value: str) -> ExtractionSnapshotReference:
171
171
  """
172
- Parse an extraction run reference in the form extractor_id:run_id.
172
+ Parse an extraction snapshot reference in the form extractor_id:snapshot_id.
173
173
 
174
174
  :param value: Raw reference string.
175
175
  :type value: str
176
- :return: Parsed extraction run reference.
177
- :rtype: ExtractionRunReference
176
+ :return: Parsed extraction snapshot reference.
177
+ :rtype: ExtractionSnapshotReference
178
178
  :raises ValueError: If the reference is not well formed.
179
179
  """
180
180
  if ":" not in value:
181
- raise ValueError("Extraction run reference must be extractor_id:run_id")
182
- extractor_id, run_id = value.split(":", 1)
181
+ raise ValueError("Extraction snapshot reference must be extractor_id:snapshot_id")
182
+ extractor_id, snapshot_id = value.split(":", 1)
183
183
  extractor_id = extractor_id.strip()
184
- run_id = run_id.strip()
185
- if not extractor_id or not run_id:
184
+ snapshot_id = snapshot_id.strip()
185
+ if not extractor_id or not snapshot_id:
186
186
  raise ValueError(
187
- "Extraction run reference must be extractor_id:run_id with non-empty parts"
187
+ "Extraction snapshot reference must be extractor_id:snapshot_id with non-empty parts"
188
188
  )
189
- return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
189
+ return ExtractionSnapshotReference(extractor_id=extractor_id, snapshot_id=snapshot_id)
190
190
 
191
191
 
192
- class ExtractionRunListEntry(BaseModel):
192
+ class ExtractionSnapshotListEntry(BaseModel):
193
193
  """
194
- Summary entry for an extraction run stored in a corpus.
194
+ Summary entry for an extraction snapshot stored in a corpus.
195
195
 
196
196
  :ivar extractor_id: Extractor plugin identifier.
197
197
  :vartype extractor_id: str
198
- :ivar run_id: Extraction run identifier.
199
- :vartype run_id: str
200
- :ivar recipe_id: Deterministic recipe identifier.
201
- :vartype recipe_id: str
202
- :ivar recipe_name: Human-readable recipe name.
203
- :vartype recipe_name: str
204
- :ivar catalog_generated_at: Catalog timestamp used for the run.
198
+ :ivar snapshot_id: Extraction snapshot identifier.
199
+ :vartype snapshot_id: str
200
+ :ivar configuration_id: Deterministic configuration identifier.
201
+ :vartype configuration_id: str
202
+ :ivar configuration_name: Human-readable configuration name.
203
+ :vartype configuration_name: str
204
+ :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
205
205
  :vartype catalog_generated_at: str
206
- :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
206
+ :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
207
207
  :vartype created_at: str
208
- :ivar stats: Run statistics.
208
+ :ivar stats: Snapshot statistics.
209
209
  :vartype stats: dict[str, object]
210
210
  """
211
211
 
212
212
  model_config = ConfigDict(extra="forbid")
213
213
 
214
214
  extractor_id: str = Field(min_length=1)
215
- run_id: str = Field(min_length=1)
216
- recipe_id: str = Field(min_length=1)
217
- recipe_name: str = Field(min_length=1)
215
+ snapshot_id: str = Field(min_length=1)
216
+ configuration_id: str = Field(min_length=1)
217
+ configuration_name: str = Field(min_length=1)
218
218
  catalog_generated_at: str = Field(min_length=1)
219
219
  created_at: str = Field(min_length=1)
220
220
  stats: Dict[str, object] = Field(default_factory=dict)
@@ -250,7 +250,7 @@ class QueryBudget(BaseModel):
250
250
 
251
251
  class Evidence(BaseModel):
252
252
  """
253
- Structured retrieval evidence returned from a backend.
253
+ Structured retrieval evidence returned from a retriever.
254
254
 
255
255
  :ivar item_id: Item identifier that produced the evidence.
256
256
  :vartype item_id: str
@@ -274,10 +274,10 @@ class Evidence(BaseModel):
274
274
  :vartype stage: str
275
275
  :ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
276
276
  :vartype stage_scores: dict[str, float] or None
277
- :ivar recipe_id: Recipe identifier used to create the run.
278
- :vartype recipe_id: str
279
- :ivar run_id: Retrieval run identifier.
280
- :vartype run_id: str
277
+ :ivar configuration_id: Configuration identifier used to create the snapshot.
278
+ :vartype configuration_id: str
279
+ :ivar snapshot_id: Retrieval snapshot identifier.
280
+ :vartype snapshot_id: str
281
281
  :ivar metadata: Optional metadata payload from the catalog item.
282
282
  :vartype metadata: dict[str, Any]
283
283
  :ivar hash: Optional content hash for provenance.
@@ -297,8 +297,8 @@ class Evidence(BaseModel):
297
297
  span_end: Optional[int] = None
298
298
  stage: str
299
299
  stage_scores: Optional[Dict[str, float]] = None
300
- recipe_id: str
301
- run_id: str
300
+ configuration_id: str
301
+ snapshot_id: str
302
302
  metadata: Dict[str, Any] = Field(default_factory=dict)
303
303
  hash: Optional[str] = None
304
304
 
@@ -311,79 +311,79 @@ class Evidence(BaseModel):
311
311
  return self
312
312
 
313
313
 
314
- class RecipeManifest(BaseModel):
314
+ class ConfigurationManifest(BaseModel):
315
315
  """
316
- Reproducible configuration for a retrieval backend.
316
+ Reproducible configuration for a retriever.
317
317
 
318
- :ivar recipe_id: Deterministic recipe identifier.
319
- :vartype recipe_id: str
320
- :ivar backend_id: Backend identifier for the recipe.
321
- :vartype backend_id: str
322
- :ivar name: Human-readable name for the recipe.
318
+ :ivar configuration_id: Deterministic configuration identifier.
319
+ :vartype configuration_id: str
320
+ :ivar retriever_id: Retriever identifier for the configuration.
321
+ :vartype retriever_id: str
322
+ :ivar name: Human-readable name for the configuration.
323
323
  :vartype name: str
324
- :ivar created_at: International Organization for Standardization 8601 timestamp for recipe creation.
324
+ :ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
325
325
  :vartype created_at: str
326
- :ivar config: Backend-specific configuration values.
327
- :vartype config: dict[str, Any]
326
+ :ivar configuration: Retriever-specific configuration values.
327
+ :vartype configuration: dict[str, Any]
328
328
  :ivar description: Optional human description.
329
329
  :vartype description: str or None
330
330
  """
331
331
 
332
332
  model_config = ConfigDict(extra="forbid")
333
333
 
334
- recipe_id: str
335
- backend_id: str
334
+ configuration_id: str
335
+ retriever_id: str
336
336
  name: str
337
337
  created_at: str
338
- config: Dict[str, Any] = Field(default_factory=dict)
338
+ configuration: Dict[str, Any] = Field(default_factory=dict)
339
339
  description: Optional[str] = None
340
340
 
341
341
 
342
- class RetrievalRun(BaseModel):
342
+ class RetrievalSnapshot(BaseModel):
343
343
  """
344
- Immutable record of a retrieval materialization or on-demand run.
344
+ Immutable record of a retrieval snapshot.
345
345
 
346
- :ivar run_id: Unique run identifier.
347
- :vartype run_id: str
348
- :ivar recipe: Recipe manifest for this run.
349
- :vartype recipe: RecipeManifest
346
+ :ivar snapshot_id: Unique snapshot identifier.
347
+ :vartype snapshot_id: str
348
+ :ivar configuration: Configuration manifest for this snapshot.
349
+ :vartype configuration: ConfigurationManifest
350
350
  :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
351
351
  :vartype corpus_uri: str
352
- :ivar catalog_generated_at: Catalog timestamp used for the run.
352
+ :ivar catalog_generated_at: Catalog timestamp used for the snapshot.
353
353
  :vartype catalog_generated_at: str
354
- :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
354
+ :ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
355
355
  :vartype created_at: str
356
- :ivar artifact_paths: Relative paths to materialized artifacts.
357
- :vartype artifact_paths: list[str]
358
- :ivar stats: Backend-specific run statistics.
356
+ :ivar snapshot_artifacts: Relative paths to materialized artifacts.
357
+ :vartype snapshot_artifacts: list[str]
358
+ :ivar stats: Retriever-specific snapshot statistics.
359
359
  :vartype stats: dict[str, Any]
360
360
  """
361
361
 
362
362
  model_config = ConfigDict(extra="forbid")
363
363
 
364
- run_id: str
365
- recipe: RecipeManifest
364
+ snapshot_id: str
365
+ configuration: ConfigurationManifest
366
366
  corpus_uri: str
367
367
  catalog_generated_at: str
368
368
  created_at: str
369
- artifact_paths: List[str] = Field(default_factory=list)
369
+ snapshot_artifacts: List[str] = Field(default_factory=list)
370
370
  stats: Dict[str, Any] = Field(default_factory=dict)
371
371
 
372
372
 
373
373
  class RetrievalResult(BaseModel):
374
374
  """
375
- Retrieval result bundle returned from a backend query.
375
+ Retrieval result bundle returned from a retriever query.
376
376
 
377
377
  :ivar query_text: Query text issued against the backend.
378
378
  :vartype query_text: str
379
379
  :ivar budget: Evidence selection budget applied to results.
380
380
  :vartype budget: QueryBudget
381
- :ivar run_id: Retrieval run identifier.
382
- :vartype run_id: str
383
- :ivar recipe_id: Recipe identifier used for this query.
384
- :vartype recipe_id: str
385
- :ivar backend_id: Backend identifier used for this query.
386
- :vartype backend_id: str
381
+ :ivar snapshot_id: Retrieval snapshot identifier.
382
+ :vartype snapshot_id: str
383
+ :ivar configuration_id: Configuration identifier used for this query.
384
+ :vartype configuration_id: str
385
+ :ivar retriever_id: Retriever identifier used for this query.
386
+ :vartype retriever_id: str
387
387
  :ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
388
388
  :vartype generated_at: str
389
389
  :ivar evidence: Evidence objects selected under the budget.
@@ -396,9 +396,9 @@ class RetrievalResult(BaseModel):
396
396
 
397
397
  query_text: str
398
398
  budget: QueryBudget
399
- run_id: str
400
- recipe_id: str
401
- backend_id: str
399
+ snapshot_id: str
400
+ configuration_id: str
401
+ retriever_id: str
402
402
  generated_at: str
403
403
  evidence: List[Evidence] = Field(default_factory=list)
404
404
  stats: Dict[str, Any] = Field(default_factory=dict)
biblicus/retrieval.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """
2
- Shared retrieval helpers for Biblicus backends.
2
+ Shared retrieval helpers for Biblicus retrievers.
3
3
  """
4
4
 
5
5
  from __future__ import annotations
@@ -9,75 +9,82 @@ import json
9
9
  from typing import Any, Dict, Iterable, List, Optional
10
10
 
11
11
  from .corpus import Corpus
12
- from .models import Evidence, QueryBudget, RecipeManifest, RetrievalRun
12
+ from .models import (
13
+ ConfigurationManifest,
14
+ Evidence,
15
+ QueryBudget,
16
+ RetrievalSnapshot,
17
+ )
13
18
  from .time import utc_now_iso
14
19
 
15
20
 
16
- def create_recipe_manifest(
21
+ def create_configuration_manifest(
17
22
  *,
18
- backend_id: str,
23
+ retriever_id: str,
19
24
  name: str,
20
- config: Dict[str, Any],
25
+ configuration: Dict[str, Any],
21
26
  description: Optional[str] = None,
22
- ) -> RecipeManifest:
27
+ ) -> ConfigurationManifest:
23
28
  """
24
- Create a deterministic recipe manifest from a backend configuration.
29
+ Create a deterministic configuration manifest from a retriever configuration.
25
30
 
26
- :param backend_id: Backend identifier for the recipe.
27
- :type backend_id: str
28
- :param name: Human-readable recipe name.
31
+ :param retriever_id: Retriever identifier for the configuration.
32
+ :type retriever_id: str
33
+ :param name: Human-readable configuration name.
29
34
  :type name: str
30
- :param config: Backend-specific configuration values.
31
- :type config: dict[str, Any]
32
- :param description: Optional recipe description.
35
+ :param configuration: Retriever-specific configuration values.
36
+ :type configuration: dict[str, Any]
37
+ :param description: Optional configuration description.
33
38
  :type description: str or None
34
- :return: Deterministic recipe manifest.
35
- :rtype: RecipeManifest
39
+ :return: Deterministic configuration manifest.
40
+ :rtype: ConfigurationManifest
36
41
  """
37
- config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
38
- recipe_seed = f"{backend_id}:{config_json}"
39
- recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
40
- return RecipeManifest(
41
- recipe_id=recipe_id,
42
- backend_id=backend_id,
42
+ config_json = json.dumps(configuration, sort_keys=True, separators=(",", ":"))
43
+ configuration_seed = f"{retriever_id}:{config_json}"
44
+ configuration_id = hashlib.sha256(configuration_seed.encode("utf-8")).hexdigest()
45
+ return ConfigurationManifest(
46
+ configuration_id=configuration_id,
47
+ retriever_id=retriever_id,
43
48
  name=name,
44
49
  created_at=utc_now_iso(),
45
- config=config,
50
+ configuration=configuration,
46
51
  description=description,
47
52
  )
48
53
 
49
54
 
50
- def create_run_manifest(
55
+ def create_snapshot_manifest(
51
56
  corpus: Corpus,
52
57
  *,
53
- recipe: RecipeManifest,
58
+ configuration: ConfigurationManifest,
54
59
  stats: Dict[str, Any],
55
- artifact_paths: Optional[List[str]] = None,
56
- ) -> RetrievalRun:
60
+ snapshot_artifacts: Optional[List[str]] = None,
61
+ ) -> RetrievalSnapshot:
57
62
  """
58
- Create a retrieval run manifest tied to the current catalog snapshot.
63
+ Create a retrieval snapshot manifest tied to the current catalog snapshot.
59
64
 
60
- :param corpus: Corpus used to generate the run.
65
+ :param corpus: Corpus used to generate the snapshot.
61
66
  :type corpus: Corpus
62
- :param recipe: Recipe manifest for the run.
63
- :type recipe: RecipeManifest
64
- :param stats: Backend-specific run statistics.
67
+ :param configuration: Configuration manifest for the snapshot.
68
+ :type configuration: ConfigurationManifest
69
+ :param stats: Retriever-specific snapshot statistics.
65
70
  :type stats: dict[str, Any]
66
- :param artifact_paths: Optional relative paths to materialized artifacts.
67
- :type artifact_paths: list[str] or None
68
- :return: Run manifest.
69
- :rtype: RetrievalRun
71
+ :param snapshot_artifacts: Optional relative paths to materialized artifacts.
72
+ :type snapshot_artifacts: list[str] or None
73
+ :return: Snapshot manifest.
74
+ :rtype: RetrievalSnapshot
70
75
  """
71
76
  catalog = corpus.load_catalog()
72
77
  created_at = utc_now_iso()
73
- run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
74
- return RetrievalRun(
75
- run_id=run_id,
76
- recipe=recipe,
78
+ snapshot_id = hashlib.sha256(
79
+ f"{configuration.configuration_id}:{created_at}".encode("utf-8")
80
+ ).hexdigest()
81
+ return RetrievalSnapshot(
82
+ snapshot_id=snapshot_id,
83
+ configuration=configuration,
77
84
  corpus_uri=catalog.corpus_uri,
78
85
  catalog_generated_at=catalog.generated_at,
79
86
  created_at=created_at,
80
- artifact_paths=list(artifact_paths or []),
87
+ snapshot_artifacts=list(snapshot_artifacts or []),
81
88
  stats=stats,
82
89
  )
83
90
 
biblicus/retrievers/__init__.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ Retriever registry for Biblicus retrieval engines.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict, Type
8
+
9
+ from .base import Retriever
10
+ from .embedding_index_file import EmbeddingIndexFileRetriever
11
+ from .embedding_index_inmemory import EmbeddingIndexInMemoryRetriever
12
+ from .hybrid import HybridRetriever
13
+ from .scan import ScanRetriever
14
+ from .sqlite_full_text_search import SqliteFullTextSearchRetriever
15
+ from .tf_vector import TfVectorRetriever
16
+
17
+
18
+ def available_retrievers() -> Dict[str, Type[Retriever]]:
19
+ """
20
+ Return the registered retrievers.
21
+
22
+ :return: Mapping of retriever identifiers to retriever classes.
23
+ :rtype: dict[str, Type[Retriever]]
24
+ """
25
+ return {
26
+ EmbeddingIndexFileRetriever.retriever_id: EmbeddingIndexFileRetriever,
27
+ EmbeddingIndexInMemoryRetriever.retriever_id: EmbeddingIndexInMemoryRetriever,
28
+ HybridRetriever.retriever_id: HybridRetriever,
29
+ ScanRetriever.retriever_id: ScanRetriever,
30
+ SqliteFullTextSearchRetriever.retriever_id: SqliteFullTextSearchRetriever,
31
+ TfVectorRetriever.retriever_id: TfVectorRetriever,
32
+ }
33
+
34
+
35
+ def get_retriever(retriever_id: str) -> Retriever:
36
+ """
37
+ Instantiate a retriever by identifier.
38
+
39
+ :param retriever_id: Retriever identifier.
40
+ :type retriever_id: str
41
+ :return: Retriever instance.
42
+ :rtype: Retriever
43
+ :raises KeyError: If the retriever identifier is unknown.
44
+ """
45
+ registry = available_retrievers()
46
+ retriever_class = registry.get(retriever_id)
47
+ if retriever_class is None:
48
+ known = ", ".join(sorted(registry))
49
+ raise KeyError(f"Unknown retriever '{retriever_id}'. Known retrievers: {known}")
50
+ return retriever_class()
biblicus/retrievers/base.py ADDED
@@ -0,0 +1,65 @@
1
+ """
2
+ Retriever interface for Biblicus retrieval engines.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Dict
9
+
10
+ from ..corpus import Corpus
11
+ from ..models import QueryBudget, RetrievalResult, RetrievalSnapshot
12
+
13
+
14
+ class Retriever(ABC):
15
+ """
16
+ Abstract interface for retrievers.
17
+
18
+ :ivar retriever_id: Identifier string for the retriever.
19
+ :vartype retriever_id: str
20
+ """
21
+
22
+ retriever_id: str
23
+
24
+ @abstractmethod
25
+ def build_snapshot(
26
+ self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
27
+ ) -> RetrievalSnapshot:
28
+ """
29
+ Build or register a retrieval snapshot for the retriever.
30
+
31
+ :param corpus: Corpus to build against.
32
+ :type corpus: Corpus
33
+ :param configuration_name: Human name for the configuration.
34
+ :type configuration_name: str
35
+ :param configuration: Retriever-specific configuration values.
36
+ :type configuration: dict[str, object]
37
+ :return: Snapshot manifest describing the build.
38
+ :rtype: RetrievalSnapshot
39
+ """
40
+ raise NotImplementedError
41
+
42
+ @abstractmethod
43
+ def query(
44
+ self,
45
+ corpus: Corpus,
46
+ *,
47
+ snapshot: RetrievalSnapshot,
48
+ query_text: str,
49
+ budget: QueryBudget,
50
+ ) -> RetrievalResult:
51
+ """
52
+ Run a retrieval query against a retriever.
53
+
54
+ :param corpus: Corpus associated with the snapshot.
55
+ :type corpus: Corpus
56
+ :param snapshot: Snapshot manifest to use for querying.
57
+ :type snapshot: RetrievalSnapshot
58
+ :param query_text: Query text to execute.
59
+ :type query_text: str
60
+ :param budget: Evidence selection budget.
61
+ :type budget: QueryBudget
62
+ :return: Retrieval results containing evidence.
63
+ :rtype: RetrievalResult
64
+ """
65
+ raise NotImplementedError