biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/models.py
CHANGED
|
@@ -117,8 +117,8 @@ class CorpusCatalog(BaseModel):
|
|
|
117
117
|
:vartype corpus_uri: str
|
|
118
118
|
:ivar raw_dir: Relative path to the raw items folder.
|
|
119
119
|
:vartype raw_dir: str
|
|
120
|
-
:ivar
|
|
121
|
-
:vartype
|
|
120
|
+
:ivar latest_snapshot_id: Latest retrieval snapshot identifier, if any.
|
|
121
|
+
:vartype latest_snapshot_id: str or None
|
|
122
122
|
:ivar items: Mapping of item IDs to catalog entries.
|
|
123
123
|
:vartype items: dict[str, CatalogItem]
|
|
124
124
|
:ivar order: Display order of item IDs (most recent first).
|
|
@@ -131,7 +131,7 @@ class CorpusCatalog(BaseModel):
|
|
|
131
131
|
generated_at: str
|
|
132
132
|
corpus_uri: str
|
|
133
133
|
raw_dir: str = "raw"
|
|
134
|
-
|
|
134
|
+
latest_snapshot_id: Optional[str] = None
|
|
135
135
|
items: Dict[str, CatalogItem] = Field(default_factory=dict)
|
|
136
136
|
order: List[str] = Field(default_factory=list)
|
|
137
137
|
|
|
@@ -142,79 +142,79 @@ class CorpusCatalog(BaseModel):
|
|
|
142
142
|
return self
|
|
143
143
|
|
|
144
144
|
|
|
145
|
-
class
|
|
145
|
+
class ExtractionSnapshotReference(BaseModel):
|
|
146
146
|
"""
|
|
147
|
-
Reference to an extraction
|
|
147
|
+
Reference to an extraction snapshot.
|
|
148
148
|
|
|
149
149
|
:ivar extractor_id: Extractor plugin identifier.
|
|
150
150
|
:vartype extractor_id: str
|
|
151
|
-
:ivar
|
|
152
|
-
:vartype
|
|
151
|
+
:ivar snapshot_id: Extraction snapshot identifier.
|
|
152
|
+
:vartype snapshot_id: str
|
|
153
153
|
"""
|
|
154
154
|
|
|
155
155
|
model_config = ConfigDict(extra="forbid")
|
|
156
156
|
|
|
157
157
|
extractor_id: str = Field(min_length=1)
|
|
158
|
-
|
|
158
|
+
snapshot_id: str = Field(min_length=1)
|
|
159
159
|
|
|
160
160
|
def as_string(self) -> str:
|
|
161
161
|
"""
|
|
162
162
|
Serialize the reference as a single string.
|
|
163
163
|
|
|
164
|
-
:return: Reference in the form extractor_id:
|
|
164
|
+
:return: Reference in the form extractor_id:snapshot_id.
|
|
165
165
|
:rtype: str
|
|
166
166
|
"""
|
|
167
|
-
return f"{self.extractor_id}:{self.
|
|
167
|
+
return f"{self.extractor_id}:{self.snapshot_id}"
|
|
168
168
|
|
|
169
169
|
|
|
170
|
-
def
|
|
170
|
+
def parse_extraction_snapshot_reference(value: str) -> ExtractionSnapshotReference:
|
|
171
171
|
"""
|
|
172
|
-
Parse an extraction
|
|
172
|
+
Parse an extraction snapshot reference in the form extractor_id:snapshot_id.
|
|
173
173
|
|
|
174
174
|
:param value: Raw reference string.
|
|
175
175
|
:type value: str
|
|
176
|
-
:return: Parsed extraction
|
|
177
|
-
:rtype:
|
|
176
|
+
:return: Parsed extraction snapshot reference.
|
|
177
|
+
:rtype: ExtractionSnapshotReference
|
|
178
178
|
:raises ValueError: If the reference is not well formed.
|
|
179
179
|
"""
|
|
180
180
|
if ":" not in value:
|
|
181
|
-
raise ValueError("Extraction
|
|
182
|
-
extractor_id,
|
|
181
|
+
raise ValueError("Extraction snapshot reference must be extractor_id:snapshot_id")
|
|
182
|
+
extractor_id, snapshot_id = value.split(":", 1)
|
|
183
183
|
extractor_id = extractor_id.strip()
|
|
184
|
-
|
|
185
|
-
if not extractor_id or not
|
|
184
|
+
snapshot_id = snapshot_id.strip()
|
|
185
|
+
if not extractor_id or not snapshot_id:
|
|
186
186
|
raise ValueError(
|
|
187
|
-
"Extraction
|
|
187
|
+
"Extraction snapshot reference must be extractor_id:snapshot_id with non-empty parts"
|
|
188
188
|
)
|
|
189
|
-
return
|
|
189
|
+
return ExtractionSnapshotReference(extractor_id=extractor_id, snapshot_id=snapshot_id)
|
|
190
190
|
|
|
191
191
|
|
|
192
|
-
class
|
|
192
|
+
class ExtractionSnapshotListEntry(BaseModel):
|
|
193
193
|
"""
|
|
194
|
-
Summary entry for an extraction
|
|
194
|
+
Summary entry for an extraction snapshot stored in a corpus.
|
|
195
195
|
|
|
196
196
|
:ivar extractor_id: Extractor plugin identifier.
|
|
197
197
|
:vartype extractor_id: str
|
|
198
|
-
:ivar
|
|
199
|
-
:vartype
|
|
200
|
-
:ivar
|
|
201
|
-
:vartype
|
|
202
|
-
:ivar
|
|
203
|
-
:vartype
|
|
204
|
-
:ivar catalog_generated_at: Catalog timestamp used for the
|
|
198
|
+
:ivar snapshot_id: Extraction snapshot identifier.
|
|
199
|
+
:vartype snapshot_id: str
|
|
200
|
+
:ivar configuration_id: Deterministic configuration identifier.
|
|
201
|
+
:vartype configuration_id: str
|
|
202
|
+
:ivar configuration_name: Human-readable configuration name.
|
|
203
|
+
:vartype configuration_name: str
|
|
204
|
+
:ivar catalog_generated_at: Catalog timestamp used for the snapshot.
|
|
205
205
|
:vartype catalog_generated_at: str
|
|
206
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
206
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
|
|
207
207
|
:vartype created_at: str
|
|
208
|
-
:ivar stats:
|
|
208
|
+
:ivar stats: Snapshot statistics.
|
|
209
209
|
:vartype stats: dict[str, object]
|
|
210
210
|
"""
|
|
211
211
|
|
|
212
212
|
model_config = ConfigDict(extra="forbid")
|
|
213
213
|
|
|
214
214
|
extractor_id: str = Field(min_length=1)
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
215
|
+
snapshot_id: str = Field(min_length=1)
|
|
216
|
+
configuration_id: str = Field(min_length=1)
|
|
217
|
+
configuration_name: str = Field(min_length=1)
|
|
218
218
|
catalog_generated_at: str = Field(min_length=1)
|
|
219
219
|
created_at: str = Field(min_length=1)
|
|
220
220
|
stats: Dict[str, object] = Field(default_factory=dict)
|
|
@@ -250,7 +250,7 @@ class QueryBudget(BaseModel):
|
|
|
250
250
|
|
|
251
251
|
class Evidence(BaseModel):
|
|
252
252
|
"""
|
|
253
|
-
Structured retrieval evidence returned from a
|
|
253
|
+
Structured retrieval evidence returned from a retriever.
|
|
254
254
|
|
|
255
255
|
:ivar item_id: Item identifier that produced the evidence.
|
|
256
256
|
:vartype item_id: str
|
|
@@ -274,10 +274,10 @@ class Evidence(BaseModel):
|
|
|
274
274
|
:vartype stage: str
|
|
275
275
|
:ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
|
|
276
276
|
:vartype stage_scores: dict[str, float] or None
|
|
277
|
-
:ivar
|
|
278
|
-
:vartype
|
|
279
|
-
:ivar
|
|
280
|
-
:vartype
|
|
277
|
+
:ivar configuration_id: Configuration identifier used to create the snapshot.
|
|
278
|
+
:vartype configuration_id: str
|
|
279
|
+
:ivar snapshot_id: Retrieval snapshot identifier.
|
|
280
|
+
:vartype snapshot_id: str
|
|
281
281
|
:ivar metadata: Optional metadata payload from the catalog item.
|
|
282
282
|
:vartype metadata: dict[str, Any]
|
|
283
283
|
:ivar hash: Optional content hash for provenance.
|
|
@@ -297,8 +297,8 @@ class Evidence(BaseModel):
|
|
|
297
297
|
span_end: Optional[int] = None
|
|
298
298
|
stage: str
|
|
299
299
|
stage_scores: Optional[Dict[str, float]] = None
|
|
300
|
-
|
|
301
|
-
|
|
300
|
+
configuration_id: str
|
|
301
|
+
snapshot_id: str
|
|
302
302
|
metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
303
303
|
hash: Optional[str] = None
|
|
304
304
|
|
|
@@ -311,79 +311,79 @@ class Evidence(BaseModel):
|
|
|
311
311
|
return self
|
|
312
312
|
|
|
313
313
|
|
|
314
|
-
class
|
|
314
|
+
class ConfigurationManifest(BaseModel):
|
|
315
315
|
"""
|
|
316
|
-
Reproducible configuration for a
|
|
316
|
+
Reproducible configuration for a retriever.
|
|
317
317
|
|
|
318
|
-
:ivar
|
|
319
|
-
:vartype
|
|
320
|
-
:ivar
|
|
321
|
-
:vartype
|
|
322
|
-
:ivar name: Human-readable name for the
|
|
318
|
+
:ivar configuration_id: Deterministic configuration identifier.
|
|
319
|
+
:vartype configuration_id: str
|
|
320
|
+
:ivar retriever_id: Retriever identifier for the configuration.
|
|
321
|
+
:vartype retriever_id: str
|
|
322
|
+
:ivar name: Human-readable name for the configuration.
|
|
323
323
|
:vartype name: str
|
|
324
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
324
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
|
|
325
325
|
:vartype created_at: str
|
|
326
|
-
:ivar
|
|
327
|
-
:vartype
|
|
326
|
+
:ivar configuration: Retriever-specific configuration values.
|
|
327
|
+
:vartype configuration: dict[str, Any]
|
|
328
328
|
:ivar description: Optional human description.
|
|
329
329
|
:vartype description: str or None
|
|
330
330
|
"""
|
|
331
331
|
|
|
332
332
|
model_config = ConfigDict(extra="forbid")
|
|
333
333
|
|
|
334
|
-
|
|
335
|
-
|
|
334
|
+
configuration_id: str
|
|
335
|
+
retriever_id: str
|
|
336
336
|
name: str
|
|
337
337
|
created_at: str
|
|
338
|
-
|
|
338
|
+
configuration: Dict[str, Any] = Field(default_factory=dict)
|
|
339
339
|
description: Optional[str] = None
|
|
340
340
|
|
|
341
341
|
|
|
342
|
-
class
|
|
342
|
+
class RetrievalSnapshot(BaseModel):
|
|
343
343
|
"""
|
|
344
|
-
Immutable record of a retrieval
|
|
344
|
+
Immutable record of a retrieval snapshot.
|
|
345
345
|
|
|
346
|
-
:ivar
|
|
347
|
-
:vartype
|
|
348
|
-
:ivar
|
|
349
|
-
:vartype
|
|
346
|
+
:ivar snapshot_id: Unique snapshot identifier.
|
|
347
|
+
:vartype snapshot_id: str
|
|
348
|
+
:ivar configuration: Configuration manifest for this snapshot.
|
|
349
|
+
:vartype configuration: ConfigurationManifest
|
|
350
350
|
:ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
|
|
351
351
|
:vartype corpus_uri: str
|
|
352
|
-
:ivar catalog_generated_at: Catalog timestamp used for the
|
|
352
|
+
:ivar catalog_generated_at: Catalog timestamp used for the snapshot.
|
|
353
353
|
:vartype catalog_generated_at: str
|
|
354
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
354
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
|
|
355
355
|
:vartype created_at: str
|
|
356
|
-
:ivar
|
|
357
|
-
:vartype
|
|
358
|
-
:ivar stats:
|
|
356
|
+
:ivar snapshot_artifacts: Relative paths to materialized artifacts.
|
|
357
|
+
:vartype snapshot_artifacts: list[str]
|
|
358
|
+
:ivar stats: Retriever-specific snapshot statistics.
|
|
359
359
|
:vartype stats: dict[str, Any]
|
|
360
360
|
"""
|
|
361
361
|
|
|
362
362
|
model_config = ConfigDict(extra="forbid")
|
|
363
363
|
|
|
364
|
-
|
|
365
|
-
|
|
364
|
+
snapshot_id: str
|
|
365
|
+
configuration: ConfigurationManifest
|
|
366
366
|
corpus_uri: str
|
|
367
367
|
catalog_generated_at: str
|
|
368
368
|
created_at: str
|
|
369
|
-
|
|
369
|
+
snapshot_artifacts: List[str] = Field(default_factory=list)
|
|
370
370
|
stats: Dict[str, Any] = Field(default_factory=dict)
|
|
371
371
|
|
|
372
372
|
|
|
373
373
|
class RetrievalResult(BaseModel):
|
|
374
374
|
"""
|
|
375
|
-
Retrieval result bundle returned from a
|
|
375
|
+
Retrieval result bundle returned from a retriever query.
|
|
376
376
|
|
|
377
377
|
:ivar query_text: Query text issued against the backend.
|
|
378
378
|
:vartype query_text: str
|
|
379
379
|
:ivar budget: Evidence selection budget applied to results.
|
|
380
380
|
:vartype budget: QueryBudget
|
|
381
|
-
:ivar
|
|
382
|
-
:vartype
|
|
383
|
-
:ivar
|
|
384
|
-
:vartype
|
|
385
|
-
:ivar
|
|
386
|
-
:vartype
|
|
381
|
+
:ivar snapshot_id: Retrieval snapshot identifier.
|
|
382
|
+
:vartype snapshot_id: str
|
|
383
|
+
:ivar configuration_id: Configuration identifier used for this query.
|
|
384
|
+
:vartype configuration_id: str
|
|
385
|
+
:ivar retriever_id: Retriever identifier used for this query.
|
|
386
|
+
:vartype retriever_id: str
|
|
387
387
|
:ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
|
|
388
388
|
:vartype generated_at: str
|
|
389
389
|
:ivar evidence: Evidence objects selected under the budget.
|
|
@@ -396,9 +396,9 @@ class RetrievalResult(BaseModel):
|
|
|
396
396
|
|
|
397
397
|
query_text: str
|
|
398
398
|
budget: QueryBudget
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
399
|
+
snapshot_id: str
|
|
400
|
+
configuration_id: str
|
|
401
|
+
retriever_id: str
|
|
402
402
|
generated_at: str
|
|
403
403
|
evidence: List[Evidence] = Field(default_factory=list)
|
|
404
404
|
stats: Dict[str, Any] = Field(default_factory=dict)
|
biblicus/retrieval.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Shared retrieval helpers for Biblicus
|
|
2
|
+
Shared retrieval helpers for Biblicus retrievers.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -9,75 +9,82 @@ import json
|
|
|
9
9
|
from typing import Any, Dict, Iterable, List, Optional
|
|
10
10
|
|
|
11
11
|
from .corpus import Corpus
|
|
12
|
-
from .models import
|
|
12
|
+
from .models import (
|
|
13
|
+
ConfigurationManifest,
|
|
14
|
+
Evidence,
|
|
15
|
+
QueryBudget,
|
|
16
|
+
RetrievalSnapshot,
|
|
17
|
+
)
|
|
13
18
|
from .time import utc_now_iso
|
|
14
19
|
|
|
15
20
|
|
|
16
|
-
def
|
|
21
|
+
def create_configuration_manifest(
|
|
17
22
|
*,
|
|
18
|
-
|
|
23
|
+
retriever_id: str,
|
|
19
24
|
name: str,
|
|
20
|
-
|
|
25
|
+
configuration: Dict[str, Any],
|
|
21
26
|
description: Optional[str] = None,
|
|
22
|
-
) ->
|
|
27
|
+
) -> ConfigurationManifest:
|
|
23
28
|
"""
|
|
24
|
-
Create a deterministic
|
|
29
|
+
Create a deterministic configuration manifest from a retriever configuration.
|
|
25
30
|
|
|
26
|
-
:param
|
|
27
|
-
:type
|
|
28
|
-
:param name: Human-readable
|
|
31
|
+
:param retriever_id: Retriever identifier for the configuration.
|
|
32
|
+
:type retriever_id: str
|
|
33
|
+
:param name: Human-readable configuration name.
|
|
29
34
|
:type name: str
|
|
30
|
-
:param
|
|
31
|
-
:type
|
|
32
|
-
:param description: Optional
|
|
35
|
+
:param configuration: Retriever-specific configuration values.
|
|
36
|
+
:type configuration: dict[str, Any]
|
|
37
|
+
:param description: Optional configuration description.
|
|
33
38
|
:type description: str or None
|
|
34
|
-
:return: Deterministic
|
|
35
|
-
:rtype:
|
|
39
|
+
:return: Deterministic configuration manifest.
|
|
40
|
+
:rtype: ConfigurationManifest
|
|
36
41
|
"""
|
|
37
|
-
config_json = json.dumps(
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
return
|
|
41
|
-
|
|
42
|
-
|
|
42
|
+
config_json = json.dumps(configuration, sort_keys=True, separators=(",", ":"))
|
|
43
|
+
configuration_seed = f"{retriever_id}:{config_json}"
|
|
44
|
+
configuration_id = hashlib.sha256(configuration_seed.encode("utf-8")).hexdigest()
|
|
45
|
+
return ConfigurationManifest(
|
|
46
|
+
configuration_id=configuration_id,
|
|
47
|
+
retriever_id=retriever_id,
|
|
43
48
|
name=name,
|
|
44
49
|
created_at=utc_now_iso(),
|
|
45
|
-
|
|
50
|
+
configuration=configuration,
|
|
46
51
|
description=description,
|
|
47
52
|
)
|
|
48
53
|
|
|
49
54
|
|
|
50
|
-
def
|
|
55
|
+
def create_snapshot_manifest(
|
|
51
56
|
corpus: Corpus,
|
|
52
57
|
*,
|
|
53
|
-
|
|
58
|
+
configuration: ConfigurationManifest,
|
|
54
59
|
stats: Dict[str, Any],
|
|
55
|
-
|
|
56
|
-
) ->
|
|
60
|
+
snapshot_artifacts: Optional[List[str]] = None,
|
|
61
|
+
) -> RetrievalSnapshot:
|
|
57
62
|
"""
|
|
58
|
-
Create a retrieval
|
|
63
|
+
Create a retrieval snapshot manifest tied to the current catalog snapshot.
|
|
59
64
|
|
|
60
|
-
:param corpus: Corpus used to generate the
|
|
65
|
+
:param corpus: Corpus used to generate the snapshot.
|
|
61
66
|
:type corpus: Corpus
|
|
62
|
-
:param
|
|
63
|
-
:type
|
|
64
|
-
:param stats:
|
|
67
|
+
:param configuration: Configuration manifest for the snapshot.
|
|
68
|
+
:type configuration: ConfigurationManifest
|
|
69
|
+
:param stats: Retriever-specific snapshot statistics.
|
|
65
70
|
:type stats: dict[str, Any]
|
|
66
|
-
:param
|
|
67
|
-
:type
|
|
68
|
-
:return:
|
|
69
|
-
:rtype:
|
|
71
|
+
:param snapshot_artifacts: Optional relative paths to materialized artifacts.
|
|
72
|
+
:type snapshot_artifacts: list[str] or None
|
|
73
|
+
:return: Snapshot manifest.
|
|
74
|
+
:rtype: RetrievalSnapshot
|
|
70
75
|
"""
|
|
71
76
|
catalog = corpus.load_catalog()
|
|
72
77
|
created_at = utc_now_iso()
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
78
|
+
snapshot_id = hashlib.sha256(
|
|
79
|
+
f"{configuration.configuration_id}:{created_at}".encode("utf-8")
|
|
80
|
+
).hexdigest()
|
|
81
|
+
return RetrievalSnapshot(
|
|
82
|
+
snapshot_id=snapshot_id,
|
|
83
|
+
configuration=configuration,
|
|
77
84
|
corpus_uri=catalog.corpus_uri,
|
|
78
85
|
catalog_generated_at=catalog.generated_at,
|
|
79
86
|
created_at=created_at,
|
|
80
|
-
|
|
87
|
+
snapshot_artifacts=list(snapshot_artifacts or []),
|
|
81
88
|
stats=stats,
|
|
82
89
|
)
|
|
83
90
|
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Retriever registry for Biblicus retrieval engines.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, Type
|
|
8
|
+
|
|
9
|
+
from .base import Retriever
|
|
10
|
+
from .embedding_index_file import EmbeddingIndexFileRetriever
|
|
11
|
+
from .embedding_index_inmemory import EmbeddingIndexInMemoryRetriever
|
|
12
|
+
from .hybrid import HybridRetriever
|
|
13
|
+
from .scan import ScanRetriever
|
|
14
|
+
from .sqlite_full_text_search import SqliteFullTextSearchRetriever
|
|
15
|
+
from .tf_vector import TfVectorRetriever
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def available_retrievers() -> Dict[str, Type[Retriever]]:
|
|
19
|
+
"""
|
|
20
|
+
Return the registered retrievers.
|
|
21
|
+
|
|
22
|
+
:return: Mapping of retriever identifiers to retriever classes.
|
|
23
|
+
:rtype: dict[str, Type[Retriever]]
|
|
24
|
+
"""
|
|
25
|
+
return {
|
|
26
|
+
EmbeddingIndexFileRetriever.retriever_id: EmbeddingIndexFileRetriever,
|
|
27
|
+
EmbeddingIndexInMemoryRetriever.retriever_id: EmbeddingIndexInMemoryRetriever,
|
|
28
|
+
HybridRetriever.retriever_id: HybridRetriever,
|
|
29
|
+
ScanRetriever.retriever_id: ScanRetriever,
|
|
30
|
+
SqliteFullTextSearchRetriever.retriever_id: SqliteFullTextSearchRetriever,
|
|
31
|
+
TfVectorRetriever.retriever_id: TfVectorRetriever,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_retriever(retriever_id: str) -> Retriever:
|
|
36
|
+
"""
|
|
37
|
+
Instantiate a retriever by identifier.
|
|
38
|
+
|
|
39
|
+
:param retriever_id: Retriever identifier.
|
|
40
|
+
:type retriever_id: str
|
|
41
|
+
:return: Retriever instance.
|
|
42
|
+
:rtype: Retriever
|
|
43
|
+
:raises KeyError: If the retriever identifier is unknown.
|
|
44
|
+
"""
|
|
45
|
+
registry = available_retrievers()
|
|
46
|
+
retriever_class = registry.get(retriever_id)
|
|
47
|
+
if retriever_class is None:
|
|
48
|
+
known = ", ".join(sorted(registry))
|
|
49
|
+
raise KeyError(f"Unknown retriever '{retriever_id}'. Known retrievers: {known}")
|
|
50
|
+
return retriever_class()
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Retriever interface for Biblicus retrieval engines.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Dict
|
|
9
|
+
|
|
10
|
+
from ..corpus import Corpus
|
|
11
|
+
from ..models import QueryBudget, RetrievalResult, RetrievalSnapshot
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Retriever(ABC):
|
|
15
|
+
"""
|
|
16
|
+
Abstract interface for retrievers.
|
|
17
|
+
|
|
18
|
+
:ivar retriever_id: Identifier string for the retriever.
|
|
19
|
+
:vartype retriever_id: str
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
retriever_id: str
|
|
23
|
+
|
|
24
|
+
@abstractmethod
|
|
25
|
+
def build_snapshot(
|
|
26
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
27
|
+
) -> RetrievalSnapshot:
|
|
28
|
+
"""
|
|
29
|
+
Build or register a retrieval snapshot for the retriever.
|
|
30
|
+
|
|
31
|
+
:param corpus: Corpus to build against.
|
|
32
|
+
:type corpus: Corpus
|
|
33
|
+
:param configuration_name: Human name for the configuration.
|
|
34
|
+
:type configuration_name: str
|
|
35
|
+
:param configuration: Retriever-specific configuration values.
|
|
36
|
+
:type configuration: dict[str, object]
|
|
37
|
+
:return: Snapshot manifest describing the build.
|
|
38
|
+
:rtype: RetrievalSnapshot
|
|
39
|
+
"""
|
|
40
|
+
raise NotImplementedError
|
|
41
|
+
|
|
42
|
+
@abstractmethod
|
|
43
|
+
def query(
|
|
44
|
+
self,
|
|
45
|
+
corpus: Corpus,
|
|
46
|
+
*,
|
|
47
|
+
snapshot: RetrievalSnapshot,
|
|
48
|
+
query_text: str,
|
|
49
|
+
budget: QueryBudget,
|
|
50
|
+
) -> RetrievalResult:
|
|
51
|
+
"""
|
|
52
|
+
Run a retrieval query against a retriever.
|
|
53
|
+
|
|
54
|
+
:param corpus: Corpus associated with the snapshot.
|
|
55
|
+
:type corpus: Corpus
|
|
56
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
57
|
+
:type snapshot: RetrievalSnapshot
|
|
58
|
+
:param query_text: Query text to execute.
|
|
59
|
+
:type query_text: str
|
|
60
|
+
:param budget: Evidence selection budget.
|
|
61
|
+
:type budget: QueryBudget
|
|
62
|
+
:return: Retrieval results containing evidence.
|
|
63
|
+
:rtype: RetrievalResult
|
|
64
|
+
"""
|
|
65
|
+
raise NotImplementedError
|