biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/knowledge_base.py
CHANGED
|
@@ -11,7 +11,6 @@ from typing import List, Optional, Sequence
|
|
|
11
11
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
|
-
from .backends import get_backend
|
|
15
14
|
from .context import (
|
|
16
15
|
ContextPack,
|
|
17
16
|
ContextPackPolicy,
|
|
@@ -20,17 +19,18 @@ from .context import (
|
|
|
20
19
|
fit_context_pack_to_token_budget,
|
|
21
20
|
)
|
|
22
21
|
from .corpus import Corpus
|
|
23
|
-
from .models import QueryBudget, RetrievalResult,
|
|
22
|
+
from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
|
|
23
|
+
from .retrievers import get_retriever
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class KnowledgeBaseDefaults(BaseModel):
|
|
27
27
|
"""
|
|
28
28
|
Default configuration for a knowledge base workflow.
|
|
29
29
|
|
|
30
|
-
:ivar
|
|
31
|
-
:vartype
|
|
32
|
-
:ivar
|
|
33
|
-
:vartype
|
|
30
|
+
:ivar retriever_id: Retriever identifier to use for retrieval.
|
|
31
|
+
:vartype retriever_id: str
|
|
32
|
+
:ivar configuration_name: Human-readable retrieval configuration name.
|
|
33
|
+
:vartype configuration_name: str
|
|
34
34
|
:ivar query_budget: Default query budget to apply to retrieval.
|
|
35
35
|
:vartype query_budget: QueryBudget
|
|
36
36
|
:ivar tags: Tags to apply when importing the folder.
|
|
@@ -39,12 +39,12 @@ class KnowledgeBaseDefaults(BaseModel):
|
|
|
39
39
|
|
|
40
40
|
model_config = ConfigDict(extra="forbid")
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
|
|
42
|
+
retriever_id: str = Field(default="scan", min_length=1)
|
|
43
|
+
configuration_name: str = Field(default="Knowledge base", min_length=1)
|
|
44
44
|
query_budget: QueryBudget = Field(
|
|
45
45
|
default_factory=lambda: QueryBudget(
|
|
46
46
|
max_total_items=5,
|
|
47
|
-
|
|
47
|
+
maximum_total_characters=2000,
|
|
48
48
|
max_items_per_source=None,
|
|
49
49
|
)
|
|
50
50
|
)
|
|
@@ -58,17 +58,17 @@ class KnowledgeBase:
|
|
|
58
58
|
|
|
59
59
|
:ivar corpus: Corpus instance that stores the ingested items.
|
|
60
60
|
:vartype corpus: Corpus
|
|
61
|
-
:ivar
|
|
62
|
-
:vartype
|
|
63
|
-
:ivar
|
|
64
|
-
:vartype
|
|
61
|
+
:ivar retriever_id: Retriever identifier used for retrieval.
|
|
62
|
+
:vartype retriever_id: str
|
|
63
|
+
:ivar snapshot: Retrieval snapshot manifest associated with the knowledge base.
|
|
64
|
+
:vartype snapshot: RetrievalSnapshot
|
|
65
65
|
:ivar defaults: Default configuration used for this knowledge base.
|
|
66
66
|
:vartype defaults: KnowledgeBaseDefaults
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
69
|
corpus: Corpus
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
retriever_id: str
|
|
71
|
+
snapshot: RetrievalSnapshot
|
|
72
72
|
defaults: KnowledgeBaseDefaults
|
|
73
73
|
_temp_dir: Optional[TemporaryDirectory]
|
|
74
74
|
|
|
@@ -77,8 +77,8 @@ class KnowledgeBase:
|
|
|
77
77
|
cls,
|
|
78
78
|
folder: str | Path,
|
|
79
79
|
*,
|
|
80
|
-
|
|
81
|
-
|
|
80
|
+
retriever_id: Optional[str] = None,
|
|
81
|
+
configuration_name: Optional[str] = None,
|
|
82
82
|
query_budget: Optional[QueryBudget] = None,
|
|
83
83
|
tags: Optional[Sequence[str]] = None,
|
|
84
84
|
corpus_root: Optional[str | Path] = None,
|
|
@@ -88,10 +88,10 @@ class KnowledgeBase:
|
|
|
88
88
|
|
|
89
89
|
:param folder: Folder containing source files.
|
|
90
90
|
:type folder: str or Path
|
|
91
|
-
:param
|
|
92
|
-
:type
|
|
93
|
-
:param
|
|
94
|
-
:type
|
|
91
|
+
:param retriever_id: Optional retriever identifier override.
|
|
92
|
+
:type retriever_id: str or None
|
|
93
|
+
:param configuration_name: Optional configuration name override.
|
|
94
|
+
:type configuration_name: str or None
|
|
95
95
|
:param query_budget: Optional query budget override.
|
|
96
96
|
:type query_budget: QueryBudget or None
|
|
97
97
|
:param tags: Optional tags to apply during import.
|
|
@@ -110,8 +110,8 @@ class KnowledgeBase:
|
|
|
110
110
|
raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")
|
|
111
111
|
|
|
112
112
|
defaults = KnowledgeBaseDefaults()
|
|
113
|
-
|
|
114
|
-
|
|
113
|
+
resolved_retriever_id = retriever_id or defaults.retriever_id
|
|
114
|
+
resolved_configuration_name = configuration_name or defaults.configuration_name
|
|
115
115
|
resolved_query_budget = query_budget or defaults.query_budget
|
|
116
116
|
resolved_tags = list(tags) if tags is not None else defaults.tags
|
|
117
117
|
|
|
@@ -125,16 +125,18 @@ class KnowledgeBase:
|
|
|
125
125
|
corpus = Corpus.init(corpus_root_path)
|
|
126
126
|
corpus.import_tree(source_root, tags=resolved_tags)
|
|
127
127
|
|
|
128
|
-
|
|
129
|
-
|
|
128
|
+
retriever = get_retriever(resolved_retriever_id)
|
|
129
|
+
snapshot = retriever.build_snapshot(
|
|
130
|
+
corpus, configuration_name=resolved_configuration_name, configuration={}
|
|
131
|
+
)
|
|
130
132
|
|
|
131
133
|
return cls(
|
|
132
134
|
corpus=corpus,
|
|
133
|
-
|
|
134
|
-
|
|
135
|
+
retriever_id=resolved_retriever_id,
|
|
136
|
+
snapshot=snapshot,
|
|
135
137
|
defaults=KnowledgeBaseDefaults(
|
|
136
|
-
|
|
137
|
-
|
|
138
|
+
retriever_id=resolved_retriever_id,
|
|
139
|
+
configuration_name=resolved_configuration_name,
|
|
138
140
|
query_budget=resolved_query_budget,
|
|
139
141
|
tags=resolved_tags,
|
|
140
142
|
),
|
|
@@ -152,11 +154,11 @@ class KnowledgeBase:
|
|
|
152
154
|
:return: Retrieval result containing evidence.
|
|
153
155
|
:rtype: RetrievalResult
|
|
154
156
|
"""
|
|
155
|
-
|
|
157
|
+
retriever = get_retriever(self.retriever_id)
|
|
156
158
|
resolved_budget = budget or self.defaults.query_budget
|
|
157
|
-
return
|
|
159
|
+
return retriever.query(
|
|
158
160
|
self.corpus,
|
|
159
|
-
|
|
161
|
+
snapshot=self.snapshot,
|
|
160
162
|
query_text=query_text,
|
|
161
163
|
budget=resolved_budget,
|
|
162
164
|
)
|
biblicus/models.py
CHANGED
|
@@ -117,8 +117,8 @@ class CorpusCatalog(BaseModel):
|
|
|
117
117
|
:vartype corpus_uri: str
|
|
118
118
|
:ivar raw_dir: Relative path to the raw items folder.
|
|
119
119
|
:vartype raw_dir: str
|
|
120
|
-
:ivar
|
|
121
|
-
:vartype
|
|
120
|
+
:ivar latest_snapshot_id: Latest retrieval snapshot identifier, if any.
|
|
121
|
+
:vartype latest_snapshot_id: str or None
|
|
122
122
|
:ivar items: Mapping of item IDs to catalog entries.
|
|
123
123
|
:vartype items: dict[str, CatalogItem]
|
|
124
124
|
:ivar order: Display order of item IDs (most recent first).
|
|
@@ -131,7 +131,7 @@ class CorpusCatalog(BaseModel):
|
|
|
131
131
|
generated_at: str
|
|
132
132
|
corpus_uri: str
|
|
133
133
|
raw_dir: str = "raw"
|
|
134
|
-
|
|
134
|
+
latest_snapshot_id: Optional[str] = None
|
|
135
135
|
items: Dict[str, CatalogItem] = Field(default_factory=dict)
|
|
136
136
|
order: List[str] = Field(default_factory=list)
|
|
137
137
|
|
|
@@ -142,79 +142,79 @@ class CorpusCatalog(BaseModel):
|
|
|
142
142
|
return self
|
|
143
143
|
|
|
144
144
|
|
|
145
|
-
class
|
|
145
|
+
class ExtractionSnapshotReference(BaseModel):
|
|
146
146
|
"""
|
|
147
|
-
Reference to an extraction
|
|
147
|
+
Reference to an extraction snapshot.
|
|
148
148
|
|
|
149
149
|
:ivar extractor_id: Extractor plugin identifier.
|
|
150
150
|
:vartype extractor_id: str
|
|
151
|
-
:ivar
|
|
152
|
-
:vartype
|
|
151
|
+
:ivar snapshot_id: Extraction snapshot identifier.
|
|
152
|
+
:vartype snapshot_id: str
|
|
153
153
|
"""
|
|
154
154
|
|
|
155
155
|
model_config = ConfigDict(extra="forbid")
|
|
156
156
|
|
|
157
157
|
extractor_id: str = Field(min_length=1)
|
|
158
|
-
|
|
158
|
+
snapshot_id: str = Field(min_length=1)
|
|
159
159
|
|
|
160
160
|
def as_string(self) -> str:
|
|
161
161
|
"""
|
|
162
162
|
Serialize the reference as a single string.
|
|
163
163
|
|
|
164
|
-
:return: Reference in the form extractor_id:
|
|
164
|
+
:return: Reference in the form extractor_id:snapshot_id.
|
|
165
165
|
:rtype: str
|
|
166
166
|
"""
|
|
167
|
-
return f"{self.extractor_id}:{self.
|
|
167
|
+
return f"{self.extractor_id}:{self.snapshot_id}"
|
|
168
168
|
|
|
169
169
|
|
|
170
|
-
def
|
|
170
|
+
def parse_extraction_snapshot_reference(value: str) -> ExtractionSnapshotReference:
|
|
171
171
|
"""
|
|
172
|
-
Parse an extraction
|
|
172
|
+
Parse an extraction snapshot reference in the form extractor_id:snapshot_id.
|
|
173
173
|
|
|
174
174
|
:param value: Raw reference string.
|
|
175
175
|
:type value: str
|
|
176
|
-
:return: Parsed extraction
|
|
177
|
-
:rtype:
|
|
176
|
+
:return: Parsed extraction snapshot reference.
|
|
177
|
+
:rtype: ExtractionSnapshotReference
|
|
178
178
|
:raises ValueError: If the reference is not well formed.
|
|
179
179
|
"""
|
|
180
180
|
if ":" not in value:
|
|
181
|
-
raise ValueError("Extraction
|
|
182
|
-
extractor_id,
|
|
181
|
+
raise ValueError("Extraction snapshot reference must be extractor_id:snapshot_id")
|
|
182
|
+
extractor_id, snapshot_id = value.split(":", 1)
|
|
183
183
|
extractor_id = extractor_id.strip()
|
|
184
|
-
|
|
185
|
-
if not extractor_id or not
|
|
184
|
+
snapshot_id = snapshot_id.strip()
|
|
185
|
+
if not extractor_id or not snapshot_id:
|
|
186
186
|
raise ValueError(
|
|
187
|
-
"Extraction
|
|
187
|
+
"Extraction snapshot reference must be extractor_id:snapshot_id with non-empty parts"
|
|
188
188
|
)
|
|
189
|
-
return
|
|
189
|
+
return ExtractionSnapshotReference(extractor_id=extractor_id, snapshot_id=snapshot_id)
|
|
190
190
|
|
|
191
191
|
|
|
192
|
-
class
|
|
192
|
+
class ExtractionSnapshotListEntry(BaseModel):
|
|
193
193
|
"""
|
|
194
|
-
Summary entry for an extraction
|
|
194
|
+
Summary entry for an extraction snapshot stored in a corpus.
|
|
195
195
|
|
|
196
196
|
:ivar extractor_id: Extractor plugin identifier.
|
|
197
197
|
:vartype extractor_id: str
|
|
198
|
-
:ivar
|
|
199
|
-
:vartype
|
|
200
|
-
:ivar
|
|
201
|
-
:vartype
|
|
202
|
-
:ivar
|
|
203
|
-
:vartype
|
|
204
|
-
:ivar catalog_generated_at: Catalog timestamp used for the
|
|
198
|
+
:ivar snapshot_id: Extraction snapshot identifier.
|
|
199
|
+
:vartype snapshot_id: str
|
|
200
|
+
:ivar configuration_id: Deterministic configuration identifier.
|
|
201
|
+
:vartype configuration_id: str
|
|
202
|
+
:ivar configuration_name: Human-readable configuration name.
|
|
203
|
+
:vartype configuration_name: str
|
|
204
|
+
:ivar catalog_generated_at: Catalog timestamp used for the snapshot.
|
|
205
205
|
:vartype catalog_generated_at: str
|
|
206
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
206
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
|
|
207
207
|
:vartype created_at: str
|
|
208
|
-
:ivar stats:
|
|
208
|
+
:ivar stats: Snapshot statistics.
|
|
209
209
|
:vartype stats: dict[str, object]
|
|
210
210
|
"""
|
|
211
211
|
|
|
212
212
|
model_config = ConfigDict(extra="forbid")
|
|
213
213
|
|
|
214
214
|
extractor_id: str = Field(min_length=1)
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
215
|
+
snapshot_id: str = Field(min_length=1)
|
|
216
|
+
configuration_id: str = Field(min_length=1)
|
|
217
|
+
configuration_name: str = Field(min_length=1)
|
|
218
218
|
catalog_generated_at: str = Field(min_length=1)
|
|
219
219
|
created_at: str = Field(min_length=1)
|
|
220
220
|
stats: Dict[str, object] = Field(default_factory=dict)
|
|
@@ -234,8 +234,8 @@ class QueryBudget(BaseModel):
|
|
|
234
234
|
This enables simple pagination by re-running the same query with a
|
|
235
235
|
higher offset.
|
|
236
236
|
:vartype offset: int
|
|
237
|
-
:ivar
|
|
238
|
-
:vartype
|
|
237
|
+
:ivar maximum_total_characters: Optional maximum total characters across evidence text.
|
|
238
|
+
:vartype maximum_total_characters: int or None
|
|
239
239
|
:ivar max_items_per_source: Optional cap per source uniform resource identifier.
|
|
240
240
|
:vartype max_items_per_source: int or None
|
|
241
241
|
"""
|
|
@@ -244,13 +244,13 @@ class QueryBudget(BaseModel):
|
|
|
244
244
|
|
|
245
245
|
max_total_items: int = Field(ge=1)
|
|
246
246
|
offset: int = Field(default=0, ge=0)
|
|
247
|
-
|
|
247
|
+
maximum_total_characters: Optional[int] = Field(default=None, ge=1)
|
|
248
248
|
max_items_per_source: Optional[int] = Field(default=None, ge=1)
|
|
249
249
|
|
|
250
250
|
|
|
251
251
|
class Evidence(BaseModel):
|
|
252
252
|
"""
|
|
253
|
-
Structured retrieval evidence returned from a
|
|
253
|
+
Structured retrieval evidence returned from a retriever.
|
|
254
254
|
|
|
255
255
|
:ivar item_id: Item identifier that produced the evidence.
|
|
256
256
|
:vartype item_id: str
|
|
@@ -274,10 +274,12 @@ class Evidence(BaseModel):
|
|
|
274
274
|
:vartype stage: str
|
|
275
275
|
:ivar stage_scores: Optional per-stage scores for multi-stage retrieval.
|
|
276
276
|
:vartype stage_scores: dict[str, float] or None
|
|
277
|
-
:ivar
|
|
278
|
-
:vartype
|
|
279
|
-
:ivar
|
|
280
|
-
:vartype
|
|
277
|
+
:ivar configuration_id: Configuration identifier used to create the snapshot.
|
|
278
|
+
:vartype configuration_id: str
|
|
279
|
+
:ivar snapshot_id: Retrieval snapshot identifier.
|
|
280
|
+
:vartype snapshot_id: str
|
|
281
|
+
:ivar metadata: Optional metadata payload from the catalog item.
|
|
282
|
+
:vartype metadata: dict[str, Any]
|
|
281
283
|
:ivar hash: Optional content hash for provenance.
|
|
282
284
|
:vartype hash: str or None
|
|
283
285
|
"""
|
|
@@ -295,8 +297,9 @@ class Evidence(BaseModel):
|
|
|
295
297
|
span_end: Optional[int] = None
|
|
296
298
|
stage: str
|
|
297
299
|
stage_scores: Optional[Dict[str, float]] = None
|
|
298
|
-
|
|
299
|
-
|
|
300
|
+
configuration_id: str
|
|
301
|
+
snapshot_id: str
|
|
302
|
+
metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
300
303
|
hash: Optional[str] = None
|
|
301
304
|
|
|
302
305
|
@model_validator(mode="after")
|
|
@@ -308,79 +311,79 @@ class Evidence(BaseModel):
|
|
|
308
311
|
return self
|
|
309
312
|
|
|
310
313
|
|
|
311
|
-
class
|
|
314
|
+
class ConfigurationManifest(BaseModel):
|
|
312
315
|
"""
|
|
313
|
-
Reproducible configuration for a
|
|
316
|
+
Reproducible configuration for a retriever.
|
|
314
317
|
|
|
315
|
-
:ivar
|
|
316
|
-
:vartype
|
|
317
|
-
:ivar
|
|
318
|
-
:vartype
|
|
319
|
-
:ivar name: Human-readable name for the
|
|
318
|
+
:ivar configuration_id: Deterministic configuration identifier.
|
|
319
|
+
:vartype configuration_id: str
|
|
320
|
+
:ivar retriever_id: Retriever identifier for the configuration.
|
|
321
|
+
:vartype retriever_id: str
|
|
322
|
+
:ivar name: Human-readable name for the configuration.
|
|
320
323
|
:vartype name: str
|
|
321
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
324
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
|
|
322
325
|
:vartype created_at: str
|
|
323
|
-
:ivar
|
|
324
|
-
:vartype
|
|
326
|
+
:ivar configuration: Retriever-specific configuration values.
|
|
327
|
+
:vartype configuration: dict[str, Any]
|
|
325
328
|
:ivar description: Optional human description.
|
|
326
329
|
:vartype description: str or None
|
|
327
330
|
"""
|
|
328
331
|
|
|
329
332
|
model_config = ConfigDict(extra="forbid")
|
|
330
333
|
|
|
331
|
-
|
|
332
|
-
|
|
334
|
+
configuration_id: str
|
|
335
|
+
retriever_id: str
|
|
333
336
|
name: str
|
|
334
337
|
created_at: str
|
|
335
|
-
|
|
338
|
+
configuration: Dict[str, Any] = Field(default_factory=dict)
|
|
336
339
|
description: Optional[str] = None
|
|
337
340
|
|
|
338
341
|
|
|
339
|
-
class
|
|
342
|
+
class RetrievalSnapshot(BaseModel):
|
|
340
343
|
"""
|
|
341
|
-
Immutable record of a retrieval
|
|
344
|
+
Immutable record of a retrieval snapshot.
|
|
342
345
|
|
|
343
|
-
:ivar
|
|
344
|
-
:vartype
|
|
345
|
-
:ivar
|
|
346
|
-
:vartype
|
|
346
|
+
:ivar snapshot_id: Unique snapshot identifier.
|
|
347
|
+
:vartype snapshot_id: str
|
|
348
|
+
:ivar configuration: Configuration manifest for this snapshot.
|
|
349
|
+
:vartype configuration: ConfigurationManifest
|
|
347
350
|
:ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
|
|
348
351
|
:vartype corpus_uri: str
|
|
349
|
-
:ivar catalog_generated_at: Catalog timestamp used for the
|
|
352
|
+
:ivar catalog_generated_at: Catalog timestamp used for the snapshot.
|
|
350
353
|
:vartype catalog_generated_at: str
|
|
351
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
354
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for snapshot creation.
|
|
352
355
|
:vartype created_at: str
|
|
353
|
-
:ivar
|
|
354
|
-
:vartype
|
|
355
|
-
:ivar stats:
|
|
356
|
+
:ivar snapshot_artifacts: Relative paths to materialized artifacts.
|
|
357
|
+
:vartype snapshot_artifacts: list[str]
|
|
358
|
+
:ivar stats: Retriever-specific snapshot statistics.
|
|
356
359
|
:vartype stats: dict[str, Any]
|
|
357
360
|
"""
|
|
358
361
|
|
|
359
362
|
model_config = ConfigDict(extra="forbid")
|
|
360
363
|
|
|
361
|
-
|
|
362
|
-
|
|
364
|
+
snapshot_id: str
|
|
365
|
+
configuration: ConfigurationManifest
|
|
363
366
|
corpus_uri: str
|
|
364
367
|
catalog_generated_at: str
|
|
365
368
|
created_at: str
|
|
366
|
-
|
|
369
|
+
snapshot_artifacts: List[str] = Field(default_factory=list)
|
|
367
370
|
stats: Dict[str, Any] = Field(default_factory=dict)
|
|
368
371
|
|
|
369
372
|
|
|
370
373
|
class RetrievalResult(BaseModel):
|
|
371
374
|
"""
|
|
372
|
-
Retrieval result bundle returned from a
|
|
375
|
+
Retrieval result bundle returned from a retriever query.
|
|
373
376
|
|
|
374
377
|
:ivar query_text: Query text issued against the backend.
|
|
375
378
|
:vartype query_text: str
|
|
376
379
|
:ivar budget: Evidence selection budget applied to results.
|
|
377
380
|
:vartype budget: QueryBudget
|
|
378
|
-
:ivar
|
|
379
|
-
:vartype
|
|
380
|
-
:ivar
|
|
381
|
-
:vartype
|
|
382
|
-
:ivar
|
|
383
|
-
:vartype
|
|
381
|
+
:ivar snapshot_id: Retrieval snapshot identifier.
|
|
382
|
+
:vartype snapshot_id: str
|
|
383
|
+
:ivar configuration_id: Configuration identifier used for this query.
|
|
384
|
+
:vartype configuration_id: str
|
|
385
|
+
:ivar retriever_id: Retriever identifier used for this query.
|
|
386
|
+
:vartype retriever_id: str
|
|
384
387
|
:ivar generated_at: International Organization for Standardization 8601 timestamp for the query result.
|
|
385
388
|
:vartype generated_at: str
|
|
386
389
|
:ivar evidence: Evidence objects selected under the budget.
|
|
@@ -393,9 +396,9 @@ class RetrievalResult(BaseModel):
|
|
|
393
396
|
|
|
394
397
|
query_text: str
|
|
395
398
|
budget: QueryBudget
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
+
snapshot_id: str
|
|
400
|
+
configuration_id: str
|
|
401
|
+
retriever_id: str
|
|
399
402
|
generated_at: str
|
|
400
403
|
evidence: List[Evidence] = Field(default_factory=list)
|
|
401
404
|
stats: Dict[str, Any] = Field(default_factory=dict)
|
biblicus/retrieval.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Shared retrieval helpers for Biblicus
|
|
2
|
+
Shared retrieval helpers for Biblicus retrievers.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -9,75 +9,82 @@ import json
|
|
|
9
9
|
from typing import Any, Dict, Iterable, List, Optional
|
|
10
10
|
|
|
11
11
|
from .corpus import Corpus
|
|
12
|
-
from .models import
|
|
12
|
+
from .models import (
|
|
13
|
+
ConfigurationManifest,
|
|
14
|
+
Evidence,
|
|
15
|
+
QueryBudget,
|
|
16
|
+
RetrievalSnapshot,
|
|
17
|
+
)
|
|
13
18
|
from .time import utc_now_iso
|
|
14
19
|
|
|
15
20
|
|
|
16
|
-
def
|
|
21
|
+
def create_configuration_manifest(
|
|
17
22
|
*,
|
|
18
|
-
|
|
23
|
+
retriever_id: str,
|
|
19
24
|
name: str,
|
|
20
|
-
|
|
25
|
+
configuration: Dict[str, Any],
|
|
21
26
|
description: Optional[str] = None,
|
|
22
|
-
) ->
|
|
27
|
+
) -> ConfigurationManifest:
|
|
23
28
|
"""
|
|
24
|
-
Create a deterministic
|
|
29
|
+
Create a deterministic configuration manifest from a retriever configuration.
|
|
25
30
|
|
|
26
|
-
:param
|
|
27
|
-
:type
|
|
28
|
-
:param name: Human-readable
|
|
31
|
+
:param retriever_id: Retriever identifier for the configuration.
|
|
32
|
+
:type retriever_id: str
|
|
33
|
+
:param name: Human-readable configuration name.
|
|
29
34
|
:type name: str
|
|
30
|
-
:param
|
|
31
|
-
:type
|
|
32
|
-
:param description: Optional
|
|
35
|
+
:param configuration: Retriever-specific configuration values.
|
|
36
|
+
:type configuration: dict[str, Any]
|
|
37
|
+
:param description: Optional configuration description.
|
|
33
38
|
:type description: str or None
|
|
34
|
-
:return: Deterministic
|
|
35
|
-
:rtype:
|
|
39
|
+
:return: Deterministic configuration manifest.
|
|
40
|
+
:rtype: ConfigurationManifest
|
|
36
41
|
"""
|
|
37
|
-
config_json = json.dumps(
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
return
|
|
41
|
-
|
|
42
|
-
|
|
42
|
+
config_json = json.dumps(configuration, sort_keys=True, separators=(",", ":"))
|
|
43
|
+
configuration_seed = f"{retriever_id}:{config_json}"
|
|
44
|
+
configuration_id = hashlib.sha256(configuration_seed.encode("utf-8")).hexdigest()
|
|
45
|
+
return ConfigurationManifest(
|
|
46
|
+
configuration_id=configuration_id,
|
|
47
|
+
retriever_id=retriever_id,
|
|
43
48
|
name=name,
|
|
44
49
|
created_at=utc_now_iso(),
|
|
45
|
-
|
|
50
|
+
configuration=configuration,
|
|
46
51
|
description=description,
|
|
47
52
|
)
|
|
48
53
|
|
|
49
54
|
|
|
50
|
-
def
|
|
55
|
+
def create_snapshot_manifest(
|
|
51
56
|
corpus: Corpus,
|
|
52
57
|
*,
|
|
53
|
-
|
|
58
|
+
configuration: ConfigurationManifest,
|
|
54
59
|
stats: Dict[str, Any],
|
|
55
|
-
|
|
56
|
-
) ->
|
|
60
|
+
snapshot_artifacts: Optional[List[str]] = None,
|
|
61
|
+
) -> RetrievalSnapshot:
|
|
57
62
|
"""
|
|
58
|
-
Create a retrieval
|
|
63
|
+
Create a retrieval snapshot manifest tied to the current catalog snapshot.
|
|
59
64
|
|
|
60
|
-
:param corpus: Corpus used to generate the
|
|
65
|
+
:param corpus: Corpus used to generate the snapshot.
|
|
61
66
|
:type corpus: Corpus
|
|
62
|
-
:param
|
|
63
|
-
:type
|
|
64
|
-
:param stats:
|
|
67
|
+
:param configuration: Configuration manifest for the snapshot.
|
|
68
|
+
:type configuration: ConfigurationManifest
|
|
69
|
+
:param stats: Retriever-specific snapshot statistics.
|
|
65
70
|
:type stats: dict[str, Any]
|
|
66
|
-
:param
|
|
67
|
-
:type
|
|
68
|
-
:return:
|
|
69
|
-
:rtype:
|
|
71
|
+
:param snapshot_artifacts: Optional relative paths to materialized artifacts.
|
|
72
|
+
:type snapshot_artifacts: list[str] or None
|
|
73
|
+
:return: Snapshot manifest.
|
|
74
|
+
:rtype: RetrievalSnapshot
|
|
70
75
|
"""
|
|
71
76
|
catalog = corpus.load_catalog()
|
|
72
77
|
created_at = utc_now_iso()
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
78
|
+
snapshot_id = hashlib.sha256(
|
|
79
|
+
f"{configuration.configuration_id}:{created_at}".encode("utf-8")
|
|
80
|
+
).hexdigest()
|
|
81
|
+
return RetrievalSnapshot(
|
|
82
|
+
snapshot_id=snapshot_id,
|
|
83
|
+
configuration=configuration,
|
|
77
84
|
corpus_uri=catalog.corpus_uri,
|
|
78
85
|
catalog_generated_at=catalog.generated_at,
|
|
79
86
|
created_at=created_at,
|
|
80
|
-
|
|
87
|
+
snapshot_artifacts=list(snapshot_artifacts or []),
|
|
81
88
|
stats=stats,
|
|
82
89
|
)
|
|
83
90
|
|
|
@@ -124,8 +131,8 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
|
|
|
124
131
|
continue
|
|
125
132
|
|
|
126
133
|
text_character_count = len(candidate_evidence.text or "")
|
|
127
|
-
if budget.
|
|
128
|
-
if total_characters + text_character_count > budget.
|
|
134
|
+
if budget.maximum_total_characters is not None:
|
|
135
|
+
if total_characters + text_character_count > budget.maximum_total_characters:
|
|
129
136
|
continue
|
|
130
137
|
|
|
131
138
|
selected_evidence.append(candidate_evidence)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Retriever registry for Biblicus retrieval engines.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict, Type
|
|
8
|
+
|
|
9
|
+
from .base import Retriever
|
|
10
|
+
from .embedding_index_file import EmbeddingIndexFileRetriever
|
|
11
|
+
from .embedding_index_inmemory import EmbeddingIndexInMemoryRetriever
|
|
12
|
+
from .hybrid import HybridRetriever
|
|
13
|
+
from .scan import ScanRetriever
|
|
14
|
+
from .sqlite_full_text_search import SqliteFullTextSearchRetriever
|
|
15
|
+
from .tf_vector import TfVectorRetriever
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def available_retrievers() -> Dict[str, Type[Retriever]]:
|
|
19
|
+
"""
|
|
20
|
+
Return the registered retrievers.
|
|
21
|
+
|
|
22
|
+
:return: Mapping of retriever identifiers to retriever classes.
|
|
23
|
+
:rtype: dict[str, Type[Retriever]]
|
|
24
|
+
"""
|
|
25
|
+
return {
|
|
26
|
+
EmbeddingIndexFileRetriever.retriever_id: EmbeddingIndexFileRetriever,
|
|
27
|
+
EmbeddingIndexInMemoryRetriever.retriever_id: EmbeddingIndexInMemoryRetriever,
|
|
28
|
+
HybridRetriever.retriever_id: HybridRetriever,
|
|
29
|
+
ScanRetriever.retriever_id: ScanRetriever,
|
|
30
|
+
SqliteFullTextSearchRetriever.retriever_id: SqliteFullTextSearchRetriever,
|
|
31
|
+
TfVectorRetriever.retriever_id: TfVectorRetriever,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_retriever(retriever_id: str) -> Retriever:
|
|
36
|
+
"""
|
|
37
|
+
Instantiate a retriever by identifier.
|
|
38
|
+
|
|
39
|
+
:param retriever_id: Retriever identifier.
|
|
40
|
+
:type retriever_id: str
|
|
41
|
+
:return: Retriever instance.
|
|
42
|
+
:rtype: Retriever
|
|
43
|
+
:raises KeyError: If the retriever identifier is unknown.
|
|
44
|
+
"""
|
|
45
|
+
registry = available_retrievers()
|
|
46
|
+
retriever_class = registry.get(retriever_id)
|
|
47
|
+
if retriever_class is None:
|
|
48
|
+
known = ", ".join(sorted(registry))
|
|
49
|
+
raise KeyError(f"Unknown retriever '{retriever_id}'. Known retrievers: {known}")
|
|
50
|
+
return retriever_class()
|