biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Naive full-scan
|
|
2
|
+
Naive full-scan retriever.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -12,87 +12,97 @@ from ..corpus import Corpus
|
|
|
12
12
|
from ..frontmatter import parse_front_matter
|
|
13
13
|
from ..models import (
|
|
14
14
|
Evidence,
|
|
15
|
-
|
|
15
|
+
ExtractionSnapshotReference,
|
|
16
16
|
QueryBudget,
|
|
17
17
|
RetrievalResult,
|
|
18
|
-
|
|
19
|
-
|
|
18
|
+
RetrievalSnapshot,
|
|
19
|
+
parse_extraction_snapshot_reference,
|
|
20
|
+
)
|
|
21
|
+
from ..retrieval import (
|
|
22
|
+
apply_budget,
|
|
23
|
+
create_configuration_manifest,
|
|
24
|
+
create_snapshot_manifest,
|
|
25
|
+
hash_text,
|
|
20
26
|
)
|
|
21
|
-
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
22
27
|
from ..time import utc_now_iso
|
|
23
28
|
|
|
24
29
|
|
|
25
|
-
class
|
|
30
|
+
class ScanConfiguration(BaseModel):
|
|
26
31
|
"""
|
|
27
|
-
Configuration for the naive scan
|
|
32
|
+
Configuration for the naive scan retriever.
|
|
28
33
|
|
|
29
34
|
:ivar snippet_characters: Maximum characters to include in evidence snippets.
|
|
30
35
|
:vartype snippet_characters: int
|
|
31
|
-
:ivar
|
|
32
|
-
:vartype
|
|
36
|
+
:ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
|
|
37
|
+
:vartype extraction_snapshot: str or None
|
|
33
38
|
"""
|
|
34
39
|
|
|
35
40
|
model_config = ConfigDict(extra="forbid")
|
|
36
41
|
|
|
37
42
|
snippet_characters: int = Field(default=400, ge=1)
|
|
38
|
-
|
|
43
|
+
extraction_snapshot: Optional[str] = None
|
|
39
44
|
|
|
40
45
|
|
|
41
|
-
class
|
|
46
|
+
class ScanRetriever:
|
|
42
47
|
"""
|
|
43
|
-
Naive
|
|
48
|
+
Naive retriever that scans all text items at query time.
|
|
44
49
|
|
|
45
|
-
:ivar
|
|
46
|
-
:vartype
|
|
50
|
+
:ivar retriever_id: Retriever identifier.
|
|
51
|
+
:vartype retriever_id: str
|
|
47
52
|
"""
|
|
48
53
|
|
|
49
|
-
|
|
54
|
+
retriever_id = "scan"
|
|
50
55
|
|
|
51
|
-
def
|
|
52
|
-
self, corpus: Corpus, *,
|
|
53
|
-
) ->
|
|
56
|
+
def build_snapshot(
|
|
57
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
58
|
+
) -> RetrievalSnapshot:
|
|
54
59
|
"""
|
|
55
|
-
Register a scan
|
|
60
|
+
Register a scan retriever snapshot (no snapshot artifacts).
|
|
56
61
|
|
|
57
62
|
:param corpus: Corpus to build against.
|
|
58
63
|
:type corpus: Corpus
|
|
59
|
-
:param
|
|
60
|
-
:type
|
|
61
|
-
:param
|
|
62
|
-
:type
|
|
63
|
-
:return:
|
|
64
|
-
:rtype:
|
|
64
|
+
:param configuration_name: Human-readable configuration name.
|
|
65
|
+
:type configuration_name: str
|
|
66
|
+
:param configuration: Retriever-specific configuration values.
|
|
67
|
+
:type configuration: dict[str, object]
|
|
68
|
+
:return: Snapshot manifest describing the build.
|
|
69
|
+
:rtype: RetrievalSnapshot
|
|
65
70
|
"""
|
|
66
|
-
|
|
71
|
+
parsed_config = ScanConfiguration.model_validate(configuration)
|
|
67
72
|
catalog = corpus.load_catalog()
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
name=
|
|
71
|
-
|
|
73
|
+
configuration_manifest = create_configuration_manifest(
|
|
74
|
+
retriever_id=self.retriever_id,
|
|
75
|
+
name=configuration_name,
|
|
76
|
+
configuration=parsed_config.model_dump(),
|
|
72
77
|
)
|
|
73
78
|
stats = {
|
|
74
79
|
"items": len(catalog.items),
|
|
75
|
-
"text_items": _count_text_items(corpus, catalog.items.values(),
|
|
80
|
+
"text_items": _count_text_items(corpus, catalog.items.values(), parsed_config),
|
|
76
81
|
}
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
82
|
+
snapshot = create_snapshot_manifest(
|
|
83
|
+
corpus,
|
|
84
|
+
configuration=configuration_manifest,
|
|
85
|
+
stats=stats,
|
|
86
|
+
snapshot_artifacts=[],
|
|
87
|
+
)
|
|
88
|
+
corpus.write_snapshot(snapshot)
|
|
89
|
+
return snapshot
|
|
80
90
|
|
|
81
91
|
def query(
|
|
82
92
|
self,
|
|
83
93
|
corpus: Corpus,
|
|
84
94
|
*,
|
|
85
|
-
|
|
95
|
+
snapshot: RetrievalSnapshot,
|
|
86
96
|
query_text: str,
|
|
87
97
|
budget: QueryBudget,
|
|
88
98
|
) -> RetrievalResult:
|
|
89
99
|
"""
|
|
90
100
|
Query the corpus with a full scan.
|
|
91
101
|
|
|
92
|
-
:param corpus: Corpus associated with the
|
|
102
|
+
:param corpus: Corpus associated with the snapshot.
|
|
93
103
|
:type corpus: Corpus
|
|
94
|
-
:param
|
|
95
|
-
:type
|
|
104
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
105
|
+
:type snapshot: RetrievalSnapshot
|
|
96
106
|
:param query_text: Query text to execute.
|
|
97
107
|
:type query_text: str
|
|
98
108
|
:param budget: Evidence selection budget.
|
|
@@ -100,15 +110,15 @@ class ScanBackend:
|
|
|
100
110
|
:return: Retrieval results containing evidence.
|
|
101
111
|
:rtype: RetrievalResult
|
|
102
112
|
"""
|
|
103
|
-
|
|
113
|
+
parsed_config = ScanConfiguration.model_validate(snapshot.configuration.configuration)
|
|
104
114
|
catalog = corpus.load_catalog()
|
|
105
|
-
extraction_reference = _resolve_extraction_reference(corpus,
|
|
115
|
+
extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
|
|
106
116
|
query_tokens = _tokenize_query(query_text)
|
|
107
117
|
scored_candidates = _score_items(
|
|
108
118
|
corpus,
|
|
109
119
|
catalog.items.values(),
|
|
110
120
|
query_tokens,
|
|
111
|
-
|
|
121
|
+
parsed_config.snippet_characters,
|
|
112
122
|
extraction_reference=extraction_reference,
|
|
113
123
|
)
|
|
114
124
|
sorted_candidates = sorted(
|
|
@@ -119,8 +129,8 @@ class ScanBackend:
|
|
|
119
129
|
evidence_item.model_copy(
|
|
120
130
|
update={
|
|
121
131
|
"rank": index,
|
|
122
|
-
"
|
|
123
|
-
"
|
|
132
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
133
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
124
134
|
}
|
|
125
135
|
)
|
|
126
136
|
for index, evidence_item in enumerate(sorted_candidates, start=1)
|
|
@@ -130,9 +140,9 @@ class ScanBackend:
|
|
|
130
140
|
return RetrievalResult(
|
|
131
141
|
query_text=query_text,
|
|
132
142
|
budget=budget,
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
143
|
+
snapshot_id=snapshot.snapshot_id,
|
|
144
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
145
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
136
146
|
generated_at=utc_now_iso(),
|
|
137
147
|
evidence=evidence,
|
|
138
148
|
stats=stats,
|
|
@@ -140,56 +150,56 @@ class ScanBackend:
|
|
|
140
150
|
|
|
141
151
|
|
|
142
152
|
def _resolve_extraction_reference(
|
|
143
|
-
corpus: Corpus,
|
|
144
|
-
) -> Optional[
|
|
153
|
+
corpus: Corpus, configuration: ScanConfiguration
|
|
154
|
+
) -> Optional[ExtractionSnapshotReference]:
|
|
145
155
|
"""
|
|
146
|
-
Resolve an extraction
|
|
156
|
+
Resolve an extraction snapshot reference from a configuration.
|
|
147
157
|
|
|
148
|
-
:param corpus: Corpus associated with the
|
|
158
|
+
:param corpus: Corpus associated with the configuration.
|
|
149
159
|
:type corpus: Corpus
|
|
150
|
-
:param
|
|
151
|
-
:type
|
|
160
|
+
:param configuration: Parsed scan configuration.
|
|
161
|
+
:type configuration: ScanConfiguration
|
|
152
162
|
:return: Parsed extraction reference or None.
|
|
153
|
-
:rtype:
|
|
154
|
-
:raises FileNotFoundError: If an extraction
|
|
163
|
+
:rtype: ExtractionSnapshotReference or None
|
|
164
|
+
:raises FileNotFoundError: If an extraction snapshot is referenced but not present.
|
|
155
165
|
"""
|
|
156
|
-
if not
|
|
166
|
+
if not configuration.extraction_snapshot:
|
|
157
167
|
return None
|
|
158
|
-
extraction_reference =
|
|
159
|
-
|
|
168
|
+
extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
|
|
169
|
+
snapshot_dir = corpus.extraction_snapshot_dir(
|
|
160
170
|
extractor_id=extraction_reference.extractor_id,
|
|
161
|
-
|
|
171
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
162
172
|
)
|
|
163
|
-
if not
|
|
164
|
-
raise FileNotFoundError(f"Missing extraction
|
|
173
|
+
if not snapshot_dir.is_dir():
|
|
174
|
+
raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
|
|
165
175
|
return extraction_reference
|
|
166
176
|
|
|
167
177
|
|
|
168
178
|
def _count_text_items(
|
|
169
|
-
corpus: Corpus, items: Iterable[object],
|
|
179
|
+
corpus: Corpus, items: Iterable[object], configuration: ScanConfiguration
|
|
170
180
|
) -> int:
|
|
171
181
|
"""
|
|
172
182
|
Count catalog items that represent text content.
|
|
173
183
|
|
|
174
|
-
When an extraction
|
|
184
|
+
When an extraction snapshot is configured, extracted artifacts are treated as text.
|
|
175
185
|
|
|
176
186
|
:param corpus: Corpus containing the items.
|
|
177
187
|
:type corpus: Corpus
|
|
178
188
|
:param items: Catalog items to inspect.
|
|
179
189
|
:type items: Iterable[object]
|
|
180
|
-
:param
|
|
181
|
-
:type
|
|
190
|
+
:param configuration: Parsed scan configuration.
|
|
191
|
+
:type configuration: ScanConfiguration
|
|
182
192
|
:return: Number of text items.
|
|
183
193
|
:rtype: int
|
|
184
194
|
"""
|
|
185
195
|
text_item_count = 0
|
|
186
|
-
extraction_reference = _resolve_extraction_reference(corpus,
|
|
196
|
+
extraction_reference = _resolve_extraction_reference(corpus, configuration)
|
|
187
197
|
for catalog_item in items:
|
|
188
198
|
item_id = str(getattr(catalog_item, "id", ""))
|
|
189
199
|
if extraction_reference and item_id:
|
|
190
200
|
extracted_text = corpus.read_extracted_text(
|
|
191
201
|
extractor_id=extraction_reference.extractor_id,
|
|
192
|
-
|
|
202
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
193
203
|
item_id=item_id,
|
|
194
204
|
)
|
|
195
205
|
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
@@ -219,7 +229,7 @@ def _load_text_from_item(
|
|
|
219
229
|
item_id: str,
|
|
220
230
|
relpath: str,
|
|
221
231
|
media_type: str,
|
|
222
|
-
extraction_reference: Optional[
|
|
232
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
223
233
|
) -> Optional[str]:
|
|
224
234
|
"""
|
|
225
235
|
Load a text payload from a catalog item.
|
|
@@ -232,15 +242,15 @@ def _load_text_from_item(
|
|
|
232
242
|
:type relpath: str
|
|
233
243
|
:param media_type: Media type for the stored content.
|
|
234
244
|
:type media_type: str
|
|
235
|
-
:param extraction_reference: Optional extraction
|
|
236
|
-
:type extraction_reference:
|
|
245
|
+
:param extraction_reference: Optional extraction snapshot reference.
|
|
246
|
+
:type extraction_reference: ExtractionSnapshotReference or None
|
|
237
247
|
:return: Text payload or None if not decodable as text.
|
|
238
248
|
:rtype: str or None
|
|
239
249
|
"""
|
|
240
250
|
if extraction_reference:
|
|
241
251
|
extracted_text = corpus.read_extracted_text(
|
|
242
252
|
extractor_id=extraction_reference.extractor_id,
|
|
243
|
-
|
|
253
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
244
254
|
item_id=item_id,
|
|
245
255
|
)
|
|
246
256
|
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
@@ -316,7 +326,7 @@ def _score_items(
|
|
|
316
326
|
tokens: List[str],
|
|
317
327
|
snippet_characters: int,
|
|
318
328
|
*,
|
|
319
|
-
extraction_reference: Optional[
|
|
329
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
320
330
|
) -> List[Evidence]:
|
|
321
331
|
"""
|
|
322
332
|
Score catalog items by token frequency and return evidence candidates.
|
|
@@ -366,8 +376,8 @@ def _score_items(
|
|
|
366
376
|
span_start=span_start,
|
|
367
377
|
span_end=span_end,
|
|
368
378
|
stage="scan",
|
|
369
|
-
|
|
370
|
-
|
|
379
|
+
configuration_id="",
|
|
380
|
+
snapshot_id="",
|
|
371
381
|
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
372
382
|
hash=hash_text(snippet),
|
|
373
383
|
)
|