biblicus-0.15.1-py3-none-any.whl → biblicus-0.16.0-py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/markov.py +35 -3
- biblicus/backends/__init__.py +6 -2
- biblicus/backends/embedding_index_common.py +301 -0
- biblicus/backends/embedding_index_file.py +266 -0
- biblicus/backends/embedding_index_inmemory.py +268 -0
- biblicus/backends/hybrid.py +4 -2
- biblicus/backends/sqlite_full_text_search.py +1 -1
- biblicus/backends/{vector.py → tf_vector.py} +11 -11
- biblicus/chunking.py +396 -0
- biblicus/cli.py +50 -10
- biblicus/embedding_providers.py +122 -0
- biblicus/frontmatter.py +2 -0
- biblicus/models.py +9 -0
- biblicus/retrieval.py +5 -0
- {biblicus-0.15.1.dist-info → biblicus-0.16.0.dist-info}/METADATA +2 -1
- {biblicus-0.15.1.dist-info → biblicus-0.16.0.dist-info}/RECORD +21 -16
- {biblicus-0.15.1.dist-info → biblicus-0.16.0.dist-info}/WHEEL +0 -0
- {biblicus-0.15.1.dist-info → biblicus-0.16.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.15.1.dist-info → biblicus-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.15.1.dist-info → biblicus-0.16.0.dist-info}/top_level.txt +0 -0
biblicus/backends/embedding_index_inmemory.py
ADDED
@@ -0,0 +1,268 @@
+"""
+Embedding-index retrieval backend that loads the full embedding matrix into memory at query time.
+"""
+
+from __future__ import annotations
+
+from typing import Dict, List, Optional
+
+import numpy as np
+from pydantic import ConfigDict, Field
+
+from ..corpus import Corpus
+from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult, RetrievalRun
+from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
+from ..time import utc_now_iso
+from .embedding_index_common import (
+    ChunkRecord,
+    EmbeddingIndexRecipeConfig,
+    artifact_paths_for_run,
+    chunks_to_records,
+    collect_chunks,
+    cosine_similarity_scores,
+    read_chunks_jsonl,
+    read_embeddings,
+    resolve_extraction_reference,
+    write_chunks_jsonl,
+    write_embeddings,
+)
+from .scan import _build_snippet
+
+
+class EmbeddingIndexInMemoryRecipeConfig(EmbeddingIndexRecipeConfig):
+    """
+    Configuration for embedding-index-inmemory retrieval.
+
+    :ivar max_chunks: Maximum chunks allowed for in-memory query loading.
+    :vartype max_chunks: int
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    max_chunks: int = Field(default=25000, ge=1)
+
+
+class EmbeddingIndexInMemoryBackend:
+    """
+    Embedding retrieval backend using an in-memory similarity scan.
+    """
+
+    backend_id = "embedding-index-inmemory"
+
+    def build_run(
+        self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
+    ) -> RetrievalRun:
+        """
+        Build an embedding index run by chunking text payloads and materializing embeddings.
+
+        :param corpus: Corpus to build against.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Backend-specific configuration values.
+        :type config: dict[str, object]
+        :return: Run manifest describing the build.
+        :rtype: biblicus.models.RetrievalRun
+        """
+        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(config)
+        chunks, text_items = collect_chunks(corpus, recipe_config=recipe_config)
+        if len(chunks) > recipe_config.max_chunks:
+            raise ValueError(
+                "embedding-index-inmemory exceeded max_chunks. "
+                "Use embedding-index-file or increase max_chunks."
+            )
+
+        provider = recipe_config.embedding_provider.build_provider()
+        chunk_texts = [chunk.text for chunk in chunks]
+        embeddings = provider.embed_texts(chunk_texts)
+        embeddings = embeddings.astype(np.float32)
+
+        recipe = create_recipe_manifest(
+            backend_id=self.backend_id,
+            name=recipe_name,
+            config=recipe_config.model_dump(),
+        )
+        run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
+
+        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        embeddings_path = corpus.root / paths["embeddings"]
+        chunks_path = corpus.root / paths["chunks"]
+        corpus.runs_dir.mkdir(parents=True, exist_ok=True)
+
+        write_embeddings(embeddings_path, embeddings)
+        write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
+
+        stats = {
+            "items": len(corpus.load_catalog().items),
+            "text_items": text_items,
+            "chunks": len(chunks),
+            "dimensions": (
+                int(embeddings.shape[1])
+                if embeddings.size
+                else recipe_config.embedding_provider.dimensions
+            ),
+        }
+        run = run.model_copy(
+            update={"artifact_paths": [paths["embeddings"], paths["chunks"]], "stats": stats}
+        )
+        corpus.write_run(run)
+        return run
+
+    def query(
+        self,
+        corpus: Corpus,
+        *,
+        run: RetrievalRun,
+        query_text: str,
+        budget: QueryBudget,
+    ) -> RetrievalResult:
+        """
+        Query an embedding index run and return ranked evidence.
+
+        :param corpus: Corpus associated with the run.
+        :type corpus: Corpus
+        :param run: Run manifest to use for querying.
+        :type run: biblicus.models.RetrievalRun
+        :param query_text: Query text to embed.
+        :type query_text: str
+        :param budget: Evidence selection budget.
+        :type budget: biblicus.models.QueryBudget
+        :return: Retrieval results containing evidence.
+        :rtype: biblicus.models.RetrievalResult
+        """
+        recipe_config = EmbeddingIndexInMemoryRecipeConfig.model_validate(run.recipe.config)
+        extraction_reference = resolve_extraction_reference(corpus, recipe_config)
+
+        paths = artifact_paths_for_run(run_id=run.run_id, backend_id=self.backend_id)
+        embeddings_path = corpus.root / paths["embeddings"]
+        chunks_path = corpus.root / paths["chunks"]
+        if not embeddings_path.is_file() or not chunks_path.is_file():
+            raise FileNotFoundError("Embedding index artifacts are missing for this run")
+
+        embeddings = read_embeddings(embeddings_path, mmap=False).astype(np.float32)
+        chunk_records = read_chunks_jsonl(chunks_path)
+        if embeddings.shape[0] != len(chunk_records):
+            raise ValueError(
+                "Embedding index artifacts are inconsistent: "
+                "embeddings row count does not match chunk record count"
+            )
+
+        provider = recipe_config.embedding_provider.build_provider()
+        query_embedding = provider.embed_texts([query_text]).astype(np.float32)
+        if query_embedding.shape[0] != 1:
+            raise ValueError("Embedding provider returned an invalid query embedding shape")
+        scores = cosine_similarity_scores(embeddings, query_embedding[0])
+
+        candidates = _top_indices(
+            scores,
+            limit=_candidate_limit(budget.max_total_items + budget.offset),
+        )
+        evidence_items = _build_evidence(
+            corpus,
+            run=run,
+            recipe_config=recipe_config,
+            candidates=candidates,
+            scores=scores,
+            chunk_records=chunk_records,
+            extraction_reference=extraction_reference,
+        )
+        ranked = [
+            item.model_copy(
+                update={"rank": index, "recipe_id": run.recipe.recipe_id, "run_id": run.run_id}
+            )
+            for index, item in enumerate(evidence_items, start=1)
+        ]
+        evidence = apply_budget(ranked, budget)
+        return RetrievalResult(
+            query_text=query_text,
+            budget=budget,
+            run_id=run.run_id,
+            recipe_id=run.recipe.recipe_id,
+            backend_id=self.backend_id,
+            generated_at=utc_now_iso(),
+            evidence=evidence,
+            stats={"candidates": len(evidence_items), "returned": len(evidence)},
+        )
+
+
+def _candidate_limit(max_total_items: int, *, multiplier: int = 10) -> int:
+    return max(1, int(max_total_items) * int(multiplier))
+
+
+def _top_indices(scores: np.ndarray, *, limit: int) -> List[int]:
+    if scores.size == 0:
+        return []
+    limit = min(int(limit), int(scores.size))
+    indices = np.argpartition(-scores, limit - 1)[:limit]
+    sorted_indices = indices[np.argsort(-scores[indices])]
+    return [int(index) for index in sorted_indices]
+
+
+def _build_evidence(
+    corpus: Corpus,
+    *,
+    run: RetrievalRun,
+    recipe_config: EmbeddingIndexInMemoryRecipeConfig,
+    candidates: List[int],
+    scores: np.ndarray,
+    chunk_records: List[ChunkRecord],
+    extraction_reference: Optional[ExtractionRunReference],
+) -> List[Evidence]:
+    catalog = corpus.load_catalog()
+    evidence_items: List[Evidence] = []
+    for idx in candidates:
+        record = chunk_records[idx]
+        item_id = record.item_id
+        span_start = record.span_start
+        span_end = record.span_end
+        catalog_item = catalog.items[item_id]
+        relpath = str(getattr(catalog_item, "relpath"))
+        media_type = str(getattr(catalog_item, "media_type"))
+        text = _load_text_for_evidence(
+            corpus,
+            item_id=item_id,
+            relpath=relpath,
+            media_type=media_type,
+            extraction_reference=extraction_reference,
+        )
+        snippet = _build_snippet(
+            text, (span_start, span_end), max_chars=recipe_config.snippet_characters
+        )
+        evidence_items.append(
+            Evidence(
+                item_id=item_id,
+                source_uri=getattr(catalog_item, "source_uri", None),
+                media_type=media_type,
+                score=float(scores[idx]),
+                rank=1,
+                text=snippet,
+                content_ref=None,
+                span_start=span_start,
+                span_end=span_end,
+                stage=EmbeddingIndexInMemoryBackend.backend_id,
+                stage_scores=None,
+                recipe_id=run.recipe.recipe_id,
+                run_id=run.run_id,
+                hash=hash_text(snippet),
+            )
+        )
+    return evidence_items
+
+
+def _load_text_for_evidence(
+    corpus: Corpus,
+    *,
+    item_id: str,
+    relpath: str,
+    media_type: str,
+    extraction_reference: Optional[ExtractionRunReference],
+) -> Optional[str]:
+    from .embedding_index_common import _load_text_from_item
+
+    return _load_text_from_item(
+        corpus,
+        item_id=item_id,
+        relpath=relpath,
+        media_type=media_type,
+        extraction_reference=extraction_reference,
+    )
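For orientation, a minimal usage sketch of the new backend follows. It is not taken from the package: the corpus path, recipe name, and config values are illustrative, and the Corpus construction and QueryBudget defaults are assumptions inferred from the signatures visible in this diff.

    from biblicus.backends.embedding_index_inmemory import EmbeddingIndexInMemoryBackend
    from biblicus.corpus import Corpus
    from biblicus.models import QueryBudget

    corpus = Corpus(root="./corpus")  # assumed constructor; adjust to the real API
    backend = EmbeddingIndexInMemoryBackend()

    # build_run chunks text payloads, embeds every chunk, and writes the
    # embeddings + chunks artifacts under the corpus runs directory.
    run = backend.build_run(
        corpus,
        recipe_name="demo-embedding-recipe",  # illustrative name
        config={"max_chunks": 25000},  # remaining keys per EmbeddingIndexRecipeConfig
    )

    # query embeds the query text, cosine-scores it against the full matrix,
    # and applies the evidence budget (note the offset-aware candidate limit).
    result = backend.query(
        corpus,
        run=run,
        query_text="covenant renewal",
        budget=QueryBudget(max_total_items=5, offset=0),
    )
    for evidence in result.evidence:
        print(evidence.rank, round(evidence.score, 3), evidence.item_id)

Worth noting in the implementation above: _top_indices uses np.argpartition to select the top `limit` scores in linear time and sorts only those winners, so no full sort of the score vector happens on any query.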
biblicus/backends/hybrid.py
CHANGED
@@ -35,7 +35,7 @@ class HybridRecipeConfig(BaseModel):
     model_config = ConfigDict(extra="forbid")

     lexical_backend: str = Field(default="sqlite-full-text-search", min_length=1)
-    embedding_backend: str = Field(default="vector", min_length=1)
+    embedding_backend: str = Field(default="tf-vector", min_length=1)
     lexical_weight: float = Field(default=0.5, ge=0, le=1)
     embedding_weight: float = Field(default=0.5, ge=0, le=1)
     lexical_config: Dict[str, object] = Field(default_factory=dict)
@@ -226,8 +226,10 @@ def _expand_component_budget(budget: QueryBudget, *, multiplier: int = 5) -> Que
         if budget.max_items_per_source is not None
         else None
     )
+    requested_items = budget.max_total_items + budget.offset
     return QueryBudget(
-        max_total_items=budget.max_total_items * multiplier,
+        max_total_items=requested_items * multiplier,
+        offset=0,
         max_total_characters=expanded_characters,
         max_items_per_source=expanded_max_items_per_source,
     )
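The second hunk fixes pagination for hybrid queries: component backends were previously asked for candidates based on max_total_items alone, so a non-zero offset could exhaust the candidate pool before the requested page was reached. A standalone sketch of the patched expansion logic (not the package's code; Budget is a stand-in for QueryBudget):

    from dataclasses import dataclass

    @dataclass
    class Budget:
        max_total_items: int
        offset: int = 0

    def expand_component_budget(budget: Budget, multiplier: int = 5) -> Budget:
        # Fold the offset into the requested item count so deep pages
        # still have enough fused candidates to draw from.
        requested_items = budget.max_total_items + budget.offset
        # Components are queried from rank zero; the offset is applied
        # only after fusion, hence offset=0 here.
        return Budget(max_total_items=requested_items * multiplier, offset=0)

    print(expand_component_budget(Budget(max_total_items=10, offset=30)))
    # Budget(max_total_items=200, offset=0)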
biblicus/backends/sqlite_full_text_search.py
CHANGED
@@ -231,7 +231,7 @@ class SqliteFullTextSearchBackend:
         candidates = _query_full_text_search_index(
             db_path=db_path,
             query_text=" ".join(filtered_tokens),
-            limit=_candidate_limit(budget.max_total_items),
+            limit=_candidate_limit(budget.max_total_items + budget.offset),
             snippet_characters=recipe_config.snippet_characters,
         )
         sorted_candidates = _rank_candidates(candidates)
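This is the same offset fix applied to the full-text index: the candidate limit now grows with the offset, so deep pages remain reachable. Illustrative arithmetic (the multiplier is a stand-in; this diff does not show the sqlite backend's _candidate_limit definition):

    def candidate_limit(requested_items: int, multiplier: int = 10) -> int:
        return max(1, requested_items * multiplier)

    max_total_items, offset = 5, 100
    print(candidate_limit(max_total_items))           # 50 rows: cannot skip 100 and still fill the page
    print(candidate_limit(max_total_items + offset))  # 1050 rows: offset 100 leaves plenty to return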
biblicus/backends/{vector.py → tf_vector.py}
RENAMED
@@ -24,9 +24,9 @@ from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifes
 from ..time import utc_now_iso


-class VectorRecipeConfig(BaseModel):
+class TfVectorRecipeConfig(BaseModel):
     """
-    Configuration for the vector retrieval backend.
+    Configuration for the term-frequency vector retrieval backend.

     :ivar snippet_characters: Maximum characters to include in evidence snippets.
     :vartype snippet_characters: int
@@ -40,7 +40,7 @@ class VectorRecipeConfig(BaseModel):
     extraction_run: Optional[str] = None


-class VectorBackend:
+class TfVectorBackend:
     """
     Deterministic vector backend using term-frequency cosine similarity.

@@ -48,7 +48,7 @@ class VectorBackend:
     :vartype backend_id: str
     """

-    backend_id = "vector"
+    backend_id = "tf-vector"

     def build_run(
         self, corpus: Corpus, *, recipe_name: str, config: Dict[str, object]
@@ -65,7 +65,7 @@ class VectorBackend:
         :return: Run manifest describing the build.
         :rtype: RetrievalRun
         """
-        recipe_config = VectorRecipeConfig.model_validate(config)
+        recipe_config = TfVectorRecipeConfig.model_validate(config)
         catalog = corpus.load_catalog()
         recipe = create_recipe_manifest(
             backend_id=self.backend_id,
@@ -102,7 +102,7 @@ class VectorBackend:
         :return: Retrieval results containing evidence.
         :rtype: RetrievalResult
         """
-        recipe_config = VectorRecipeConfig.model_validate(run.recipe.config)
+        recipe_config = TfVectorRecipeConfig.model_validate(run.recipe.config)
         query_tokens = _tokenize_text(query_text)
         if not query_tokens:
             return RetrievalResult(
@@ -157,7 +157,7 @@ class VectorBackend:


 def _resolve_extraction_reference(
-    corpus: Corpus, recipe_config: VectorRecipeConfig
+    corpus: Corpus, recipe_config: TfVectorRecipeConfig
 ) -> Optional[ExtractionRunReference]:
     """
     Resolve an extraction run reference from a recipe config.
@@ -165,7 +165,7 @@ def _resolve_extraction_reference(
     :param corpus: Corpus associated with the recipe.
     :type corpus: Corpus
     :param recipe_config: Parsed vector recipe configuration.
-    :type recipe_config: VectorRecipeConfig
+    :type recipe_config: TfVectorRecipeConfig
     :return: Parsed extraction reference or None.
     :rtype: ExtractionRunReference or None
     :raises FileNotFoundError: If an extraction run is referenced but not present.
@@ -183,7 +183,7 @@ def _resolve_extraction_reference(


 def _count_text_items(
-    corpus: Corpus, items: Iterable[object], recipe_config: VectorRecipeConfig
+    corpus: Corpus, items: Iterable[object], recipe_config: TfVectorRecipeConfig
 ) -> int:
     """
     Count catalog items that represent text content.
@@ -193,7 +193,7 @@ def _count_text_items(
     :param items: Catalog items to inspect.
     :type items: Iterable[object]
     :param recipe_config: Parsed vector recipe configuration.
-    :type recipe_config: VectorRecipeConfig
+    :type recipe_config: TfVectorRecipeConfig
     :return: Number of text items.
     :rtype: int
     """
@@ -451,7 +451,7 @@ def _score_items(
             content_ref=None,
             span_start=span_start,
             span_end=span_end,
-            stage="vector",
+            stage="tf-vector",
             recipe_id="",
             run_id="",
             hash=hash_text(snippet),
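The rename clarifies what this backend actually computes: term-frequency vectors scored by cosine similarity, as distinct from the new embedding-index backends. A self-contained sketch of that scoring technique (not the package's implementation, which also handles tokenization, snippets, and budgets):

    import math
    from collections import Counter

    def tf_cosine(query_tokens: list, doc_tokens: list) -> float:
        # Build term-frequency vectors and compute their cosine similarity.
        query_tf, doc_tf = Counter(query_tokens), Counter(doc_tokens)
        dot = sum(count * doc_tf[term] for term, count in query_tf.items())
        norm = math.sqrt(sum(c * c for c in query_tf.values())) * math.sqrt(
            sum(c * c for c in doc_tf.values())
        )
        return dot / norm if norm else 0.0

    print(round(tf_cosine("in the beginning".split(), "the beginning of the word".split()), 3))
    # 0.655

Because the scores depend only on token counts, results are reproducible across runs, which is what the docstring means by a deterministic vector backend.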