biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Retriever interface for Biblicus retrieval engines.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Dict
|
|
9
|
+
|
|
10
|
+
from ..corpus import Corpus
|
|
11
|
+
from ..models import QueryBudget, RetrievalResult, RetrievalSnapshot
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Retriever(ABC):
    """
    Abstract base class describing the retriever contract.

    Subclasses must implement :meth:`build_snapshot` and :meth:`query`; both are
    declared abstract here and raise ``NotImplementedError`` if reached.

    :ivar retriever_id: Identifier string for the retriever.
    :vartype retriever_id: str
    """

    # Declared as an annotation only; no default value is assigned on the base class.
    retriever_id: str

    @abstractmethod
    def build_snapshot(
        self,
        corpus: Corpus,
        *,
        configuration_name: str,
        configuration: Dict[str, object],
    ) -> RetrievalSnapshot:
        """
        Build or register a retrieval snapshot for the retriever.

        :param corpus: Corpus to build against.
        :type corpus: Corpus
        :param configuration_name: Human-readable name for the configuration.
        :type configuration_name: str
        :param configuration: Retriever-specific configuration values.
        :type configuration: dict[str, object]
        :return: Snapshot manifest describing the build.
        :rtype: RetrievalSnapshot
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError

    @abstractmethod
    def query(
        self,
        corpus: Corpus,
        *,
        snapshot: RetrievalSnapshot,
        query_text: str,
        budget: QueryBudget,
    ) -> RetrievalResult:
        """
        Run a retrieval query against a retriever.

        :param corpus: Corpus associated with the snapshot.
        :type corpus: Corpus
        :param snapshot: Snapshot manifest to use for querying.
        :type snapshot: RetrievalSnapshot
        :param query_text: Query text to execute.
        :type query_text: str
        :param budget: Evidence selection budget.
        :type budget: QueryBudget
        :return: Retrieval results containing evidence.
        :rtype: RetrievalResult
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Shared primitives for embedding-index
|
|
2
|
+
Shared primitives for embedding-index retrievers.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -12,10 +12,11 @@ import numpy as np
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
13
13
|
|
|
14
14
|
from ..chunking import ChunkerConfig, TextChunk, TokenizerConfig
|
|
15
|
-
from ..
|
|
15
|
+
from ..constants import CORPUS_DIR_NAME, SNAPSHOTS_DIR_NAME
|
|
16
|
+
from ..corpus import Corpus
|
|
16
17
|
from ..embedding_providers import EmbeddingProviderConfig, _l2_normalize_rows
|
|
17
18
|
from ..frontmatter import parse_front_matter
|
|
18
|
-
from ..models import
|
|
19
|
+
from ..models import ExtractionSnapshotReference, parse_extraction_snapshot_reference
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class ChunkRecord(BaseModel):
|
|
@@ -43,54 +44,87 @@ class ChunkRecord(BaseModel):
|
|
|
43
44
|
return self
|
|
44
45
|
|
|
45
46
|
|
|
46
|
-
class
|
|
47
|
+
class EmbeddingIndexConfiguration(BaseModel):
|
|
47
48
|
"""
|
|
48
|
-
Configuration for embedding-index
|
|
49
|
+
Configuration for embedding-index retrievers.
|
|
49
50
|
|
|
50
|
-
:ivar
|
|
51
|
-
:vartype
|
|
52
|
-
:ivar extraction_run: Optional extraction run reference in the form extractor_id:run_id.
|
|
53
|
-
:vartype extraction_run: str or None
|
|
51
|
+
:ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
|
|
52
|
+
:vartype extraction_snapshot: str or None
|
|
54
53
|
:ivar chunker: Chunker configuration.
|
|
55
54
|
:vartype chunker: biblicus.chunking.ChunkerConfig
|
|
56
55
|
:ivar tokenizer: Optional tokenizer configuration.
|
|
57
56
|
:vartype tokenizer: biblicus.chunking.TokenizerConfig or None
|
|
58
57
|
:ivar embedding_provider: Embedding provider configuration.
|
|
59
58
|
:vartype embedding_provider: biblicus.embedding_providers.EmbeddingProviderConfig
|
|
59
|
+
:ivar snippet_characters: Optional maximum character count for returned evidence text.
|
|
60
|
+
:vartype snippet_characters: int or None
|
|
61
|
+
:ivar maximum_cache_total_items: Optional maximum number of vectors cached per scan batch.
|
|
62
|
+
:vartype maximum_cache_total_items: int or None
|
|
63
|
+
:ivar maximum_cache_total_characters: Optional maximum characters cached per scan batch.
|
|
64
|
+
:vartype maximum_cache_total_characters: int or None
|
|
60
65
|
"""
|
|
61
66
|
|
|
62
67
|
model_config = ConfigDict(extra="forbid")
|
|
63
68
|
|
|
64
|
-
snippet_characters: int = Field(default=
|
|
65
|
-
|
|
69
|
+
snippet_characters: Optional[int] = Field(default=None, ge=1)
|
|
70
|
+
maximum_cache_total_items: Optional[int] = Field(default=None, ge=1)
|
|
71
|
+
maximum_cache_total_characters: Optional[int] = Field(default=None, ge=1)
|
|
72
|
+
extraction_snapshot: Optional[str] = None
|
|
66
73
|
chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
|
|
67
74
|
tokenizer: Optional[TokenizerConfig] = None
|
|
68
75
|
embedding_provider: EmbeddingProviderConfig
|
|
69
76
|
|
|
70
77
|
|
|
78
|
+
def _extract_span_text(text: Optional[str], span: Tuple[int, int]) -> Optional[str]:
|
|
79
|
+
if not isinstance(text, str):
|
|
80
|
+
return None
|
|
81
|
+
span_start, span_end = span
|
|
82
|
+
if span_start < 0 or span_end <= span_start:
|
|
83
|
+
return text
|
|
84
|
+
return text[span_start:span_end]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _build_snippet(
|
|
88
|
+
text: Optional[str], span: Tuple[int, int], max_chars: Optional[int]
|
|
89
|
+
) -> Optional[str]:
|
|
90
|
+
if not isinstance(text, str):
|
|
91
|
+
return None
|
|
92
|
+
if max_chars is None:
|
|
93
|
+
return _extract_span_text(text, span)
|
|
94
|
+
if max_chars <= 0:
|
|
95
|
+
return ""
|
|
96
|
+
span_start, span_end = span
|
|
97
|
+
if span_start < 0 or span_end <= span_start:
|
|
98
|
+
return text[:max_chars]
|
|
99
|
+
half_window = max_chars // 2
|
|
100
|
+
snippet_start = max(span_start - half_window, 0)
|
|
101
|
+
snippet_end = min(span_end + half_window, len(text))
|
|
102
|
+
return text[snippet_start:snippet_end]
|
|
103
|
+
|
|
104
|
+
|
|
71
105
|
def resolve_extraction_reference(
|
|
72
|
-
corpus: Corpus,
|
|
73
|
-
) -> Optional[
|
|
106
|
+
corpus: Corpus, configuration: EmbeddingIndexConfiguration
|
|
107
|
+
) -> Optional[ExtractionSnapshotReference]:
|
|
74
108
|
"""
|
|
75
|
-
Resolve an extraction
|
|
109
|
+
Resolve an extraction snapshot reference from an embedding-index configuration.
|
|
76
110
|
|
|
77
|
-
:param corpus: Corpus associated with the
|
|
111
|
+
:param corpus: Corpus associated with the configuration.
|
|
78
112
|
:type corpus: Corpus
|
|
79
|
-
:param
|
|
80
|
-
:type
|
|
113
|
+
:param configuration: Parsed embedding-index configuration.
|
|
114
|
+
:type configuration: EmbeddingIndexConfiguration
|
|
81
115
|
:return: Parsed extraction reference or None.
|
|
82
|
-
:rtype:
|
|
83
|
-
:raises FileNotFoundError: If an extraction
|
|
116
|
+
:rtype: ExtractionSnapshotReference or None
|
|
117
|
+
:raises FileNotFoundError: If an extraction snapshot is referenced but not present.
|
|
84
118
|
"""
|
|
85
|
-
if not
|
|
119
|
+
if not configuration.extraction_snapshot:
|
|
86
120
|
return None
|
|
87
|
-
extraction_reference =
|
|
88
|
-
|
|
121
|
+
extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
|
|
122
|
+
snapshot_dir = corpus.extraction_snapshot_dir(
|
|
89
123
|
extractor_id=extraction_reference.extractor_id,
|
|
90
|
-
|
|
124
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
91
125
|
)
|
|
92
|
-
if not
|
|
93
|
-
raise FileNotFoundError(f"Missing extraction
|
|
126
|
+
if not snapshot_dir.is_dir():
|
|
127
|
+
raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
|
|
94
128
|
return extraction_reference
|
|
95
129
|
|
|
96
130
|
|
|
@@ -100,12 +134,12 @@ def _load_text_from_item(
|
|
|
100
134
|
item_id: str,
|
|
101
135
|
relpath: str,
|
|
102
136
|
media_type: str,
|
|
103
|
-
extraction_reference: Optional[
|
|
137
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
104
138
|
) -> Optional[str]:
|
|
105
139
|
if extraction_reference:
|
|
106
140
|
extracted_text = corpus.read_extracted_text(
|
|
107
141
|
extractor_id=extraction_reference.extractor_id,
|
|
108
|
-
|
|
142
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
109
143
|
item_id=item_id,
|
|
110
144
|
)
|
|
111
145
|
if isinstance(extracted_text, str):
|
|
@@ -120,7 +154,7 @@ def _load_text_from_item(
|
|
|
120
154
|
|
|
121
155
|
|
|
122
156
|
def iter_text_payloads(
|
|
123
|
-
corpus: Corpus, *, extraction_reference: Optional[
|
|
157
|
+
corpus: Corpus, *, extraction_reference: Optional[ExtractionSnapshotReference]
|
|
124
158
|
) -> Iterator[Tuple[object, str]]:
|
|
125
159
|
"""
|
|
126
160
|
Yield catalog items and their text payloads.
|
|
@@ -128,7 +162,7 @@ def iter_text_payloads(
|
|
|
128
162
|
:param corpus: Corpus containing the items.
|
|
129
163
|
:type corpus: Corpus
|
|
130
164
|
:param extraction_reference: Optional extraction reference.
|
|
131
|
-
:type extraction_reference:
|
|
165
|
+
:type extraction_reference: ExtractionSnapshotReference or None
|
|
132
166
|
:yield: (catalog_item, text) pairs.
|
|
133
167
|
:rtype: Iterator[tuple[object, str]]
|
|
134
168
|
"""
|
|
@@ -152,21 +186,21 @@ def iter_text_payloads(
|
|
|
152
186
|
|
|
153
187
|
|
|
154
188
|
def collect_chunks(
|
|
155
|
-
corpus: Corpus, *,
|
|
189
|
+
corpus: Corpus, *, configuration: EmbeddingIndexConfiguration
|
|
156
190
|
) -> Tuple[List[TextChunk], int]:
|
|
157
191
|
"""
|
|
158
192
|
Collect chunks from text payloads in a corpus.
|
|
159
193
|
|
|
160
194
|
:param corpus: Corpus to chunk.
|
|
161
195
|
:type corpus: Corpus
|
|
162
|
-
:param
|
|
163
|
-
:type
|
|
196
|
+
:param configuration: Parsed embedding-index configuration.
|
|
197
|
+
:type configuration: EmbeddingIndexConfiguration
|
|
164
198
|
:return: (chunks, text_item_count)
|
|
165
199
|
:rtype: tuple[list[TextChunk], int]
|
|
166
200
|
"""
|
|
167
|
-
tokenizer =
|
|
168
|
-
chunker =
|
|
169
|
-
extraction_reference = resolve_extraction_reference(corpus,
|
|
201
|
+
tokenizer = configuration.tokenizer.build_tokenizer() if configuration.tokenizer else None
|
|
202
|
+
chunker = configuration.chunker.build_chunker(tokenizer=tokenizer)
|
|
203
|
+
extraction_reference = resolve_extraction_reference(corpus, configuration)
|
|
170
204
|
|
|
171
205
|
chunks: List[TextChunk] = []
|
|
172
206
|
next_chunk_id = 0
|
|
@@ -284,18 +318,20 @@ def cosine_similarity_scores(embeddings: np.ndarray, query_vector: np.ndarray) -
|
|
|
284
318
|
return embeddings @ query_vector
|
|
285
319
|
|
|
286
320
|
|
|
287
|
-
def
|
|
321
|
+
def artifact_paths_for_snapshot(*, snapshot_id: str, retriever_id: str) -> Dict[str, str]:
|
|
288
322
|
"""
|
|
289
|
-
Build deterministic artifact relative paths for an embedding index
|
|
323
|
+
Build deterministic artifact relative paths for an embedding index snapshot.
|
|
290
324
|
|
|
291
|
-
:param
|
|
292
|
-
:type
|
|
293
|
-
:param
|
|
294
|
-
:type
|
|
325
|
+
:param snapshot_id: Snapshot identifier.
|
|
326
|
+
:type snapshot_id: str
|
|
327
|
+
:param retriever_id: Retriever identifier.
|
|
328
|
+
:type retriever_id: str
|
|
295
329
|
:return: Mapping with keys embeddings and chunks.
|
|
296
330
|
:rtype: dict[str, str]
|
|
297
331
|
"""
|
|
298
|
-
prefix = f"{
|
|
299
|
-
embeddings_relpath = str(
|
|
300
|
-
|
|
332
|
+
prefix = f"{snapshot_id}.{retriever_id}"
|
|
333
|
+
embeddings_relpath = str(
|
|
334
|
+
Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.embeddings.npy"
|
|
335
|
+
)
|
|
336
|
+
chunks_relpath = str(Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.chunks.jsonl")
|
|
301
337
|
return {"embeddings": embeddings_relpath, "chunks": chunks_relpath}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Embedding-index
|
|
2
|
+
Embedding-index retriever that reads the embedding matrix via memory mapping.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -10,13 +10,26 @@ from typing import Dict, List, Optional
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
12
|
from ..corpus import Corpus
|
|
13
|
-
from ..models import
|
|
14
|
-
|
|
13
|
+
from ..models import (
|
|
14
|
+
Evidence,
|
|
15
|
+
ExtractionSnapshotReference,
|
|
16
|
+
QueryBudget,
|
|
17
|
+
RetrievalResult,
|
|
18
|
+
RetrievalSnapshot,
|
|
19
|
+
)
|
|
20
|
+
from ..retrieval import (
|
|
21
|
+
apply_budget,
|
|
22
|
+
create_configuration_manifest,
|
|
23
|
+
create_snapshot_manifest,
|
|
24
|
+
hash_text,
|
|
25
|
+
)
|
|
15
26
|
from ..time import utc_now_iso
|
|
16
27
|
from .embedding_index_common import (
|
|
17
28
|
ChunkRecord,
|
|
18
|
-
|
|
19
|
-
|
|
29
|
+
EmbeddingIndexConfiguration,
|
|
30
|
+
_build_snippet,
|
|
31
|
+
_extract_span_text,
|
|
32
|
+
artifact_paths_for_snapshot,
|
|
20
33
|
chunks_to_records,
|
|
21
34
|
collect_chunks,
|
|
22
35
|
cosine_similarity_scores,
|
|
@@ -26,48 +39,54 @@ from .embedding_index_common import (
|
|
|
26
39
|
write_chunks_jsonl,
|
|
27
40
|
write_embeddings,
|
|
28
41
|
)
|
|
29
|
-
from .scan import _build_snippet
|
|
30
42
|
|
|
31
43
|
|
|
32
|
-
class
|
|
44
|
+
class EmbeddingIndexFileRetriever:
|
|
33
45
|
"""
|
|
34
|
-
Embedding retrieval
|
|
46
|
+
Embedding-index retriever using memory-mapped similarity scanning.
|
|
35
47
|
"""
|
|
36
48
|
|
|
37
|
-
|
|
49
|
+
retriever_id = "embedding-index-file"
|
|
38
50
|
|
|
39
|
-
def
|
|
40
|
-
self, corpus: Corpus, *,
|
|
41
|
-
) ->
|
|
51
|
+
def build_snapshot(
|
|
52
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
53
|
+
) -> RetrievalSnapshot:
|
|
42
54
|
"""
|
|
43
|
-
Build an embedding index
|
|
55
|
+
Build an embedding index snapshot by chunking text payloads and materializing embeddings.
|
|
44
56
|
|
|
45
57
|
:param corpus: Corpus to build against.
|
|
46
58
|
:type corpus: Corpus
|
|
47
|
-
:param
|
|
48
|
-
:type
|
|
49
|
-
:param
|
|
50
|
-
:type
|
|
51
|
-
:return:
|
|
52
|
-
:rtype: biblicus.models.
|
|
59
|
+
:param configuration_name: Human-readable configuration name.
|
|
60
|
+
:type configuration_name: str
|
|
61
|
+
:param configuration: Retriever-specific configuration values.
|
|
62
|
+
:type configuration: dict[str, object]
|
|
63
|
+
:return: Snapshot manifest describing the build.
|
|
64
|
+
:rtype: biblicus.models.RetrievalSnapshot
|
|
53
65
|
"""
|
|
54
|
-
|
|
55
|
-
chunks, text_items = collect_chunks(corpus,
|
|
66
|
+
parsed_config = EmbeddingIndexConfiguration.model_validate(configuration)
|
|
67
|
+
chunks, text_items = collect_chunks(corpus, configuration=parsed_config)
|
|
56
68
|
|
|
57
|
-
provider =
|
|
69
|
+
provider = parsed_config.embedding_provider.build_provider()
|
|
58
70
|
embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)
|
|
59
71
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
name=
|
|
63
|
-
|
|
72
|
+
configuration_manifest = create_configuration_manifest(
|
|
73
|
+
retriever_id=self.retriever_id,
|
|
74
|
+
name=configuration_name,
|
|
75
|
+
configuration=parsed_config.model_dump(),
|
|
76
|
+
)
|
|
77
|
+
snapshot = create_snapshot_manifest(
|
|
78
|
+
corpus,
|
|
79
|
+
configuration=configuration_manifest,
|
|
80
|
+
stats={},
|
|
81
|
+
snapshot_artifacts=[],
|
|
64
82
|
)
|
|
65
|
-
run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
|
|
66
83
|
|
|
67
|
-
paths =
|
|
84
|
+
paths = artifact_paths_for_snapshot(
|
|
85
|
+
snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
|
|
86
|
+
)
|
|
68
87
|
embeddings_path = corpus.root / paths["embeddings"]
|
|
69
88
|
chunks_path = corpus.root / paths["chunks"]
|
|
70
|
-
corpus.
|
|
89
|
+
corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
71
90
|
|
|
72
91
|
write_embeddings(embeddings_path, embeddings)
|
|
73
92
|
write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
|
|
@@ -79,30 +98,33 @@ class EmbeddingIndexFileBackend:
|
|
|
79
98
|
"dimensions": (
|
|
80
99
|
int(embeddings.shape[1])
|
|
81
100
|
if embeddings.size
|
|
82
|
-
else
|
|
101
|
+
else parsed_config.embedding_provider.dimensions
|
|
83
102
|
),
|
|
84
103
|
}
|
|
85
|
-
|
|
86
|
-
update={
|
|
104
|
+
snapshot = snapshot.model_copy(
|
|
105
|
+
update={
|
|
106
|
+
"snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
|
|
107
|
+
"stats": stats,
|
|
108
|
+
}
|
|
87
109
|
)
|
|
88
|
-
corpus.
|
|
89
|
-
return
|
|
110
|
+
corpus.write_snapshot(snapshot)
|
|
111
|
+
return snapshot
|
|
90
112
|
|
|
91
113
|
def query(
|
|
92
114
|
self,
|
|
93
115
|
corpus: Corpus,
|
|
94
116
|
*,
|
|
95
|
-
|
|
117
|
+
snapshot: RetrievalSnapshot,
|
|
96
118
|
query_text: str,
|
|
97
119
|
budget: QueryBudget,
|
|
98
120
|
) -> RetrievalResult:
|
|
99
121
|
"""
|
|
100
|
-
Query an embedding index
|
|
122
|
+
Query an embedding index snapshot and return ranked evidence.
|
|
101
123
|
|
|
102
|
-
:param corpus: Corpus associated with the
|
|
124
|
+
:param corpus: Corpus associated with the snapshot.
|
|
103
125
|
:type corpus: Corpus
|
|
104
|
-
:param
|
|
105
|
-
:type
|
|
126
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
127
|
+
:type snapshot: biblicus.models.RetrievalSnapshot
|
|
106
128
|
:param query_text: Query text to embed.
|
|
107
129
|
:type query_text: str
|
|
108
130
|
:param budget: Evidence selection budget.
|
|
@@ -110,14 +132,18 @@ class EmbeddingIndexFileBackend:
|
|
|
110
132
|
:return: Retrieval results containing evidence.
|
|
111
133
|
:rtype: biblicus.models.RetrievalResult
|
|
112
134
|
"""
|
|
113
|
-
|
|
114
|
-
|
|
135
|
+
parsed_config = EmbeddingIndexConfiguration.model_validate(
|
|
136
|
+
snapshot.configuration.configuration
|
|
137
|
+
)
|
|
138
|
+
extraction_reference = resolve_extraction_reference(corpus, parsed_config)
|
|
115
139
|
|
|
116
|
-
paths =
|
|
140
|
+
paths = artifact_paths_for_snapshot(
|
|
141
|
+
snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
|
|
142
|
+
)
|
|
117
143
|
embeddings_path = corpus.root / paths["embeddings"]
|
|
118
144
|
chunks_path = corpus.root / paths["chunks"]
|
|
119
145
|
if not embeddings_path.is_file() or not chunks_path.is_file():
|
|
120
|
-
raise FileNotFoundError("Embedding index artifacts are missing for this
|
|
146
|
+
raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")
|
|
121
147
|
|
|
122
148
|
embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
|
|
123
149
|
chunk_records = read_chunks_jsonl(chunks_path)
|
|
@@ -127,20 +153,22 @@ class EmbeddingIndexFileBackend:
|
|
|
127
153
|
"embeddings row count does not match chunk record count"
|
|
128
154
|
)
|
|
129
155
|
|
|
130
|
-
provider =
|
|
156
|
+
provider = parsed_config.embedding_provider.build_provider()
|
|
131
157
|
query_embedding = provider.embed_texts([query_text]).astype(np.float32)
|
|
132
158
|
if query_embedding.shape[0] != 1:
|
|
133
159
|
raise ValueError("Embedding provider returned an invalid query embedding shape")
|
|
134
160
|
|
|
161
|
+
batch_rows = parsed_config.maximum_cache_total_items or 4096
|
|
135
162
|
candidates = _top_indices_batched(
|
|
136
163
|
embeddings=embeddings,
|
|
137
164
|
query_vector=query_embedding[0],
|
|
138
165
|
limit=_candidate_limit(budget.max_total_items + budget.offset),
|
|
166
|
+
batch_rows=batch_rows,
|
|
139
167
|
)
|
|
140
168
|
evidence_items = _build_evidence(
|
|
141
169
|
corpus,
|
|
142
|
-
|
|
143
|
-
|
|
170
|
+
snapshot=snapshot,
|
|
171
|
+
configuration=parsed_config,
|
|
144
172
|
candidates=candidates,
|
|
145
173
|
embeddings=embeddings,
|
|
146
174
|
query_vector=query_embedding[0],
|
|
@@ -149,7 +177,11 @@ class EmbeddingIndexFileBackend:
|
|
|
149
177
|
)
|
|
150
178
|
ranked = [
|
|
151
179
|
item.model_copy(
|
|
152
|
-
update={
|
|
180
|
+
update={
|
|
181
|
+
"rank": index,
|
|
182
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
183
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
184
|
+
}
|
|
153
185
|
)
|
|
154
186
|
for index, item in enumerate(evidence_items, start=1)
|
|
155
187
|
]
|
|
@@ -157,9 +189,9 @@ class EmbeddingIndexFileBackend:
|
|
|
157
189
|
return RetrievalResult(
|
|
158
190
|
query_text=query_text,
|
|
159
191
|
budget=budget,
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
192
|
+
snapshot_id=snapshot.snapshot_id,
|
|
193
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
194
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
163
195
|
generated_at=utc_now_iso(),
|
|
164
196
|
evidence=evidence,
|
|
165
197
|
stats={"candidates": len(evidence_items), "returned": len(evidence)},
|
|
@@ -202,13 +234,13 @@ def _top_indices_batched(
|
|
|
202
234
|
def _build_evidence(
|
|
203
235
|
corpus: Corpus,
|
|
204
236
|
*,
|
|
205
|
-
|
|
206
|
-
|
|
237
|
+
snapshot: RetrievalSnapshot,
|
|
238
|
+
configuration: EmbeddingIndexConfiguration,
|
|
207
239
|
candidates: List[int],
|
|
208
240
|
embeddings: np.ndarray,
|
|
209
241
|
query_vector: np.ndarray,
|
|
210
242
|
chunk_records: List[ChunkRecord],
|
|
211
|
-
extraction_reference: Optional[
|
|
243
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
212
244
|
) -> List[Evidence]:
|
|
213
245
|
catalog = corpus.load_catalog()
|
|
214
246
|
evidence_items: List[Evidence] = []
|
|
@@ -222,9 +254,11 @@ def _build_evidence(
|
|
|
222
254
|
media_type=str(getattr(catalog_item, "media_type")),
|
|
223
255
|
extraction_reference=extraction_reference,
|
|
224
256
|
)
|
|
225
|
-
|
|
226
|
-
text, (record.span_start, record.span_end),
|
|
257
|
+
span_text = _build_snippet(
|
|
258
|
+
text, (record.span_start, record.span_end), configuration.snippet_characters
|
|
227
259
|
)
|
|
260
|
+
if span_text is None:
|
|
261
|
+
span_text = _extract_span_text(text, (record.span_start, record.span_end))
|
|
228
262
|
score = float(cosine_similarity_scores(embeddings[idx : idx + 1], query_vector)[0])
|
|
229
263
|
evidence_items.append(
|
|
230
264
|
Evidence(
|
|
@@ -233,15 +267,16 @@ def _build_evidence(
|
|
|
233
267
|
media_type=str(getattr(catalog_item, "media_type")),
|
|
234
268
|
score=score,
|
|
235
269
|
rank=1,
|
|
236
|
-
text=
|
|
270
|
+
text=span_text,
|
|
237
271
|
content_ref=None,
|
|
238
272
|
span_start=record.span_start,
|
|
239
273
|
span_end=record.span_end,
|
|
240
|
-
stage=
|
|
274
|
+
stage=EmbeddingIndexFileRetriever.retriever_id,
|
|
241
275
|
stage_scores=None,
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
276
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
277
|
+
snapshot_id=snapshot.snapshot_id,
|
|
278
|
+
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
279
|
+
hash=hash_text(span_text or ""),
|
|
245
280
|
)
|
|
246
281
|
)
|
|
247
282
|
return evidence_items
|
|
@@ -253,7 +288,7 @@ def _load_text_for_evidence(
|
|
|
253
288
|
item_id: str,
|
|
254
289
|
relpath: str,
|
|
255
290
|
media_type: str,
|
|
256
|
-
extraction_reference: Optional[
|
|
291
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
257
292
|
) -> Optional[str]:
|
|
258
293
|
from .embedding_index_common import _load_text_from_item
|
|
259
294
|
|