biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Shared primitives for embedding-index
|
|
2
|
+
Shared primitives for embedding-index retrievers.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -12,10 +12,11 @@ import numpy as np
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
13
13
|
|
|
14
14
|
from ..chunking import ChunkerConfig, TextChunk, TokenizerConfig
|
|
15
|
-
from ..
|
|
15
|
+
from ..constants import CORPUS_DIR_NAME, SNAPSHOTS_DIR_NAME
|
|
16
|
+
from ..corpus import Corpus
|
|
16
17
|
from ..embedding_providers import EmbeddingProviderConfig, _l2_normalize_rows
|
|
17
18
|
from ..frontmatter import parse_front_matter
|
|
18
|
-
from ..models import
|
|
19
|
+
from ..models import ExtractionSnapshotReference, parse_extraction_snapshot_reference
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class ChunkRecord(BaseModel):
|
|
@@ -43,12 +44,12 @@ class ChunkRecord(BaseModel):
|
|
|
43
44
|
return self
|
|
44
45
|
|
|
45
46
|
|
|
46
|
-
class
|
|
47
|
+
class EmbeddingIndexConfiguration(BaseModel):
|
|
47
48
|
"""
|
|
48
|
-
Configuration for embedding-index
|
|
49
|
+
Configuration for embedding-index retrievers.
|
|
49
50
|
|
|
50
|
-
:ivar
|
|
51
|
-
:vartype
|
|
51
|
+
:ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
|
|
52
|
+
:vartype extraction_snapshot: str or None
|
|
52
53
|
:ivar chunker: Chunker configuration.
|
|
53
54
|
:vartype chunker: biblicus.chunking.ChunkerConfig
|
|
54
55
|
:ivar tokenizer: Optional tokenizer configuration.
|
|
@@ -68,7 +69,7 @@ class EmbeddingIndexRecipeConfig(BaseModel):
|
|
|
68
69
|
snippet_characters: Optional[int] = Field(default=None, ge=1)
|
|
69
70
|
maximum_cache_total_items: Optional[int] = Field(default=None, ge=1)
|
|
70
71
|
maximum_cache_total_characters: Optional[int] = Field(default=None, ge=1)
|
|
71
|
-
|
|
72
|
+
extraction_snapshot: Optional[str] = None
|
|
72
73
|
chunker: ChunkerConfig = Field(default_factory=lambda: ChunkerConfig(chunker_id="paragraph"))
|
|
73
74
|
tokenizer: Optional[TokenizerConfig] = None
|
|
74
75
|
embedding_provider: EmbeddingProviderConfig
|
|
@@ -102,28 +103,28 @@ def _build_snippet(
|
|
|
102
103
|
|
|
103
104
|
|
|
104
105
|
def resolve_extraction_reference(
|
|
105
|
-
corpus: Corpus,
|
|
106
|
-
) -> Optional[
|
|
106
|
+
corpus: Corpus, configuration: EmbeddingIndexConfiguration
|
|
107
|
+
) -> Optional[ExtractionSnapshotReference]:
|
|
107
108
|
"""
|
|
108
|
-
Resolve an extraction
|
|
109
|
+
Resolve an extraction snapshot reference from an embedding-index configuration.
|
|
109
110
|
|
|
110
|
-
:param corpus: Corpus associated with the
|
|
111
|
+
:param corpus: Corpus associated with the configuration.
|
|
111
112
|
:type corpus: Corpus
|
|
112
|
-
:param
|
|
113
|
-
:type
|
|
113
|
+
:param configuration: Parsed embedding-index configuration.
|
|
114
|
+
:type configuration: EmbeddingIndexConfiguration
|
|
114
115
|
:return: Parsed extraction reference or None.
|
|
115
|
-
:rtype:
|
|
116
|
-
:raises FileNotFoundError: If an extraction
|
|
116
|
+
:rtype: ExtractionSnapshotReference or None
|
|
117
|
+
:raises FileNotFoundError: If an extraction snapshot is referenced but not present.
|
|
117
118
|
"""
|
|
118
|
-
if not
|
|
119
|
+
if not configuration.extraction_snapshot:
|
|
119
120
|
return None
|
|
120
|
-
extraction_reference =
|
|
121
|
-
|
|
121
|
+
extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
|
|
122
|
+
snapshot_dir = corpus.extraction_snapshot_dir(
|
|
122
123
|
extractor_id=extraction_reference.extractor_id,
|
|
123
|
-
|
|
124
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
124
125
|
)
|
|
125
|
-
if not
|
|
126
|
-
raise FileNotFoundError(f"Missing extraction
|
|
126
|
+
if not snapshot_dir.is_dir():
|
|
127
|
+
raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
|
|
127
128
|
return extraction_reference
|
|
128
129
|
|
|
129
130
|
|
|
@@ -133,12 +134,12 @@ def _load_text_from_item(
|
|
|
133
134
|
item_id: str,
|
|
134
135
|
relpath: str,
|
|
135
136
|
media_type: str,
|
|
136
|
-
extraction_reference: Optional[
|
|
137
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
137
138
|
) -> Optional[str]:
|
|
138
139
|
if extraction_reference:
|
|
139
140
|
extracted_text = corpus.read_extracted_text(
|
|
140
141
|
extractor_id=extraction_reference.extractor_id,
|
|
141
|
-
|
|
142
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
142
143
|
item_id=item_id,
|
|
143
144
|
)
|
|
144
145
|
if isinstance(extracted_text, str):
|
|
@@ -153,7 +154,7 @@ def _load_text_from_item(
|
|
|
153
154
|
|
|
154
155
|
|
|
155
156
|
def iter_text_payloads(
|
|
156
|
-
corpus: Corpus, *, extraction_reference: Optional[
|
|
157
|
+
corpus: Corpus, *, extraction_reference: Optional[ExtractionSnapshotReference]
|
|
157
158
|
) -> Iterator[Tuple[object, str]]:
|
|
158
159
|
"""
|
|
159
160
|
Yield catalog items and their text payloads.
|
|
@@ -161,7 +162,7 @@ def iter_text_payloads(
|
|
|
161
162
|
:param corpus: Corpus containing the items.
|
|
162
163
|
:type corpus: Corpus
|
|
163
164
|
:param extraction_reference: Optional extraction reference.
|
|
164
|
-
:type extraction_reference:
|
|
165
|
+
:type extraction_reference: ExtractionSnapshotReference or None
|
|
165
166
|
:yield: (catalog_item, text) pairs.
|
|
166
167
|
:rtype: Iterator[tuple[object, str]]
|
|
167
168
|
"""
|
|
@@ -185,21 +186,21 @@ def iter_text_payloads(
|
|
|
185
186
|
|
|
186
187
|
|
|
187
188
|
def collect_chunks(
|
|
188
|
-
corpus: Corpus, *,
|
|
189
|
+
corpus: Corpus, *, configuration: EmbeddingIndexConfiguration
|
|
189
190
|
) -> Tuple[List[TextChunk], int]:
|
|
190
191
|
"""
|
|
191
192
|
Collect chunks from text payloads in a corpus.
|
|
192
193
|
|
|
193
194
|
:param corpus: Corpus to chunk.
|
|
194
195
|
:type corpus: Corpus
|
|
195
|
-
:param
|
|
196
|
-
:type
|
|
196
|
+
:param configuration: Parsed embedding-index configuration.
|
|
197
|
+
:type configuration: EmbeddingIndexConfiguration
|
|
197
198
|
:return: (chunks, text_item_count)
|
|
198
199
|
:rtype: tuple[list[TextChunk], int]
|
|
199
200
|
"""
|
|
200
|
-
tokenizer =
|
|
201
|
-
chunker =
|
|
202
|
-
extraction_reference = resolve_extraction_reference(corpus,
|
|
201
|
+
tokenizer = configuration.tokenizer.build_tokenizer() if configuration.tokenizer else None
|
|
202
|
+
chunker = configuration.chunker.build_chunker(tokenizer=tokenizer)
|
|
203
|
+
extraction_reference = resolve_extraction_reference(corpus, configuration)
|
|
203
204
|
|
|
204
205
|
chunks: List[TextChunk] = []
|
|
205
206
|
next_chunk_id = 0
|
|
@@ -317,18 +318,20 @@ def cosine_similarity_scores(embeddings: np.ndarray, query_vector: np.ndarray) -
|
|
|
317
318
|
return embeddings @ query_vector
|
|
318
319
|
|
|
319
320
|
|
|
320
|
-
def
|
|
321
|
+
def artifact_paths_for_snapshot(*, snapshot_id: str, retriever_id: str) -> Dict[str, str]:
|
|
321
322
|
"""
|
|
322
|
-
Build deterministic artifact relative paths for an embedding index
|
|
323
|
+
Build deterministic artifact relative paths for an embedding index snapshot.
|
|
323
324
|
|
|
324
|
-
:param
|
|
325
|
-
:type
|
|
326
|
-
:param
|
|
327
|
-
:type
|
|
325
|
+
:param snapshot_id: Snapshot identifier.
|
|
326
|
+
:type snapshot_id: str
|
|
327
|
+
:param retriever_id: Retriever identifier.
|
|
328
|
+
:type retriever_id: str
|
|
328
329
|
:return: Mapping with keys embeddings and chunks.
|
|
329
330
|
:rtype: dict[str, str]
|
|
330
331
|
"""
|
|
331
|
-
prefix = f"{
|
|
332
|
-
embeddings_relpath = str(
|
|
333
|
-
|
|
332
|
+
prefix = f"{snapshot_id}.{retriever_id}"
|
|
333
|
+
embeddings_relpath = str(
|
|
334
|
+
Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.embeddings.npy"
|
|
335
|
+
)
|
|
336
|
+
chunks_relpath = str(Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{prefix}.chunks.jsonl")
|
|
334
337
|
return {"embeddings": embeddings_relpath, "chunks": chunks_relpath}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Embedding-index
|
|
2
|
+
Embedding-index retriever that reads the embedding matrix via memory mapping.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -10,15 +10,26 @@ from typing import Dict, List, Optional
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
12
|
from ..corpus import Corpus
|
|
13
|
-
from ..models import
|
|
14
|
-
|
|
13
|
+
from ..models import (
|
|
14
|
+
Evidence,
|
|
15
|
+
ExtractionSnapshotReference,
|
|
16
|
+
QueryBudget,
|
|
17
|
+
RetrievalResult,
|
|
18
|
+
RetrievalSnapshot,
|
|
19
|
+
)
|
|
20
|
+
from ..retrieval import (
|
|
21
|
+
apply_budget,
|
|
22
|
+
create_configuration_manifest,
|
|
23
|
+
create_snapshot_manifest,
|
|
24
|
+
hash_text,
|
|
25
|
+
)
|
|
15
26
|
from ..time import utc_now_iso
|
|
16
27
|
from .embedding_index_common import (
|
|
17
28
|
ChunkRecord,
|
|
18
|
-
|
|
29
|
+
EmbeddingIndexConfiguration,
|
|
19
30
|
_build_snippet,
|
|
20
31
|
_extract_span_text,
|
|
21
|
-
|
|
32
|
+
artifact_paths_for_snapshot,
|
|
22
33
|
chunks_to_records,
|
|
23
34
|
collect_chunks,
|
|
24
35
|
cosine_similarity_scores,
|
|
@@ -30,45 +41,52 @@ from .embedding_index_common import (
|
|
|
30
41
|
)
|
|
31
42
|
|
|
32
43
|
|
|
33
|
-
class
|
|
44
|
+
class EmbeddingIndexFileRetriever:
|
|
34
45
|
"""
|
|
35
|
-
Embedding retrieval
|
|
46
|
+
Embedding retrieval retriever using memory-mapped similarity scanning.
|
|
36
47
|
"""
|
|
37
48
|
|
|
38
|
-
|
|
49
|
+
retriever_id = "embedding-index-file"
|
|
39
50
|
|
|
40
|
-
def
|
|
41
|
-
self, corpus: Corpus, *,
|
|
42
|
-
) ->
|
|
51
|
+
def build_snapshot(
|
|
52
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
53
|
+
) -> RetrievalSnapshot:
|
|
43
54
|
"""
|
|
44
|
-
Build an embedding index
|
|
55
|
+
Build an embedding index snapshot by chunking text payloads and materializing embeddings.
|
|
45
56
|
|
|
46
57
|
:param corpus: Corpus to build against.
|
|
47
58
|
:type corpus: Corpus
|
|
48
|
-
:param
|
|
49
|
-
:type
|
|
50
|
-
:param
|
|
51
|
-
:type
|
|
52
|
-
:return:
|
|
53
|
-
:rtype: biblicus.models.
|
|
59
|
+
:param configuration_name: Human-readable configuration name.
|
|
60
|
+
:type configuration_name: str
|
|
61
|
+
:param configuration: Retriever-specific configuration values.
|
|
62
|
+
:type configuration: dict[str, object]
|
|
63
|
+
:return: Snapshot manifest describing the build.
|
|
64
|
+
:rtype: biblicus.models.RetrievalSnapshot
|
|
54
65
|
"""
|
|
55
|
-
|
|
56
|
-
chunks, text_items = collect_chunks(corpus,
|
|
66
|
+
parsed_config = EmbeddingIndexConfiguration.model_validate(configuration)
|
|
67
|
+
chunks, text_items = collect_chunks(corpus, configuration=parsed_config)
|
|
57
68
|
|
|
58
|
-
provider =
|
|
69
|
+
provider = parsed_config.embedding_provider.build_provider()
|
|
59
70
|
embeddings = provider.embed_texts([chunk.text for chunk in chunks]).astype(np.float32)
|
|
60
71
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
name=
|
|
64
|
-
|
|
72
|
+
configuration_manifest = create_configuration_manifest(
|
|
73
|
+
retriever_id=self.retriever_id,
|
|
74
|
+
name=configuration_name,
|
|
75
|
+
configuration=parsed_config.model_dump(),
|
|
76
|
+
)
|
|
77
|
+
snapshot = create_snapshot_manifest(
|
|
78
|
+
corpus,
|
|
79
|
+
configuration=configuration_manifest,
|
|
80
|
+
stats={},
|
|
81
|
+
snapshot_artifacts=[],
|
|
65
82
|
)
|
|
66
|
-
run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
|
|
67
83
|
|
|
68
|
-
paths =
|
|
84
|
+
paths = artifact_paths_for_snapshot(
|
|
85
|
+
snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
|
|
86
|
+
)
|
|
69
87
|
embeddings_path = corpus.root / paths["embeddings"]
|
|
70
88
|
chunks_path = corpus.root / paths["chunks"]
|
|
71
|
-
corpus.
|
|
89
|
+
corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
72
90
|
|
|
73
91
|
write_embeddings(embeddings_path, embeddings)
|
|
74
92
|
write_chunks_jsonl(chunks_path, chunks_to_records(chunks))
|
|
@@ -80,30 +98,33 @@ class EmbeddingIndexFileBackend:
|
|
|
80
98
|
"dimensions": (
|
|
81
99
|
int(embeddings.shape[1])
|
|
82
100
|
if embeddings.size
|
|
83
|
-
else
|
|
101
|
+
else parsed_config.embedding_provider.dimensions
|
|
84
102
|
),
|
|
85
103
|
}
|
|
86
|
-
|
|
87
|
-
update={
|
|
104
|
+
snapshot = snapshot.model_copy(
|
|
105
|
+
update={
|
|
106
|
+
"snapshot_artifacts": [paths["embeddings"], paths["chunks"]],
|
|
107
|
+
"stats": stats,
|
|
108
|
+
}
|
|
88
109
|
)
|
|
89
|
-
corpus.
|
|
90
|
-
return
|
|
110
|
+
corpus.write_snapshot(snapshot)
|
|
111
|
+
return snapshot
|
|
91
112
|
|
|
92
113
|
def query(
|
|
93
114
|
self,
|
|
94
115
|
corpus: Corpus,
|
|
95
116
|
*,
|
|
96
|
-
|
|
117
|
+
snapshot: RetrievalSnapshot,
|
|
97
118
|
query_text: str,
|
|
98
119
|
budget: QueryBudget,
|
|
99
120
|
) -> RetrievalResult:
|
|
100
121
|
"""
|
|
101
|
-
Query an embedding index
|
|
122
|
+
Query an embedding index snapshot and return ranked evidence.
|
|
102
123
|
|
|
103
|
-
:param corpus: Corpus associated with the
|
|
124
|
+
:param corpus: Corpus associated with the snapshot.
|
|
104
125
|
:type corpus: Corpus
|
|
105
|
-
:param
|
|
106
|
-
:type
|
|
126
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
127
|
+
:type snapshot: biblicus.models.RetrievalSnapshot
|
|
107
128
|
:param query_text: Query text to embed.
|
|
108
129
|
:type query_text: str
|
|
109
130
|
:param budget: Evidence selection budget.
|
|
@@ -111,14 +132,18 @@ class EmbeddingIndexFileBackend:
|
|
|
111
132
|
:return: Retrieval results containing evidence.
|
|
112
133
|
:rtype: biblicus.models.RetrievalResult
|
|
113
134
|
"""
|
|
114
|
-
|
|
115
|
-
|
|
135
|
+
parsed_config = EmbeddingIndexConfiguration.model_validate(
|
|
136
|
+
snapshot.configuration.configuration
|
|
137
|
+
)
|
|
138
|
+
extraction_reference = resolve_extraction_reference(corpus, parsed_config)
|
|
116
139
|
|
|
117
|
-
paths =
|
|
140
|
+
paths = artifact_paths_for_snapshot(
|
|
141
|
+
snapshot_id=snapshot.snapshot_id, retriever_id=self.retriever_id
|
|
142
|
+
)
|
|
118
143
|
embeddings_path = corpus.root / paths["embeddings"]
|
|
119
144
|
chunks_path = corpus.root / paths["chunks"]
|
|
120
145
|
if not embeddings_path.is_file() or not chunks_path.is_file():
|
|
121
|
-
raise FileNotFoundError("Embedding index artifacts are missing for this
|
|
146
|
+
raise FileNotFoundError("Embedding index artifacts are missing for this snapshot")
|
|
122
147
|
|
|
123
148
|
embeddings = read_embeddings(embeddings_path, mmap=True).astype(np.float32)
|
|
124
149
|
chunk_records = read_chunks_jsonl(chunks_path)
|
|
@@ -128,12 +153,12 @@ class EmbeddingIndexFileBackend:
|
|
|
128
153
|
"embeddings row count does not match chunk record count"
|
|
129
154
|
)
|
|
130
155
|
|
|
131
|
-
provider =
|
|
156
|
+
provider = parsed_config.embedding_provider.build_provider()
|
|
132
157
|
query_embedding = provider.embed_texts([query_text]).astype(np.float32)
|
|
133
158
|
if query_embedding.shape[0] != 1:
|
|
134
159
|
raise ValueError("Embedding provider returned an invalid query embedding shape")
|
|
135
160
|
|
|
136
|
-
batch_rows =
|
|
161
|
+
batch_rows = parsed_config.maximum_cache_total_items or 4096
|
|
137
162
|
candidates = _top_indices_batched(
|
|
138
163
|
embeddings=embeddings,
|
|
139
164
|
query_vector=query_embedding[0],
|
|
@@ -142,8 +167,8 @@ class EmbeddingIndexFileBackend:
|
|
|
142
167
|
)
|
|
143
168
|
evidence_items = _build_evidence(
|
|
144
169
|
corpus,
|
|
145
|
-
|
|
146
|
-
|
|
170
|
+
snapshot=snapshot,
|
|
171
|
+
configuration=parsed_config,
|
|
147
172
|
candidates=candidates,
|
|
148
173
|
embeddings=embeddings,
|
|
149
174
|
query_vector=query_embedding[0],
|
|
@@ -152,7 +177,11 @@ class EmbeddingIndexFileBackend:
|
|
|
152
177
|
)
|
|
153
178
|
ranked = [
|
|
154
179
|
item.model_copy(
|
|
155
|
-
update={
|
|
180
|
+
update={
|
|
181
|
+
"rank": index,
|
|
182
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
183
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
184
|
+
}
|
|
156
185
|
)
|
|
157
186
|
for index, item in enumerate(evidence_items, start=1)
|
|
158
187
|
]
|
|
@@ -160,9 +189,9 @@ class EmbeddingIndexFileBackend:
|
|
|
160
189
|
return RetrievalResult(
|
|
161
190
|
query_text=query_text,
|
|
162
191
|
budget=budget,
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
192
|
+
snapshot_id=snapshot.snapshot_id,
|
|
193
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
194
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
166
195
|
generated_at=utc_now_iso(),
|
|
167
196
|
evidence=evidence,
|
|
168
197
|
stats={"candidates": len(evidence_items), "returned": len(evidence)},
|
|
@@ -205,13 +234,13 @@ def _top_indices_batched(
|
|
|
205
234
|
def _build_evidence(
|
|
206
235
|
corpus: Corpus,
|
|
207
236
|
*,
|
|
208
|
-
|
|
209
|
-
|
|
237
|
+
snapshot: RetrievalSnapshot,
|
|
238
|
+
configuration: EmbeddingIndexConfiguration,
|
|
210
239
|
candidates: List[int],
|
|
211
240
|
embeddings: np.ndarray,
|
|
212
241
|
query_vector: np.ndarray,
|
|
213
242
|
chunk_records: List[ChunkRecord],
|
|
214
|
-
extraction_reference: Optional[
|
|
243
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
215
244
|
) -> List[Evidence]:
|
|
216
245
|
catalog = corpus.load_catalog()
|
|
217
246
|
evidence_items: List[Evidence] = []
|
|
@@ -226,7 +255,7 @@ def _build_evidence(
|
|
|
226
255
|
extraction_reference=extraction_reference,
|
|
227
256
|
)
|
|
228
257
|
span_text = _build_snippet(
|
|
229
|
-
text, (record.span_start, record.span_end),
|
|
258
|
+
text, (record.span_start, record.span_end), configuration.snippet_characters
|
|
230
259
|
)
|
|
231
260
|
if span_text is None:
|
|
232
261
|
span_text = _extract_span_text(text, (record.span_start, record.span_end))
|
|
@@ -242,10 +271,10 @@ def _build_evidence(
|
|
|
242
271
|
content_ref=None,
|
|
243
272
|
span_start=record.span_start,
|
|
244
273
|
span_end=record.span_end,
|
|
245
|
-
stage=
|
|
274
|
+
stage=EmbeddingIndexFileRetriever.retriever_id,
|
|
246
275
|
stage_scores=None,
|
|
247
|
-
|
|
248
|
-
|
|
276
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
277
|
+
snapshot_id=snapshot.snapshot_id,
|
|
249
278
|
metadata=getattr(catalog_item, "metadata", {}) or {},
|
|
250
279
|
hash=hash_text(span_text or ""),
|
|
251
280
|
)
|
|
@@ -259,7 +288,7 @@ def _load_text_for_evidence(
|
|
|
259
288
|
item_id: str,
|
|
260
289
|
relpath: str,
|
|
261
290
|
media_type: str,
|
|
262
|
-
extraction_reference: Optional[
|
|
291
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
263
292
|
) -> Optional[str]:
|
|
264
293
|
from .embedding_index_common import _load_text_from_item
|
|
265
294
|
|