biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
SQLite full-text search version five
|
|
2
|
+
SQLite full-text search version five retriever for Biblicus.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -10,24 +10,29 @@ from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
12
12
|
|
|
13
|
-
from ..constants import CORPUS_DIR_NAME,
|
|
13
|
+
from ..constants import CORPUS_DIR_NAME, SNAPSHOTS_DIR_NAME
|
|
14
14
|
from ..corpus import Corpus
|
|
15
15
|
from ..frontmatter import parse_front_matter
|
|
16
16
|
from ..models import (
|
|
17
17
|
Evidence,
|
|
18
|
-
|
|
18
|
+
ExtractionSnapshotReference,
|
|
19
19
|
QueryBudget,
|
|
20
20
|
RetrievalResult,
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
RetrievalSnapshot,
|
|
22
|
+
parse_extraction_snapshot_reference,
|
|
23
|
+
)
|
|
24
|
+
from ..retrieval import (
|
|
25
|
+
apply_budget,
|
|
26
|
+
create_configuration_manifest,
|
|
27
|
+
create_snapshot_manifest,
|
|
28
|
+
hash_text,
|
|
23
29
|
)
|
|
24
|
-
from ..retrieval import apply_budget, create_recipe_manifest, create_run_manifest, hash_text
|
|
25
30
|
from ..time import utc_now_iso
|
|
26
31
|
|
|
27
32
|
|
|
28
|
-
class
|
|
33
|
+
class SqliteFullTextSearchConfiguration(BaseModel):
|
|
29
34
|
"""
|
|
30
|
-
Configuration for the SQLite full-text search
|
|
35
|
+
Configuration for the SQLite full-text search retriever.
|
|
31
36
|
|
|
32
37
|
:ivar chunk_size: Maximum characters per chunk.
|
|
33
38
|
:vartype chunk_size: int
|
|
@@ -57,8 +62,8 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
57
62
|
:vartype rerank_model: str or None
|
|
58
63
|
:ivar rerank_top_k: Number of candidates to rerank.
|
|
59
64
|
:vartype rerank_top_k: int
|
|
60
|
-
:ivar
|
|
61
|
-
:vartype
|
|
65
|
+
:ivar extraction_snapshot: Optional extraction snapshot reference in the form extractor_id:snapshot_id.
|
|
66
|
+
:vartype extraction_snapshot: str or None
|
|
62
67
|
"""
|
|
63
68
|
|
|
64
69
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -77,7 +82,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
77
82
|
rerank_enabled: bool = False
|
|
78
83
|
rerank_model: Optional[str] = None
|
|
79
84
|
rerank_top_k: int = Field(default=10, ge=1)
|
|
80
|
-
|
|
85
|
+
extraction_snapshot: Optional[str] = None
|
|
81
86
|
|
|
82
87
|
@field_validator("stop_words")
|
|
83
88
|
@classmethod
|
|
@@ -97,7 +102,7 @@ class SqliteFullTextSearchRecipeConfig(BaseModel):
|
|
|
97
102
|
return value
|
|
98
103
|
|
|
99
104
|
@model_validator(mode="after")
|
|
100
|
-
def _validate_ngram_range(self) -> "
|
|
105
|
+
def _validate_ngram_range(self) -> "SqliteFullTextSearchConfiguration":
|
|
101
106
|
if self.ngram_min > self.ngram_max:
|
|
102
107
|
raise ValueError("Invalid ngram range: ngram_min must be <= ngram_max")
|
|
103
108
|
if self.rerank_enabled and not self.rerank_model:
|
|
@@ -142,69 +147,76 @@ _ENGLISH_STOP_WORDS: Set[str] = {
|
|
|
142
147
|
}
|
|
143
148
|
|
|
144
149
|
|
|
145
|
-
class
|
|
150
|
+
class SqliteFullTextSearchRetriever:
|
|
146
151
|
"""
|
|
147
|
-
SQLite full-text search version five
|
|
152
|
+
SQLite full-text search version five retriever for practical local retrieval.
|
|
148
153
|
|
|
149
|
-
:ivar
|
|
150
|
-
:vartype
|
|
154
|
+
:ivar retriever_id: Retriever identifier.
|
|
155
|
+
:vartype retriever_id: str
|
|
151
156
|
"""
|
|
152
157
|
|
|
153
|
-
|
|
158
|
+
retriever_id = "sqlite-full-text-search"
|
|
154
159
|
|
|
155
|
-
def
|
|
156
|
-
self, corpus: Corpus, *,
|
|
157
|
-
) ->
|
|
160
|
+
def build_snapshot(
|
|
161
|
+
self, corpus: Corpus, *, configuration_name: str, configuration: Dict[str, object]
|
|
162
|
+
) -> RetrievalSnapshot:
|
|
158
163
|
"""
|
|
159
164
|
Build a full-text search version five index for the corpus.
|
|
160
165
|
|
|
161
166
|
:param corpus: Corpus to build against.
|
|
162
167
|
:type corpus: Corpus
|
|
163
|
-
:param
|
|
164
|
-
:type
|
|
165
|
-
:param
|
|
166
|
-
:type
|
|
167
|
-
:return:
|
|
168
|
-
:rtype:
|
|
168
|
+
:param configuration_name: Human-readable configuration name.
|
|
169
|
+
:type configuration_name: str
|
|
170
|
+
:param configuration: Retriever-specific configuration values.
|
|
171
|
+
:type configuration: dict[str, object]
|
|
172
|
+
:return: Snapshot manifest describing the build.
|
|
173
|
+
:rtype: RetrievalSnapshot
|
|
169
174
|
"""
|
|
170
|
-
|
|
175
|
+
parsed_config = SqliteFullTextSearchConfiguration.model_validate(configuration)
|
|
171
176
|
catalog = corpus.load_catalog()
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
name=
|
|
175
|
-
|
|
177
|
+
configuration_manifest = create_configuration_manifest(
|
|
178
|
+
retriever_id=self.retriever_id,
|
|
179
|
+
name=configuration_name,
|
|
180
|
+
configuration=parsed_config.model_dump(),
|
|
181
|
+
)
|
|
182
|
+
snapshot = create_snapshot_manifest(
|
|
183
|
+
corpus,
|
|
184
|
+
configuration=configuration_manifest,
|
|
185
|
+
stats={},
|
|
186
|
+
snapshot_artifacts=[],
|
|
187
|
+
)
|
|
188
|
+
db_relpath = str(
|
|
189
|
+
Path(CORPUS_DIR_NAME) / SNAPSHOTS_DIR_NAME / f"{snapshot.snapshot_id}.sqlite"
|
|
176
190
|
)
|
|
177
|
-
run = create_run_manifest(corpus, recipe=recipe, stats={}, artifact_paths=[])
|
|
178
|
-
db_relpath = str(Path(CORPUS_DIR_NAME) / RUNS_DIR_NAME / f"{run.run_id}.sqlite")
|
|
179
191
|
db_path = corpus.root / db_relpath
|
|
180
|
-
corpus.
|
|
181
|
-
extraction_reference = _resolve_extraction_reference(corpus,
|
|
192
|
+
corpus.snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
193
|
+
extraction_reference = _resolve_extraction_reference(corpus, parsed_config)
|
|
182
194
|
stats = _build_full_text_search_index(
|
|
183
195
|
db_path=db_path,
|
|
184
196
|
corpus=corpus,
|
|
185
197
|
items=catalog.items.values(),
|
|
186
|
-
|
|
198
|
+
configuration=parsed_config,
|
|
187
199
|
extraction_reference=extraction_reference,
|
|
188
200
|
)
|
|
189
|
-
|
|
190
|
-
corpus.
|
|
191
|
-
return
|
|
201
|
+
snapshot = snapshot.model_copy(update={"snapshot_artifacts": [db_relpath], "stats": stats})
|
|
202
|
+
corpus.write_snapshot(snapshot)
|
|
203
|
+
return snapshot
|
|
192
204
|
|
|
193
205
|
def query(
|
|
194
206
|
self,
|
|
195
207
|
corpus: Corpus,
|
|
196
208
|
*,
|
|
197
|
-
|
|
209
|
+
snapshot: RetrievalSnapshot,
|
|
198
210
|
query_text: str,
|
|
199
211
|
budget: QueryBudget,
|
|
200
212
|
) -> RetrievalResult:
|
|
201
213
|
"""
|
|
202
214
|
Query the SQLite full-text search index for evidence.
|
|
203
215
|
|
|
204
|
-
:param corpus: Corpus associated with the
|
|
216
|
+
:param corpus: Corpus associated with the snapshot.
|
|
205
217
|
:type corpus: Corpus
|
|
206
|
-
:param
|
|
207
|
-
:type
|
|
218
|
+
:param snapshot: Snapshot manifest to use for querying.
|
|
219
|
+
:type snapshot: RetrievalSnapshot
|
|
208
220
|
:param query_text: Query text to execute.
|
|
209
221
|
:type query_text: str
|
|
210
222
|
:param budget: Evidence selection budget.
|
|
@@ -212,46 +224,48 @@ class SqliteFullTextSearchBackend:
|
|
|
212
224
|
:return: Retrieval results containing evidence.
|
|
213
225
|
:rtype: RetrievalResult
|
|
214
226
|
"""
|
|
215
|
-
|
|
227
|
+
parsed_config = SqliteFullTextSearchConfiguration.model_validate(
|
|
228
|
+
snapshot.configuration.configuration
|
|
229
|
+
)
|
|
216
230
|
query_tokens = _tokenize_query(query_text)
|
|
217
|
-
stop_words = _resolve_stop_words(
|
|
231
|
+
stop_words = _resolve_stop_words(parsed_config.stop_words)
|
|
218
232
|
filtered_tokens = _apply_stop_words(query_tokens, stop_words)
|
|
219
233
|
if not filtered_tokens:
|
|
220
234
|
return RetrievalResult(
|
|
221
235
|
query_text=query_text,
|
|
222
236
|
budget=budget,
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
237
|
+
snapshot_id=snapshot.snapshot_id,
|
|
238
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
239
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
226
240
|
generated_at=utc_now_iso(),
|
|
227
241
|
evidence=[],
|
|
228
242
|
stats={"candidates": 0, "returned": 0},
|
|
229
243
|
)
|
|
230
|
-
db_path =
|
|
244
|
+
db_path = _resolve_snapshot_db_path(corpus, snapshot)
|
|
231
245
|
candidates = _query_full_text_search_index(
|
|
232
246
|
db_path=db_path,
|
|
233
247
|
query_text=" ".join(filtered_tokens),
|
|
234
248
|
limit=_candidate_limit(budget.max_total_items + budget.offset),
|
|
235
|
-
snippet_characters=
|
|
249
|
+
snippet_characters=parsed_config.snippet_characters,
|
|
236
250
|
)
|
|
237
251
|
sorted_candidates = _rank_candidates(candidates)
|
|
238
252
|
evidence = _apply_rerank_if_enabled(
|
|
239
253
|
sorted_candidates,
|
|
240
254
|
query_tokens=filtered_tokens,
|
|
241
|
-
|
|
255
|
+
snapshot=snapshot,
|
|
242
256
|
budget=budget,
|
|
243
|
-
rerank_enabled=
|
|
244
|
-
rerank_top_k=
|
|
257
|
+
rerank_enabled=parsed_config.rerank_enabled,
|
|
258
|
+
rerank_top_k=parsed_config.rerank_top_k,
|
|
245
259
|
)
|
|
246
260
|
stats: Dict[str, object] = {"candidates": len(sorted_candidates), "returned": len(evidence)}
|
|
247
|
-
if
|
|
248
|
-
stats["reranked_candidates"] = min(len(sorted_candidates),
|
|
261
|
+
if parsed_config.rerank_enabled:
|
|
262
|
+
stats["reranked_candidates"] = min(len(sorted_candidates), parsed_config.rerank_top_k)
|
|
249
263
|
return RetrievalResult(
|
|
250
264
|
query_text=query_text,
|
|
251
265
|
budget=budget,
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
266
|
+
snapshot_id=snapshot.snapshot_id,
|
|
267
|
+
configuration_id=snapshot.configuration.configuration_id,
|
|
268
|
+
retriever_id=snapshot.configuration.retriever_id,
|
|
255
269
|
generated_at=utc_now_iso(),
|
|
256
270
|
evidence=evidence,
|
|
257
271
|
stats=stats,
|
|
@@ -264,7 +278,7 @@ def _candidate_limit(max_total_items: int) -> int:
|
|
|
264
278
|
|
|
265
279
|
:param max_total_items: Requested evidence count.
|
|
266
280
|
:type max_total_items: int
|
|
267
|
-
:return: Candidate limit for
|
|
281
|
+
:return: Candidate limit for retriever search.
|
|
268
282
|
:rtype: int
|
|
269
283
|
"""
|
|
270
284
|
return max_total_items * 5
|
|
@@ -347,7 +361,7 @@ def _apply_rerank_if_enabled(
|
|
|
347
361
|
candidates: List[Evidence],
|
|
348
362
|
*,
|
|
349
363
|
query_tokens: List[str],
|
|
350
|
-
|
|
364
|
+
snapshot: RetrievalSnapshot,
|
|
351
365
|
budget: QueryBudget,
|
|
352
366
|
rerank_enabled: bool,
|
|
353
367
|
rerank_top_k: int,
|
|
@@ -359,8 +373,8 @@ def _apply_rerank_if_enabled(
|
|
|
359
373
|
:type candidates: list[Evidence]
|
|
360
374
|
:param query_tokens: Query tokens used for reranking.
|
|
361
375
|
:type query_tokens: list[str]
|
|
362
|
-
:param
|
|
363
|
-
:type
|
|
376
|
+
:param snapshot: Retrieval snapshot to annotate evidence with.
|
|
377
|
+
:type snapshot: RetrievalSnapshot
|
|
364
378
|
:param budget: Evidence selection budget.
|
|
365
379
|
:type budget: QueryBudget
|
|
366
380
|
:param rerank_enabled: Whether reranking is enabled.
|
|
@@ -375,8 +389,8 @@ def _apply_rerank_if_enabled(
|
|
|
375
389
|
evidence_item.model_copy(
|
|
376
390
|
update={
|
|
377
391
|
"rank": index,
|
|
378
|
-
"
|
|
379
|
-
"
|
|
392
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
393
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
380
394
|
}
|
|
381
395
|
)
|
|
382
396
|
for index, evidence_item in enumerate(candidates, start=1)
|
|
@@ -402,8 +416,8 @@ def _apply_rerank_if_enabled(
|
|
|
402
416
|
evidence_item.model_copy(
|
|
403
417
|
update={
|
|
404
418
|
"rank": index,
|
|
405
|
-
"
|
|
406
|
-
"
|
|
419
|
+
"configuration_id": snapshot.configuration.configuration_id,
|
|
420
|
+
"snapshot_id": snapshot.snapshot_id,
|
|
407
421
|
}
|
|
408
422
|
)
|
|
409
423
|
for index, evidence_item in enumerate(reranked_sorted, start=1)
|
|
@@ -411,21 +425,21 @@ def _apply_rerank_if_enabled(
|
|
|
411
425
|
return apply_budget(ranked, budget)
|
|
412
426
|
|
|
413
427
|
|
|
414
|
-
def
|
|
428
|
+
def _resolve_snapshot_db_path(corpus: Corpus, snapshot: RetrievalSnapshot) -> Path:
|
|
415
429
|
"""
|
|
416
|
-
Resolve the SQLite index path for a retrieval
|
|
430
|
+
Resolve the SQLite index path for a retrieval snapshot.
|
|
417
431
|
|
|
418
|
-
:param corpus: Corpus containing
|
|
432
|
+
:param corpus: Corpus containing snapshot artifacts.
|
|
419
433
|
:type corpus: Corpus
|
|
420
|
-
:param
|
|
421
|
-
:type
|
|
434
|
+
:param snapshot: Retrieval snapshot manifest.
|
|
435
|
+
:type snapshot: RetrievalSnapshot
|
|
422
436
|
:return: Path to the SQLite index file.
|
|
423
437
|
:rtype: Path
|
|
424
|
-
:raises FileNotFoundError: If the
|
|
438
|
+
:raises FileNotFoundError: If the snapshot does not have artifact paths.
|
|
425
439
|
"""
|
|
426
|
-
if not
|
|
427
|
-
raise FileNotFoundError("
|
|
428
|
-
return corpus.root /
|
|
440
|
+
if not snapshot.snapshot_artifacts:
|
|
441
|
+
raise FileNotFoundError("Snapshot has no artifact paths to query")
|
|
442
|
+
return corpus.root / snapshot.snapshot_artifacts[0]
|
|
429
443
|
|
|
430
444
|
|
|
431
445
|
def _ensure_full_text_search_version_five(conn: sqlite3.Connection) -> None:
|
|
@@ -480,8 +494,8 @@ def _build_full_text_search_index(
|
|
|
480
494
|
db_path: Path,
|
|
481
495
|
corpus: Corpus,
|
|
482
496
|
items: Iterable[object],
|
|
483
|
-
|
|
484
|
-
extraction_reference: Optional[
|
|
497
|
+
configuration: SqliteFullTextSearchConfiguration,
|
|
498
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
485
499
|
) -> Dict[str, int]:
|
|
486
500
|
"""
|
|
487
501
|
Build a full-text search index from corpus items.
|
|
@@ -492,8 +506,8 @@ def _build_full_text_search_index(
|
|
|
492
506
|
:type corpus: Corpus
|
|
493
507
|
:param items: Catalog items to index.
|
|
494
508
|
:type items: Iterable[object]
|
|
495
|
-
:param
|
|
496
|
-
:type
|
|
509
|
+
:param configuration: Chunking and snippet configuration.
|
|
510
|
+
:type configuration: SqliteFullTextSearchConfiguration
|
|
497
511
|
:return: Index statistics.
|
|
498
512
|
:rtype: dict[str, int]
|
|
499
513
|
"""
|
|
@@ -523,8 +537,8 @@ def _build_full_text_search_index(
|
|
|
523
537
|
title = getattr(catalog_item, "title", None)
|
|
524
538
|
for start_offset, end_offset, chunk in _iter_chunks(
|
|
525
539
|
item_text,
|
|
526
|
-
chunk_size=
|
|
527
|
-
chunk_overlap=
|
|
540
|
+
chunk_size=configuration.chunk_size,
|
|
541
|
+
chunk_overlap=configuration.chunk_overlap,
|
|
528
542
|
):
|
|
529
543
|
connection.execute(
|
|
530
544
|
"""
|
|
@@ -568,7 +582,7 @@ def _load_text_from_item(
|
|
|
568
582
|
item_id: str,
|
|
569
583
|
relpath: str,
|
|
570
584
|
media_type: str,
|
|
571
|
-
extraction_reference: Optional[
|
|
585
|
+
extraction_reference: Optional[ExtractionSnapshotReference],
|
|
572
586
|
) -> Optional[str]:
|
|
573
587
|
"""
|
|
574
588
|
Load text content from a catalog item.
|
|
@@ -581,15 +595,15 @@ def _load_text_from_item(
|
|
|
581
595
|
:type relpath: str
|
|
582
596
|
:param media_type: Media type for the content.
|
|
583
597
|
:type media_type: str
|
|
584
|
-
:param extraction_reference: Optional extraction
|
|
585
|
-
:type extraction_reference:
|
|
598
|
+
:param extraction_reference: Optional extraction snapshot reference.
|
|
599
|
+
:type extraction_reference: ExtractionSnapshotReference or None
|
|
586
600
|
:return: Text payload or None if not text.
|
|
587
601
|
:rtype: str or None
|
|
588
602
|
"""
|
|
589
603
|
if extraction_reference:
|
|
590
604
|
extracted_text = corpus.read_extracted_text(
|
|
591
605
|
extractor_id=extraction_reference.extractor_id,
|
|
592
|
-
|
|
606
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
593
607
|
item_id=item_id,
|
|
594
608
|
)
|
|
595
609
|
if isinstance(extracted_text, str) and extracted_text.strip():
|
|
@@ -608,28 +622,28 @@ def _load_text_from_item(
|
|
|
608
622
|
|
|
609
623
|
def _resolve_extraction_reference(
|
|
610
624
|
corpus: Corpus,
|
|
611
|
-
|
|
612
|
-
) -> Optional[
|
|
625
|
+
configuration: SqliteFullTextSearchConfiguration,
|
|
626
|
+
) -> Optional[ExtractionSnapshotReference]:
|
|
613
627
|
"""
|
|
614
|
-
Resolve an extraction
|
|
628
|
+
Resolve an extraction snapshot reference from a configuration.
|
|
615
629
|
|
|
616
|
-
:param corpus: Corpus associated with the
|
|
630
|
+
:param corpus: Corpus associated with the configuration.
|
|
617
631
|
:type corpus: Corpus
|
|
618
|
-
:param
|
|
619
|
-
:type
|
|
632
|
+
:param configuration: Parsed retriever configuration.
|
|
633
|
+
:type configuration: SqliteFullTextSearchConfiguration
|
|
620
634
|
:return: Parsed extraction reference or None.
|
|
621
|
-
:rtype:
|
|
622
|
-
:raises FileNotFoundError: If an extraction
|
|
635
|
+
:rtype: ExtractionSnapshotReference or None
|
|
636
|
+
:raises FileNotFoundError: If an extraction snapshot is referenced but not present.
|
|
623
637
|
"""
|
|
624
|
-
if not
|
|
638
|
+
if not configuration.extraction_snapshot:
|
|
625
639
|
return None
|
|
626
|
-
extraction_reference =
|
|
627
|
-
|
|
640
|
+
extraction_reference = parse_extraction_snapshot_reference(configuration.extraction_snapshot)
|
|
641
|
+
snapshot_dir = corpus.extraction_snapshot_dir(
|
|
628
642
|
extractor_id=extraction_reference.extractor_id,
|
|
629
|
-
|
|
643
|
+
snapshot_id=extraction_reference.snapshot_id,
|
|
630
644
|
)
|
|
631
|
-
if not
|
|
632
|
-
raise FileNotFoundError(f"Missing extraction
|
|
645
|
+
if not snapshot_dir.is_dir():
|
|
646
|
+
raise FileNotFoundError(f"Missing extraction snapshot: {extraction_reference.as_string()}")
|
|
633
647
|
return extraction_reference
|
|
634
648
|
|
|
635
649
|
|
|
@@ -723,8 +737,8 @@ def _query_full_text_search_index(
|
|
|
723
737
|
span_start=int(start_offset) if start_offset is not None else None,
|
|
724
738
|
span_end=int(end_offset) if end_offset is not None else None,
|
|
725
739
|
stage="full-text-search",
|
|
726
|
-
|
|
727
|
-
|
|
740
|
+
configuration_id="",
|
|
741
|
+
snapshot_id="",
|
|
728
742
|
hash=hash_text(snippet_text),
|
|
729
743
|
)
|
|
730
744
|
)
|