biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
- biblicus-1.1.1.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Configuration loading utilities for Biblicus.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -103,34 +103,34 @@ def apply_dotted_overrides(
|
|
|
103
103
|
return updated
|
|
104
104
|
|
|
105
105
|
|
|
106
|
-
def
|
|
107
|
-
|
|
106
|
+
def load_configuration_view(
|
|
107
|
+
configuration_paths: Iterable[str],
|
|
108
108
|
*,
|
|
109
|
-
|
|
109
|
+
configuration_label: str = "Configuration",
|
|
110
110
|
mapping_error_message: Optional[str] = None,
|
|
111
111
|
) -> Dict[str, object]:
|
|
112
112
|
"""
|
|
113
|
-
Load a composed
|
|
113
|
+
Load a composed configuration view from one or more YAML files.
|
|
114
114
|
|
|
115
|
-
:param
|
|
116
|
-
:type
|
|
117
|
-
:param
|
|
118
|
-
:type
|
|
115
|
+
:param configuration_paths: Iterable of configuration file paths in precedence order.
|
|
116
|
+
:type configuration_paths: Iterable[str]
|
|
117
|
+
:param configuration_label: Label used in error messages (for example: "Configuration file").
|
|
118
|
+
:type configuration_label: str
|
|
119
119
|
:return: Composed configuration view.
|
|
120
120
|
:rtype: dict[str, object]
|
|
121
|
-
:raises FileNotFoundError: If any
|
|
122
|
-
:raises ValueError: If any
|
|
121
|
+
:raises FileNotFoundError: If any configuration file is missing.
|
|
122
|
+
:raises ValueError: If any configuration file is not a mapping/object.
|
|
123
123
|
"""
|
|
124
124
|
from biblicus._vendor.dotyaml import load_yaml_view
|
|
125
125
|
|
|
126
|
-
paths: List[str] = [str(path) for path in
|
|
126
|
+
paths: List[str] = [str(path) for path in configuration_paths]
|
|
127
127
|
for raw in paths:
|
|
128
128
|
candidate = Path(raw)
|
|
129
129
|
if not candidate.is_file():
|
|
130
|
-
raise FileNotFoundError(f"{
|
|
130
|
+
raise FileNotFoundError(f"{configuration_label} not found: {candidate}")
|
|
131
131
|
try:
|
|
132
132
|
view = load_yaml_view(paths)
|
|
133
133
|
except ValueError as exc:
|
|
134
|
-
message = mapping_error_message or f"{
|
|
134
|
+
message = mapping_error_message or f"{configuration_label} must be a mapping/object"
|
|
135
135
|
raise ValueError(message) from exc
|
|
136
136
|
return view
|
biblicus/constants.py
CHANGED
|
@@ -9,7 +9,7 @@ ANALYSIS_SCHEMA_VERSION = 1
|
|
|
9
9
|
CORPUS_DIR_NAME = ".biblicus"
|
|
10
10
|
DEFAULT_RAW_DIR = "raw"
|
|
11
11
|
SIDECAR_SUFFIX = ".biblicus.yml"
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
SNAPSHOTS_DIR_NAME = "snapshots"
|
|
13
|
+
EXTRACTION_SNAPSHOTS_DIR_NAME = "extraction"
|
|
14
14
|
ANALYSIS_RUNS_DIR_NAME = "analysis"
|
|
15
15
|
HOOK_LOGS_DIR_NAME = "hook_logs"
|
|
@@ -403,13 +403,15 @@ class ContextAssembler:
|
|
|
403
403
|
maximum_items_per_source = None
|
|
404
404
|
include_metadata = False
|
|
405
405
|
metadata_fields = None
|
|
406
|
-
|
|
406
|
+
retriever_id = None
|
|
407
407
|
corpus_root = None
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
408
|
+
snapshot_id = None
|
|
409
|
+
configuration_name = None
|
|
410
|
+
configuration = None
|
|
411
411
|
corpus_name = getattr(retriever_spec, "corpus", None)
|
|
412
412
|
join_with = "\n\n"
|
|
413
|
+
pipeline_config = None
|
|
414
|
+
query_config = None
|
|
413
415
|
|
|
414
416
|
if isinstance(config, dict):
|
|
415
417
|
split = config.get("split", split)
|
|
@@ -424,13 +426,26 @@ class ContextAssembler:
|
|
|
424
426
|
)
|
|
425
427
|
include_metadata = config.get("include_metadata", include_metadata)
|
|
426
428
|
metadata_fields = config.get("metadata_fields", metadata_fields)
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
429
|
+
retriever_id = config.get("retriever_id", retriever_id)
|
|
430
|
+
snapshot_id = config.get("snapshot_id", snapshot_id)
|
|
431
|
+
configuration_name = config.get("configuration_name", configuration_name)
|
|
432
|
+
if isinstance(config.get("configuration"), dict):
|
|
433
|
+
configuration = config.get("configuration")
|
|
431
434
|
corpus_name = config.get("corpus", corpus_name)
|
|
432
435
|
join_with = config.get("join_with", join_with)
|
|
433
|
-
|
|
436
|
+
if isinstance(configuration, dict):
|
|
437
|
+
pipeline_config = configuration.get("pipeline")
|
|
438
|
+
if not isinstance(pipeline_config, dict) and isinstance(config.get("pipeline"), dict):
|
|
439
|
+
pipeline_config = config.get("pipeline")
|
|
440
|
+
if isinstance(pipeline_config, dict):
|
|
441
|
+
if isinstance(pipeline_config.get("query"), dict):
|
|
442
|
+
query_config = pipeline_config.get("query") or {}
|
|
443
|
+
if configuration is None and isinstance(pipeline_config.get("index"), dict):
|
|
444
|
+
configuration = pipeline_config.get("index") or {}
|
|
445
|
+
if configuration is None and isinstance(config.get("index"), dict):
|
|
446
|
+
configuration = config.get("index") or {}
|
|
447
|
+
if configuration is None and isinstance(pipeline_config, dict):
|
|
448
|
+
configuration = pipeline_config.get("index") or {}
|
|
434
449
|
if corpus_name and corpus_name in self._corpus_registry:
|
|
435
450
|
corpus_spec = self._corpus_registry[corpus_name]
|
|
436
451
|
corpus_config = corpus_spec.config if hasattr(corpus_spec, "config") else {}
|
|
@@ -442,17 +457,32 @@ class ContextAssembler:
|
|
|
442
457
|
maximum_cache_total_characters = corpus_config.get(
|
|
443
458
|
"maximum_cache_total_characters", maximum_cache_total_characters
|
|
444
459
|
)
|
|
445
|
-
backend_id = corpus_config.get("backend_id", backend_id)
|
|
446
460
|
corpus_root = corpus_config.get(
|
|
447
461
|
"corpus_root",
|
|
448
462
|
corpus_config.get("root", corpus_root),
|
|
449
463
|
)
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
464
|
+
if query_config:
|
|
465
|
+
if "limit" in query_config:
|
|
466
|
+
limit = query_config.get("limit", limit)
|
|
467
|
+
if "offset" in query_config:
|
|
468
|
+
offset = query_config.get("offset", offset)
|
|
469
|
+
if "maximum_total_characters" in query_config:
|
|
470
|
+
maximum_total_characters = query_config.get(
|
|
471
|
+
"maximum_total_characters", maximum_total_characters
|
|
472
|
+
)
|
|
473
|
+
if "maximum_items_per_source" in query_config:
|
|
474
|
+
maximum_items_per_source = query_config.get(
|
|
475
|
+
"maximum_items_per_source",
|
|
476
|
+
maximum_items_per_source,
|
|
455
477
|
)
|
|
478
|
+
if "max_items_per_source" in query_config and maximum_items_per_source is None:
|
|
479
|
+
maximum_items_per_source = query_config.get("max_items_per_source")
|
|
480
|
+
if "include_metadata" in query_config:
|
|
481
|
+
include_metadata = query_config.get("include_metadata", include_metadata)
|
|
482
|
+
if "metadata_fields" in query_config:
|
|
483
|
+
metadata_fields = query_config.get("metadata_fields", metadata_fields)
|
|
484
|
+
if "join_with" in query_config:
|
|
485
|
+
join_with = query_config.get("join_with", join_with)
|
|
456
486
|
|
|
457
487
|
allocated_tokens = self._allocate_pack_budget(pack_budget, policy, weight)
|
|
458
488
|
if allocated_tokens is not None:
|
|
@@ -486,11 +516,11 @@ class ContextAssembler:
|
|
|
486
516
|
"maximum_items_per_source": maximum_items_per_source,
|
|
487
517
|
"include_metadata": include_metadata,
|
|
488
518
|
"metadata_fields": metadata_fields,
|
|
489
|
-
"
|
|
519
|
+
"retriever_id": retriever_id,
|
|
490
520
|
"corpus_root": corpus_root,
|
|
491
|
-
"
|
|
492
|
-
"
|
|
493
|
-
"
|
|
521
|
+
"snapshot_id": snapshot_id,
|
|
522
|
+
"configuration_name": configuration_name,
|
|
523
|
+
"configuration": configuration,
|
|
494
524
|
},
|
|
495
525
|
)
|
|
496
526
|
context_pack = self._retrieve_with_expansion(
|
|
@@ -6,7 +6,6 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, Optional
|
|
8
8
|
|
|
9
|
-
from biblicus.backends import get_backend
|
|
10
9
|
from biblicus.context import (
|
|
11
10
|
ContextPack,
|
|
12
11
|
ContextPackPolicy,
|
|
@@ -15,67 +14,72 @@ from biblicus.context import (
|
|
|
15
14
|
fit_context_pack_to_token_budget,
|
|
16
15
|
)
|
|
17
16
|
from biblicus.corpus import Corpus
|
|
18
|
-
from biblicus.models import QueryBudget,
|
|
17
|
+
from biblicus.models import QueryBudget, RetrievalSnapshot
|
|
18
|
+
from biblicus.retrievers import get_retriever
|
|
19
19
|
|
|
20
20
|
from .models import ContextRetrieverRequest
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
def
|
|
23
|
+
def _resolve_snapshot(
|
|
24
24
|
corpus: Corpus,
|
|
25
25
|
*,
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
) ->
|
|
31
|
-
if
|
|
32
|
-
return corpus.
|
|
26
|
+
retriever_id: str,
|
|
27
|
+
snapshot_id: Optional[str],
|
|
28
|
+
configuration_name: Optional[str],
|
|
29
|
+
configuration: Optional[dict[str, Any]],
|
|
30
|
+
) -> RetrievalSnapshot:
|
|
31
|
+
if snapshot_id:
|
|
32
|
+
return corpus.load_snapshot(snapshot_id)
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
if
|
|
36
|
-
candidate = corpus.
|
|
37
|
-
if candidate.
|
|
34
|
+
latest_snapshot_id = corpus.latest_snapshot_id
|
|
35
|
+
if latest_snapshot_id:
|
|
36
|
+
candidate = corpus.load_snapshot(latest_snapshot_id)
|
|
37
|
+
if candidate.configuration.retriever_id == retriever_id:
|
|
38
38
|
return candidate
|
|
39
39
|
|
|
40
|
-
if
|
|
40
|
+
if configuration is None:
|
|
41
41
|
raise ValueError(
|
|
42
|
-
"No retrieval
|
|
43
|
-
"Provide
|
|
42
|
+
"No retrieval snapshot available for the requested retriever. "
|
|
43
|
+
"Provide snapshot_id or configuration to build one."
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
|
|
47
|
-
resolved_name =
|
|
48
|
-
return
|
|
46
|
+
retriever = get_retriever(retriever_id)
|
|
47
|
+
resolved_name = configuration_name or f"Context pack ({retriever_id})"
|
|
48
|
+
return retriever.build_snapshot(
|
|
49
|
+
corpus,
|
|
50
|
+
configuration_name=resolved_name,
|
|
51
|
+
configuration=configuration,
|
|
52
|
+
)
|
|
49
53
|
|
|
50
54
|
|
|
51
55
|
def retrieve_context_pack(
|
|
52
56
|
*,
|
|
53
57
|
request: ContextRetrieverRequest,
|
|
54
58
|
corpus: Corpus,
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
+
retriever_id: str,
|
|
60
|
+
snapshot_id: Optional[str] = None,
|
|
61
|
+
configuration_name: Optional[str] = None,
|
|
62
|
+
configuration: Optional[dict[str, Any]] = None,
|
|
59
63
|
join_with: str = "\n\n",
|
|
60
64
|
max_items_per_source: Optional[int] = None,
|
|
61
65
|
include_metadata: bool = False,
|
|
62
66
|
metadata_fields: Optional[list[str]] = None,
|
|
63
67
|
) -> ContextPack:
|
|
64
68
|
"""
|
|
65
|
-
Retrieve a context pack using a Biblicus
|
|
69
|
+
Retrieve a context pack using a Biblicus retriever.
|
|
66
70
|
|
|
67
71
|
:param request: Context retrieval request.
|
|
68
72
|
:type request: biblicus.context_engine.ContextRetrieverRequest
|
|
69
73
|
:param corpus: Corpus instance to query.
|
|
70
74
|
:type corpus: biblicus.corpus.Corpus
|
|
71
|
-
:param
|
|
72
|
-
:type
|
|
73
|
-
:param
|
|
74
|
-
:type
|
|
75
|
-
:param
|
|
76
|
-
:type
|
|
77
|
-
:param
|
|
78
|
-
:type
|
|
75
|
+
:param retriever_id: Retrieval retriever identifier.
|
|
76
|
+
:type retriever_id: str
|
|
77
|
+
:param snapshot_id: Optional retrieval snapshot identifier.
|
|
78
|
+
:type snapshot_id: str or None
|
|
79
|
+
:param configuration_name: Optional configuration name for snapshot builds.
|
|
80
|
+
:type configuration_name: str or None
|
|
81
|
+
:param configuration: Optional retriever configuration.
|
|
82
|
+
:type configuration: dict[str, Any] or None
|
|
79
83
|
:param join_with: Separator between context pack blocks.
|
|
80
84
|
:type join_with: str
|
|
81
85
|
:param max_items_per_source: Optional cap per source.
|
|
@@ -86,14 +90,14 @@ def retrieve_context_pack(
|
|
|
86
90
|
:type metadata_fields: list[str] or None
|
|
87
91
|
:return: Context pack derived from retrieval results.
|
|
88
92
|
:rtype: biblicus.context.ContextPack
|
|
89
|
-
:raises ValueError: If no compatible retrieval
|
|
93
|
+
:raises ValueError: If no compatible retrieval snapshot is available.
|
|
90
94
|
"""
|
|
91
|
-
|
|
95
|
+
snapshot = _resolve_snapshot(
|
|
92
96
|
corpus,
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
+
retriever_id=retriever_id,
|
|
98
|
+
snapshot_id=snapshot_id,
|
|
99
|
+
configuration_name=configuration_name,
|
|
100
|
+
configuration=configuration,
|
|
97
101
|
)
|
|
98
102
|
|
|
99
103
|
maximum_total_characters = request.maximum_total_characters
|
|
@@ -106,10 +110,10 @@ def retrieve_context_pack(
|
|
|
106
110
|
maximum_total_characters=maximum_total_characters,
|
|
107
111
|
max_items_per_source=max_items_per_source,
|
|
108
112
|
)
|
|
109
|
-
|
|
110
|
-
result =
|
|
113
|
+
retriever = get_retriever(retriever_id)
|
|
114
|
+
result = retriever.query(
|
|
111
115
|
corpus,
|
|
112
|
-
|
|
116
|
+
snapshot=snapshot,
|
|
113
117
|
query_text=request.query,
|
|
114
118
|
budget=budget,
|
|
115
119
|
)
|