biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
- biblicus-1.1.1.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -40,14 +40,14 @@ class DoclingSmolExtractorConfig(BaseModel):
|
|
|
40
40
|
|
|
41
41
|
:ivar output_format: Output format for extracted content (markdown, text, or html).
|
|
42
42
|
:vartype output_format: str
|
|
43
|
-
:ivar
|
|
44
|
-
:vartype
|
|
43
|
+
:ivar retriever: Inference retriever (mlx or transformers).
|
|
44
|
+
:vartype retriever: str
|
|
45
45
|
"""
|
|
46
46
|
|
|
47
|
-
model_config = ConfigDict(extra="forbid")
|
|
47
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
48
48
|
|
|
49
49
|
output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
|
|
50
|
-
|
|
50
|
+
retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
class DoclingSmolExtractor(TextExtractor):
|
|
@@ -71,7 +71,7 @@ class DoclingSmolExtractor(TextExtractor):
|
|
|
71
71
|
:type config: dict[str, Any]
|
|
72
72
|
:return: Parsed config.
|
|
73
73
|
:rtype: DoclingSmolExtractorConfig
|
|
74
|
-
:raises
|
|
74
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
75
75
|
"""
|
|
76
76
|
parsed = DoclingSmolExtractorConfig.model_validate(config)
|
|
77
77
|
|
|
@@ -82,19 +82,19 @@ class DoclingSmolExtractor(TextExtractor):
|
|
|
82
82
|
vlm_model_specs,
|
|
83
83
|
)
|
|
84
84
|
except ImportError as import_error:
|
|
85
|
-
raise
|
|
85
|
+
raise ExtractionSnapshotFatalError(
|
|
86
86
|
"DoclingSmol extractor requires an optional dependency. "
|
|
87
87
|
'Install it with pip install "biblicus[docling]".'
|
|
88
88
|
) from import_error
|
|
89
89
|
|
|
90
|
-
if parsed.
|
|
90
|
+
if parsed.retriever == "mlx":
|
|
91
91
|
try:
|
|
92
92
|
from docling.pipeline_options import vlm_model_specs
|
|
93
93
|
|
|
94
94
|
_ = vlm_model_specs.SMOLDOCLING_MLX
|
|
95
95
|
except (ImportError, AttributeError) as exc:
|
|
96
|
-
raise
|
|
97
|
-
"DoclingSmol extractor with MLX
|
|
96
|
+
raise ExtractionSnapshotFatalError(
|
|
97
|
+
"DoclingSmol extractor with MLX retriever requires MLX support. "
|
|
98
98
|
'Install it with pip install "biblicus[docling-mlx]".'
|
|
99
99
|
) from exc
|
|
100
100
|
|
|
@@ -167,7 +167,7 @@ class DoclingSmolExtractor(TextExtractor):
|
|
|
167
167
|
from docling.format_options import InputFormat, PdfFormatOption
|
|
168
168
|
from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
|
|
169
169
|
|
|
170
|
-
if config.
|
|
170
|
+
if config.retriever == "mlx":
|
|
171
171
|
vlm_options = vlm_model_specs.SMOLDOCLING_MLX
|
|
172
172
|
else:
|
|
173
173
|
vlm_options = vlm_model_specs.SMOLDOCLING_TRANSFORMERS
|
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -52,18 +52,18 @@ class MarkItDownExtractor(TextExtractor):
|
|
|
52
52
|
:type config: dict[str, Any]
|
|
53
53
|
:return: Parsed config.
|
|
54
54
|
:rtype: MarkItDownExtractorConfig
|
|
55
|
-
:raises
|
|
55
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
56
56
|
"""
|
|
57
57
|
try:
|
|
58
58
|
import markitdown
|
|
59
59
|
from markitdown import MarkItDown # noqa: F401
|
|
60
60
|
except ImportError as import_error:
|
|
61
|
-
raise
|
|
61
|
+
raise ExtractionSnapshotFatalError(
|
|
62
62
|
"MarkItDown extractor requires an optional dependency. "
|
|
63
63
|
'Install it with pip install "biblicus[markitdown]".'
|
|
64
64
|
) from import_error
|
|
65
65
|
if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
|
|
66
|
-
raise
|
|
66
|
+
raise ExtractionSnapshotFatalError(
|
|
67
67
|
"MarkItDown requires Python 3.10 or higher. "
|
|
68
68
|
"Upgrade your interpreter or use a compatible extractor."
|
|
69
69
|
)
|
|
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
|
-
from ..errors import
|
|
14
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
15
15
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
16
|
from ..user_config import resolve_openai_api_key
|
|
17
17
|
from .base import TextExtractor
|
|
@@ -74,19 +74,19 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
74
74
|
:type config: dict[str, Any]
|
|
75
75
|
:return: Parsed configuration model.
|
|
76
76
|
:rtype: OpenAiSpeechToTextExtractorConfig
|
|
77
|
-
:raises
|
|
77
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
|
|
78
78
|
"""
|
|
79
79
|
try:
|
|
80
80
|
from openai import OpenAI # noqa: F401
|
|
81
81
|
except ImportError as import_error:
|
|
82
|
-
raise
|
|
82
|
+
raise ExtractionSnapshotFatalError(
|
|
83
83
|
"OpenAI speech to text extractor requires an optional dependency. "
|
|
84
84
|
'Install it with pip install "biblicus[openai]".'
|
|
85
85
|
) from import_error
|
|
86
86
|
|
|
87
87
|
api_key = resolve_openai_api_key()
|
|
88
88
|
if api_key is None:
|
|
89
|
-
raise
|
|
89
|
+
raise ExtractionSnapshotFatalError(
|
|
90
90
|
"OpenAI speech to text extractor requires an OpenAI API key. "
|
|
91
91
|
"Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
92
92
|
"openai.api_key."
|
|
@@ -115,7 +115,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
115
115
|
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
116
116
|
:return: Extracted text payload, or None when the item is not audio.
|
|
117
117
|
:rtype: ExtractedText or None
|
|
118
|
-
:raises
|
|
118
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
|
|
119
119
|
"""
|
|
120
120
|
_ = previous_extractions
|
|
121
121
|
if not item.media_type.startswith("audio/"):
|
|
@@ -129,7 +129,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
129
129
|
|
|
130
130
|
api_key = resolve_openai_api_key()
|
|
131
131
|
if api_key is None:
|
|
132
|
-
raise
|
|
132
|
+
raise ExtractionSnapshotFatalError(
|
|
133
133
|
"OpenAI speech to text extractor requires an OpenAI API key. "
|
|
134
134
|
"Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
135
135
|
"openai.api_key."
|
|
@@ -138,7 +138,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
138
138
|
try:
|
|
139
139
|
from openai import OpenAI
|
|
140
140
|
except ImportError as import_error:
|
|
141
|
-
raise
|
|
141
|
+
raise ExtractionSnapshotFatalError(
|
|
142
142
|
"OpenAI speech to text extractor requires an optional dependency. "
|
|
143
143
|
'Install it with pip install "biblicus[openai]".'
|
|
144
144
|
) from import_error
|
|
@@ -16,7 +16,7 @@ from typing import Any, ClassVar, Dict, List, Optional, Tuple
|
|
|
16
16
|
from pydantic import BaseModel, ConfigDict, Field
|
|
17
17
|
|
|
18
18
|
from ..corpus import Corpus
|
|
19
|
-
from ..errors import
|
|
19
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
20
20
|
from ..inference import ApiProvider, InferenceBackendConfig, InferenceBackendMode, resolve_api_key
|
|
21
21
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
22
22
|
from .base import TextExtractor
|
|
@@ -26,8 +26,8 @@ class PaddleOcrVlExtractorConfig(BaseModel):
|
|
|
26
26
|
"""
|
|
27
27
|
Configuration for the PaddleOCR-VL extractor.
|
|
28
28
|
|
|
29
|
-
:ivar
|
|
30
|
-
:vartype
|
|
29
|
+
:ivar retriever: Inference retriever configuration for local or application programming interface execution.
|
|
30
|
+
:vartype retriever: InferenceBackendConfig
|
|
31
31
|
:ivar min_confidence: Minimum confidence threshold for including text.
|
|
32
32
|
:vartype min_confidence: float
|
|
33
33
|
:ivar joiner: String used to join recognized text lines.
|
|
@@ -38,9 +38,11 @@ class PaddleOcrVlExtractorConfig(BaseModel):
|
|
|
38
38
|
:vartype lang: str
|
|
39
39
|
"""
|
|
40
40
|
|
|
41
|
-
model_config = ConfigDict(extra="forbid")
|
|
41
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
retriever: InferenceBackendConfig = Field(
|
|
44
|
+
default_factory=InferenceBackendConfig, alias="backend"
|
|
45
|
+
)
|
|
44
46
|
min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
|
|
45
47
|
joiner: str = Field(default="\n")
|
|
46
48
|
use_angle_cls: bool = Field(default=True)
|
|
@@ -70,7 +72,7 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
70
72
|
:type config: dict[str, Any]
|
|
71
73
|
:return: Parsed configuration model.
|
|
72
74
|
:rtype: PaddleOcrVlExtractorConfig
|
|
73
|
-
:raises
|
|
75
|
+
:raises ExtractionSnapshotFatalError: If required dependencies are missing.
|
|
74
76
|
"""
|
|
75
77
|
import json
|
|
76
78
|
|
|
@@ -86,26 +88,26 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
86
88
|
|
|
87
89
|
parsed = PaddleOcrVlExtractorConfig.model_validate(parsed_config)
|
|
88
90
|
|
|
89
|
-
if parsed.
|
|
91
|
+
if parsed.retriever.mode == InferenceBackendMode.LOCAL:
|
|
90
92
|
try:
|
|
91
93
|
from paddleocr import PaddleOCR # noqa: F401
|
|
92
94
|
except ImportError as import_error:
|
|
93
|
-
raise
|
|
95
|
+
raise ExtractionSnapshotFatalError(
|
|
94
96
|
"PaddleOCR-VL extractor (local mode) requires paddleocr. "
|
|
95
97
|
'Install it with pip install "biblicus[paddleocr]".'
|
|
96
98
|
) from import_error
|
|
97
99
|
else:
|
|
98
100
|
# api_provider is guaranteed to be set by InferenceBackendConfig validator
|
|
99
101
|
api_key = resolve_api_key(
|
|
100
|
-
parsed.
|
|
101
|
-
config_override=parsed.
|
|
102
|
+
parsed.retriever.api_provider,
|
|
103
|
+
config_override=parsed.retriever.api_key,
|
|
102
104
|
)
|
|
103
105
|
if api_key is None:
|
|
104
|
-
provider_name = parsed.
|
|
105
|
-
raise
|
|
106
|
+
provider_name = parsed.retriever.api_provider.value.upper()
|
|
107
|
+
raise ExtractionSnapshotFatalError(
|
|
106
108
|
f"PaddleOCR-VL extractor (API mode) requires an API key for {provider_name}. "
|
|
107
109
|
f"Set {provider_name}_API_KEY environment variable or configure "
|
|
108
|
-
f"{parsed.
|
|
110
|
+
f"{parsed.retriever.api_provider.value} in user config."
|
|
109
111
|
)
|
|
110
112
|
|
|
111
113
|
return parsed
|
|
@@ -145,12 +147,12 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
145
147
|
|
|
146
148
|
source_path = corpus.root / item.relpath
|
|
147
149
|
|
|
148
|
-
if parsed_config.
|
|
150
|
+
if parsed_config.retriever.mode == InferenceBackendMode.LOCAL:
|
|
149
151
|
text, confidence = self._extract_local(source_path, parsed_config)
|
|
150
152
|
else:
|
|
151
153
|
api_key = resolve_api_key(
|
|
152
|
-
parsed_config.
|
|
153
|
-
config_override=parsed_config.
|
|
154
|
+
parsed_config.retriever.api_provider,
|
|
155
|
+
config_override=parsed_config.retriever.api_key,
|
|
154
156
|
)
|
|
155
157
|
text, confidence = self._extract_via_api(source_path, parsed_config, api_key)
|
|
156
158
|
|
|
@@ -228,7 +230,7 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
228
230
|
:return: Tuple of extracted text and confidence score.
|
|
229
231
|
:rtype: tuple[str, float or None]
|
|
230
232
|
"""
|
|
231
|
-
if config.
|
|
233
|
+
if config.retriever.api_provider == ApiProvider.HUGGINGFACE:
|
|
232
234
|
return self._extract_via_huggingface_api(source_path, config, api_key)
|
|
233
235
|
else:
|
|
234
236
|
return "", None
|
|
@@ -257,7 +259,7 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
257
259
|
|
|
258
260
|
headers = {"Authorization": f"Bearer {api_key}"}
|
|
259
261
|
|
|
260
|
-
model_id = config.
|
|
262
|
+
model_id = config.retriever.model_id or "PaddlePaddle/PaddleOCR-VL"
|
|
261
263
|
api_url = f"https://api-inference.huggingface.co/models/{model_id}"
|
|
262
264
|
response = requests.post(
|
|
263
265
|
api_url,
|
biblicus/extractors/pipeline.py
CHANGED
|
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional
|
|
|
9
9
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
10
|
|
|
11
11
|
from ..corpus import Corpus
|
|
12
|
-
from ..errors import
|
|
12
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
13
13
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
14
|
from .base import TextExtractor
|
|
15
15
|
|
|
@@ -20,14 +20,14 @@ class PipelineStepSpec(BaseModel):
|
|
|
20
20
|
|
|
21
21
|
:ivar extractor_id: Extractor plugin identifier.
|
|
22
22
|
:vartype extractor_id: str
|
|
23
|
-
:ivar
|
|
24
|
-
:vartype
|
|
23
|
+
:ivar configuration: Extractor configuration mapping.
|
|
24
|
+
:vartype configuration: dict[str, Any]
|
|
25
25
|
"""
|
|
26
26
|
|
|
27
|
-
model_config = ConfigDict(extra="forbid")
|
|
27
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
28
28
|
|
|
29
29
|
extractor_id: str = Field(min_length=1)
|
|
30
|
-
|
|
30
|
+
configuration: Dict[str, Any] = Field(default_factory=dict, alias="config")
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
class PipelineExtractorConfig(BaseModel):
|
|
@@ -92,7 +92,7 @@ class PipelineExtractor(TextExtractor):
|
|
|
92
92
|
:type config: PipelineExtractorConfig
|
|
93
93
|
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
94
94
|
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
95
|
-
:raises
|
|
95
|
+
:raises ExtractionSnapshotFatalError: Always, because the pipeline is executed by the runner.
|
|
96
96
|
:return: None.
|
|
97
97
|
:rtype: None
|
|
98
98
|
"""
|
|
@@ -100,6 +100,6 @@ class PipelineExtractor(TextExtractor):
|
|
|
100
100
|
_ = item
|
|
101
101
|
_ = config
|
|
102
102
|
_ = previous_extractions
|
|
103
|
-
raise
|
|
104
|
-
"Pipeline extractor must be executed by the extraction
|
|
103
|
+
raise ExtractionSnapshotFatalError(
|
|
104
|
+
"Pipeline extractor must be executed by the extraction snapshotner."
|
|
105
105
|
)
|
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -54,12 +54,12 @@ class RapidOcrExtractor(TextExtractor):
|
|
|
54
54
|
:type config: dict[str, Any]
|
|
55
55
|
:return: Parsed configuration model.
|
|
56
56
|
:rtype: RapidOcrExtractorConfig
|
|
57
|
-
:raises
|
|
57
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is missing.
|
|
58
58
|
"""
|
|
59
59
|
try:
|
|
60
60
|
from rapidocr_onnxruntime import RapidOCR # noqa: F401
|
|
61
61
|
except ImportError as import_error:
|
|
62
|
-
raise
|
|
62
|
+
raise ExtractionSnapshotFatalError(
|
|
63
63
|
"RapidOCR extractor requires an optional dependency. "
|
|
64
64
|
'Install it with pip install "biblicus[ocr]".'
|
|
65
65
|
) from import_error
|
|
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
|
-
from ..errors import
|
|
14
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
15
15
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
16
|
from .base import TextExtractor
|
|
17
17
|
|
|
@@ -48,12 +48,12 @@ class UnstructuredExtractor(TextExtractor):
|
|
|
48
48
|
:type config: dict[str, Any]
|
|
49
49
|
:return: Parsed config.
|
|
50
50
|
:rtype: UnstructuredExtractorConfig
|
|
51
|
-
:raises
|
|
51
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
52
52
|
"""
|
|
53
53
|
try:
|
|
54
54
|
from unstructured.partition.auto import partition # noqa: F401
|
|
55
55
|
except ImportError as import_error:
|
|
56
|
-
raise
|
|
56
|
+
raise ExtractionSnapshotFatalError(
|
|
57
57
|
"Unstructured extractor requires an optional dependency. "
|
|
58
58
|
'Install it with pip install "biblicus[unstructured]".'
|
|
59
59
|
) from import_error
|
biblicus/hooks.py
CHANGED
|
@@ -18,8 +18,8 @@ class HookPoint(str, Enum):
|
|
|
18
18
|
:cvar after_ingest: Called after an item is ingested and indexed.
|
|
19
19
|
:cvar before_reindex: Called before a catalog rebuild starts.
|
|
20
20
|
:cvar after_reindex: Called after a catalog rebuild completes.
|
|
21
|
-
:cvar
|
|
22
|
-
:cvar
|
|
21
|
+
:cvar before_build_snapshot: Called before a retriever snapshot build starts.
|
|
22
|
+
:cvar after_build_snapshot: Called after a retriever snapshot build completes.
|
|
23
23
|
:cvar before_query: Called before a query is executed.
|
|
24
24
|
:cvar after_query: Called after a query completes.
|
|
25
25
|
:cvar before_evaluate_run: Called before an evaluation starts.
|
|
@@ -30,8 +30,8 @@ class HookPoint(str, Enum):
|
|
|
30
30
|
after_ingest = "after_ingest"
|
|
31
31
|
before_reindex = "before_reindex"
|
|
32
32
|
after_reindex = "after_reindex"
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
before_build_snapshot = "before_build_snapshot"
|
|
34
|
+
after_build_snapshot = "after_build_snapshot"
|
|
35
35
|
before_query = "before_query"
|
|
36
36
|
after_query = "after_query"
|
|
37
37
|
before_evaluate_run = "before_evaluate_run"
|
biblicus/knowledge_base.py
CHANGED
|
@@ -11,7 +11,6 @@ from typing import List, Optional, Sequence
|
|
|
11
11
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
|
-
from .backends import get_backend
|
|
15
14
|
from .context import (
|
|
16
15
|
ContextPack,
|
|
17
16
|
ContextPackPolicy,
|
|
@@ -20,17 +19,18 @@ from .context import (
|
|
|
20
19
|
fit_context_pack_to_token_budget,
|
|
21
20
|
)
|
|
22
21
|
from .corpus import Corpus
|
|
23
|
-
from .models import QueryBudget, RetrievalResult,
|
|
22
|
+
from .models import QueryBudget, RetrievalResult, RetrievalSnapshot
|
|
23
|
+
from .retrievers import get_retriever
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class KnowledgeBaseDefaults(BaseModel):
|
|
27
27
|
"""
|
|
28
28
|
Default configuration for a knowledge base workflow.
|
|
29
29
|
|
|
30
|
-
:ivar
|
|
31
|
-
:vartype
|
|
32
|
-
:ivar
|
|
33
|
-
:vartype
|
|
30
|
+
:ivar retriever_id: Retriever identifier to use for retrieval.
|
|
31
|
+
:vartype retriever_id: str
|
|
32
|
+
:ivar configuration_name: Human-readable retrieval configuration name.
|
|
33
|
+
:vartype configuration_name: str
|
|
34
34
|
:ivar query_budget: Default query budget to apply to retrieval.
|
|
35
35
|
:vartype query_budget: QueryBudget
|
|
36
36
|
:ivar tags: Tags to apply when importing the folder.
|
|
@@ -39,8 +39,8 @@ class KnowledgeBaseDefaults(BaseModel):
|
|
|
39
39
|
|
|
40
40
|
model_config = ConfigDict(extra="forbid")
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
|
|
42
|
+
retriever_id: str = Field(default="scan", min_length=1)
|
|
43
|
+
configuration_name: str = Field(default="Knowledge base", min_length=1)
|
|
44
44
|
query_budget: QueryBudget = Field(
|
|
45
45
|
default_factory=lambda: QueryBudget(
|
|
46
46
|
max_total_items=5,
|
|
@@ -58,17 +58,17 @@ class KnowledgeBase:
|
|
|
58
58
|
|
|
59
59
|
:ivar corpus: Corpus instance that stores the ingested items.
|
|
60
60
|
:vartype corpus: Corpus
|
|
61
|
-
:ivar
|
|
62
|
-
:vartype
|
|
63
|
-
:ivar
|
|
64
|
-
:vartype
|
|
61
|
+
:ivar retriever_id: Retriever identifier used for retrieval.
|
|
62
|
+
:vartype retriever_id: str
|
|
63
|
+
:ivar snapshot: Retrieval snapshot manifest associated with the knowledge base.
|
|
64
|
+
:vartype snapshot: RetrievalSnapshot
|
|
65
65
|
:ivar defaults: Default configuration used for this knowledge base.
|
|
66
66
|
:vartype defaults: KnowledgeBaseDefaults
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
69
|
corpus: Corpus
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
retriever_id: str
|
|
71
|
+
snapshot: RetrievalSnapshot
|
|
72
72
|
defaults: KnowledgeBaseDefaults
|
|
73
73
|
_temp_dir: Optional[TemporaryDirectory]
|
|
74
74
|
|
|
@@ -77,8 +77,8 @@ class KnowledgeBase:
|
|
|
77
77
|
cls,
|
|
78
78
|
folder: str | Path,
|
|
79
79
|
*,
|
|
80
|
-
|
|
81
|
-
|
|
80
|
+
retriever_id: Optional[str] = None,
|
|
81
|
+
configuration_name: Optional[str] = None,
|
|
82
82
|
query_budget: Optional[QueryBudget] = None,
|
|
83
83
|
tags: Optional[Sequence[str]] = None,
|
|
84
84
|
corpus_root: Optional[str | Path] = None,
|
|
@@ -88,10 +88,10 @@ class KnowledgeBase:
|
|
|
88
88
|
|
|
89
89
|
:param folder: Folder containing source files.
|
|
90
90
|
:type folder: str or Path
|
|
91
|
-
:param
|
|
92
|
-
:type
|
|
93
|
-
:param
|
|
94
|
-
:type
|
|
91
|
+
:param retriever_id: Optional retriever identifier override.
|
|
92
|
+
:type retriever_id: str or None
|
|
93
|
+
:param configuration_name: Optional configuration name override.
|
|
94
|
+
:type configuration_name: str or None
|
|
95
95
|
:param query_budget: Optional query budget override.
|
|
96
96
|
:type query_budget: QueryBudget or None
|
|
97
97
|
:param tags: Optional tags to apply during import.
|
|
@@ -110,8 +110,8 @@ class KnowledgeBase:
|
|
|
110
110
|
raise NotADirectoryError(f"Knowledge base folder is not a directory: {source_root}")
|
|
111
111
|
|
|
112
112
|
defaults = KnowledgeBaseDefaults()
|
|
113
|
-
|
|
114
|
-
|
|
113
|
+
resolved_retriever_id = retriever_id or defaults.retriever_id
|
|
114
|
+
resolved_configuration_name = configuration_name or defaults.configuration_name
|
|
115
115
|
resolved_query_budget = query_budget or defaults.query_budget
|
|
116
116
|
resolved_tags = list(tags) if tags is not None else defaults.tags
|
|
117
117
|
|
|
@@ -125,16 +125,18 @@ class KnowledgeBase:
|
|
|
125
125
|
corpus = Corpus.init(corpus_root_path)
|
|
126
126
|
corpus.import_tree(source_root, tags=resolved_tags)
|
|
127
127
|
|
|
128
|
-
|
|
129
|
-
|
|
128
|
+
retriever = get_retriever(resolved_retriever_id)
|
|
129
|
+
snapshot = retriever.build_snapshot(
|
|
130
|
+
corpus, configuration_name=resolved_configuration_name, configuration={}
|
|
131
|
+
)
|
|
130
132
|
|
|
131
133
|
return cls(
|
|
132
134
|
corpus=corpus,
|
|
133
|
-
|
|
134
|
-
|
|
135
|
+
retriever_id=resolved_retriever_id,
|
|
136
|
+
snapshot=snapshot,
|
|
135
137
|
defaults=KnowledgeBaseDefaults(
|
|
136
|
-
|
|
137
|
-
|
|
138
|
+
retriever_id=resolved_retriever_id,
|
|
139
|
+
configuration_name=resolved_configuration_name,
|
|
138
140
|
query_budget=resolved_query_budget,
|
|
139
141
|
tags=resolved_tags,
|
|
140
142
|
),
|
|
@@ -152,11 +154,11 @@ class KnowledgeBase:
|
|
|
152
154
|
:return: Retrieval result containing evidence.
|
|
153
155
|
:rtype: RetrievalResult
|
|
154
156
|
"""
|
|
155
|
-
|
|
157
|
+
retriever = get_retriever(self.retriever_id)
|
|
156
158
|
resolved_budget = budget or self.defaults.query_budget
|
|
157
|
-
return
|
|
159
|
+
return retriever.query(
|
|
158
160
|
self.corpus,
|
|
159
|
-
|
|
161
|
+
snapshot=self.snapshot,
|
|
160
162
|
query_text=query_text,
|
|
161
163
|
budget=resolved_budget,
|
|
162
164
|
)
|