biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +25 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +248 -191
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context.py +27 -12
- biblicus/context_engine/__init__.py +53 -0
- biblicus/context_engine/assembler.py +1090 -0
- biblicus/context_engine/compaction.py +110 -0
- biblicus/context_engine/models.py +423 -0
- biblicus/context_engine/retrieval.py +133 -0
- biblicus/corpus.py +233 -124
- biblicus/errors.py +27 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +34 -32
- biblicus/models.py +84 -81
- biblicus/retrieval.py +49 -42
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
- biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +84 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +103 -100
- biblicus/sources.py +46 -11
- biblicus/text/link.py +6 -0
- biblicus/text/prompts.py +18 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -291
- biblicus-0.16.0.dist-info/RECORD +0 -86
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
|
-
from ..errors import
|
|
14
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
15
15
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
16
|
from ..user_config import resolve_deepgram_api_key
|
|
17
17
|
from .base import TextExtractor
|
|
@@ -66,19 +66,19 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
66
66
|
:type config: dict[str, Any]
|
|
67
67
|
:return: Parsed configuration model.
|
|
68
68
|
:rtype: DeepgramSpeechToTextExtractorConfig
|
|
69
|
-
:raises
|
|
69
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
|
|
70
70
|
"""
|
|
71
71
|
try:
|
|
72
72
|
from deepgram import DeepgramClient # noqa: F401
|
|
73
73
|
except ImportError as import_error:
|
|
74
|
-
raise
|
|
74
|
+
raise ExtractionSnapshotFatalError(
|
|
75
75
|
"Deepgram speech to text extractor requires an optional dependency. "
|
|
76
76
|
'Install it with pip install "biblicus[deepgram]".'
|
|
77
77
|
) from import_error
|
|
78
78
|
|
|
79
79
|
api_key = resolve_deepgram_api_key()
|
|
80
80
|
if api_key is None:
|
|
81
|
-
raise
|
|
81
|
+
raise ExtractionSnapshotFatalError(
|
|
82
82
|
"Deepgram speech to text extractor requires a Deepgram API key. "
|
|
83
83
|
"Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
84
84
|
"deepgram.api_key."
|
|
@@ -107,7 +107,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
107
107
|
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
108
108
|
:return: Extracted text payload, or None when the item is not audio.
|
|
109
109
|
:rtype: ExtractedText or None
|
|
110
|
-
:raises
|
|
110
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
|
|
111
111
|
"""
|
|
112
112
|
_ = previous_extractions
|
|
113
113
|
if not item.media_type.startswith("audio/"):
|
|
@@ -121,7 +121,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
121
121
|
|
|
122
122
|
api_key = resolve_deepgram_api_key()
|
|
123
123
|
if api_key is None:
|
|
124
|
-
raise
|
|
124
|
+
raise ExtractionSnapshotFatalError(
|
|
125
125
|
"Deepgram speech to text extractor requires a Deepgram API key. "
|
|
126
126
|
"Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
127
127
|
"deepgram.api_key."
|
|
@@ -130,7 +130,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
|
|
|
130
130
|
try:
|
|
131
131
|
from deepgram import DeepgramClient
|
|
132
132
|
except ImportError as import_error:
|
|
133
|
-
raise
|
|
133
|
+
raise ExtractionSnapshotFatalError(
|
|
134
134
|
"Deepgram speech to text extractor requires an optional dependency. "
|
|
135
135
|
'Install it with pip install "biblicus[deepgram]".'
|
|
136
136
|
) from import_error
|
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -40,14 +40,14 @@ class DoclingGraniteExtractorConfig(BaseModel):
|
|
|
40
40
|
|
|
41
41
|
:ivar output_format: Output format for extracted content (markdown, text, or html).
|
|
42
42
|
:vartype output_format: str
|
|
43
|
-
:ivar
|
|
44
|
-
:vartype
|
|
43
|
+
:ivar retriever: Inference retriever (mlx or transformers).
|
|
44
|
+
:vartype retriever: str
|
|
45
45
|
"""
|
|
46
46
|
|
|
47
|
-
model_config = ConfigDict(extra="forbid")
|
|
47
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
48
48
|
|
|
49
49
|
output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
|
|
50
|
-
|
|
50
|
+
retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
class DoclingGraniteExtractor(TextExtractor):
|
|
@@ -71,7 +71,7 @@ class DoclingGraniteExtractor(TextExtractor):
|
|
|
71
71
|
:type config: dict[str, Any]
|
|
72
72
|
:return: Parsed config.
|
|
73
73
|
:rtype: DoclingGraniteExtractorConfig
|
|
74
|
-
:raises
|
|
74
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
75
75
|
"""
|
|
76
76
|
parsed = DoclingGraniteExtractorConfig.model_validate(config)
|
|
77
77
|
|
|
@@ -82,19 +82,19 @@ class DoclingGraniteExtractor(TextExtractor):
|
|
|
82
82
|
vlm_model_specs,
|
|
83
83
|
)
|
|
84
84
|
except ImportError as import_error:
|
|
85
|
-
raise
|
|
85
|
+
raise ExtractionSnapshotFatalError(
|
|
86
86
|
"DoclingGranite extractor requires an optional dependency. "
|
|
87
87
|
'Install it with pip install "biblicus[docling]".'
|
|
88
88
|
) from import_error
|
|
89
89
|
|
|
90
|
-
if parsed.
|
|
90
|
+
if parsed.retriever == "mlx":
|
|
91
91
|
try:
|
|
92
92
|
from docling.pipeline_options import vlm_model_specs
|
|
93
93
|
|
|
94
94
|
_ = vlm_model_specs.GRANITE_DOCLING_MLX
|
|
95
95
|
except (ImportError, AttributeError) as exc:
|
|
96
|
-
raise
|
|
97
|
-
"DoclingGranite extractor with MLX
|
|
96
|
+
raise ExtractionSnapshotFatalError(
|
|
97
|
+
"DoclingGranite extractor with MLX retriever requires MLX support. "
|
|
98
98
|
'Install it with pip install "biblicus[docling-mlx]".'
|
|
99
99
|
) from exc
|
|
100
100
|
|
|
@@ -167,7 +167,7 @@ class DoclingGraniteExtractor(TextExtractor):
|
|
|
167
167
|
from docling.format_options import InputFormat, PdfFormatOption
|
|
168
168
|
from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
|
|
169
169
|
|
|
170
|
-
if config.
|
|
170
|
+
if config.retriever == "mlx":
|
|
171
171
|
vlm_options = vlm_model_specs.GRANITE_DOCLING_MLX
|
|
172
172
|
else:
|
|
173
173
|
vlm_options = vlm_model_specs.GRANITE_DOCLING_TRANSFORMERS
|
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -40,14 +40,14 @@ class DoclingSmolExtractorConfig(BaseModel):
|
|
|
40
40
|
|
|
41
41
|
:ivar output_format: Output format for extracted content (markdown, text, or html).
|
|
42
42
|
:vartype output_format: str
|
|
43
|
-
:ivar
|
|
44
|
-
:vartype
|
|
43
|
+
:ivar retriever: Inference retriever (mlx or transformers).
|
|
44
|
+
:vartype retriever: str
|
|
45
45
|
"""
|
|
46
46
|
|
|
47
|
-
model_config = ConfigDict(extra="forbid")
|
|
47
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
48
48
|
|
|
49
49
|
output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
|
|
50
|
-
|
|
50
|
+
retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
class DoclingSmolExtractor(TextExtractor):
|
|
@@ -71,7 +71,7 @@ class DoclingSmolExtractor(TextExtractor):
|
|
|
71
71
|
:type config: dict[str, Any]
|
|
72
72
|
:return: Parsed config.
|
|
73
73
|
:rtype: DoclingSmolExtractorConfig
|
|
74
|
-
:raises
|
|
74
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
75
75
|
"""
|
|
76
76
|
parsed = DoclingSmolExtractorConfig.model_validate(config)
|
|
77
77
|
|
|
@@ -82,19 +82,19 @@ class DoclingSmolExtractor(TextExtractor):
|
|
|
82
82
|
vlm_model_specs,
|
|
83
83
|
)
|
|
84
84
|
except ImportError as import_error:
|
|
85
|
-
raise
|
|
85
|
+
raise ExtractionSnapshotFatalError(
|
|
86
86
|
"DoclingSmol extractor requires an optional dependency. "
|
|
87
87
|
'Install it with pip install "biblicus[docling]".'
|
|
88
88
|
) from import_error
|
|
89
89
|
|
|
90
|
-
if parsed.
|
|
90
|
+
if parsed.retriever == "mlx":
|
|
91
91
|
try:
|
|
92
92
|
from docling.pipeline_options import vlm_model_specs
|
|
93
93
|
|
|
94
94
|
_ = vlm_model_specs.SMOLDOCLING_MLX
|
|
95
95
|
except (ImportError, AttributeError) as exc:
|
|
96
|
-
raise
|
|
97
|
-
"DoclingSmol extractor with MLX
|
|
96
|
+
raise ExtractionSnapshotFatalError(
|
|
97
|
+
"DoclingSmol extractor with MLX retriever requires MLX support. "
|
|
98
98
|
'Install it with pip install "biblicus[docling-mlx]".'
|
|
99
99
|
) from exc
|
|
100
100
|
|
|
@@ -167,7 +167,7 @@ class DoclingSmolExtractor(TextExtractor):
|
|
|
167
167
|
from docling.format_options import InputFormat, PdfFormatOption
|
|
168
168
|
from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
|
|
169
169
|
|
|
170
|
-
if config.
|
|
170
|
+
if config.retriever == "mlx":
|
|
171
171
|
vlm_options = vlm_model_specs.SMOLDOCLING_MLX
|
|
172
172
|
else:
|
|
173
173
|
vlm_options = vlm_model_specs.SMOLDOCLING_TRANSFORMERS
|
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -52,18 +52,18 @@ class MarkItDownExtractor(TextExtractor):
|
|
|
52
52
|
:type config: dict[str, Any]
|
|
53
53
|
:return: Parsed config.
|
|
54
54
|
:rtype: MarkItDownExtractorConfig
|
|
55
|
-
:raises
|
|
55
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
56
56
|
"""
|
|
57
57
|
try:
|
|
58
58
|
import markitdown
|
|
59
59
|
from markitdown import MarkItDown # noqa: F401
|
|
60
60
|
except ImportError as import_error:
|
|
61
|
-
raise
|
|
61
|
+
raise ExtractionSnapshotFatalError(
|
|
62
62
|
"MarkItDown extractor requires an optional dependency. "
|
|
63
63
|
'Install it with pip install "biblicus[markitdown]".'
|
|
64
64
|
) from import_error
|
|
65
65
|
if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
|
|
66
|
-
raise
|
|
66
|
+
raise ExtractionSnapshotFatalError(
|
|
67
67
|
"MarkItDown requires Python 3.10 or higher. "
|
|
68
68
|
"Upgrade your interpreter or use a compatible extractor."
|
|
69
69
|
)
|
|
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
|
-
from ..errors import
|
|
14
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
15
15
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
16
|
from ..user_config import resolve_openai_api_key
|
|
17
17
|
from .base import TextExtractor
|
|
@@ -74,19 +74,19 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
74
74
|
:type config: dict[str, Any]
|
|
75
75
|
:return: Parsed configuration model.
|
|
76
76
|
:rtype: OpenAiSpeechToTextExtractorConfig
|
|
77
|
-
:raises
|
|
77
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
|
|
78
78
|
"""
|
|
79
79
|
try:
|
|
80
80
|
from openai import OpenAI # noqa: F401
|
|
81
81
|
except ImportError as import_error:
|
|
82
|
-
raise
|
|
82
|
+
raise ExtractionSnapshotFatalError(
|
|
83
83
|
"OpenAI speech to text extractor requires an optional dependency. "
|
|
84
84
|
'Install it with pip install "biblicus[openai]".'
|
|
85
85
|
) from import_error
|
|
86
86
|
|
|
87
87
|
api_key = resolve_openai_api_key()
|
|
88
88
|
if api_key is None:
|
|
89
|
-
raise
|
|
89
|
+
raise ExtractionSnapshotFatalError(
|
|
90
90
|
"OpenAI speech to text extractor requires an OpenAI API key. "
|
|
91
91
|
"Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
92
92
|
"openai.api_key."
|
|
@@ -115,7 +115,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
115
115
|
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
116
116
|
:return: Extracted text payload, or None when the item is not audio.
|
|
117
117
|
:rtype: ExtractedText or None
|
|
118
|
-
:raises
|
|
118
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
|
|
119
119
|
"""
|
|
120
120
|
_ = previous_extractions
|
|
121
121
|
if not item.media_type.startswith("audio/"):
|
|
@@ -129,7 +129,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
129
129
|
|
|
130
130
|
api_key = resolve_openai_api_key()
|
|
131
131
|
if api_key is None:
|
|
132
|
-
raise
|
|
132
|
+
raise ExtractionSnapshotFatalError(
|
|
133
133
|
"OpenAI speech to text extractor requires an OpenAI API key. "
|
|
134
134
|
"Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
|
|
135
135
|
"openai.api_key."
|
|
@@ -138,7 +138,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
|
|
|
138
138
|
try:
|
|
139
139
|
from openai import OpenAI
|
|
140
140
|
except ImportError as import_error:
|
|
141
|
-
raise
|
|
141
|
+
raise ExtractionSnapshotFatalError(
|
|
142
142
|
"OpenAI speech to text extractor requires an optional dependency. "
|
|
143
143
|
'Install it with pip install "biblicus[openai]".'
|
|
144
144
|
) from import_error
|
|
@@ -16,7 +16,7 @@ from typing import Any, ClassVar, Dict, List, Optional, Tuple
|
|
|
16
16
|
from pydantic import BaseModel, ConfigDict, Field
|
|
17
17
|
|
|
18
18
|
from ..corpus import Corpus
|
|
19
|
-
from ..errors import
|
|
19
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
20
20
|
from ..inference import ApiProvider, InferenceBackendConfig, InferenceBackendMode, resolve_api_key
|
|
21
21
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
22
22
|
from .base import TextExtractor
|
|
@@ -26,8 +26,8 @@ class PaddleOcrVlExtractorConfig(BaseModel):
|
|
|
26
26
|
"""
|
|
27
27
|
Configuration for the PaddleOCR-VL extractor.
|
|
28
28
|
|
|
29
|
-
:ivar
|
|
30
|
-
:vartype
|
|
29
|
+
:ivar retriever: Inference retriever configuration for local or application programming interface execution.
|
|
30
|
+
:vartype retriever: InferenceBackendConfig
|
|
31
31
|
:ivar min_confidence: Minimum confidence threshold for including text.
|
|
32
32
|
:vartype min_confidence: float
|
|
33
33
|
:ivar joiner: String used to join recognized text lines.
|
|
@@ -38,9 +38,11 @@ class PaddleOcrVlExtractorConfig(BaseModel):
|
|
|
38
38
|
:vartype lang: str
|
|
39
39
|
"""
|
|
40
40
|
|
|
41
|
-
model_config = ConfigDict(extra="forbid")
|
|
41
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
retriever: InferenceBackendConfig = Field(
|
|
44
|
+
default_factory=InferenceBackendConfig, alias="backend"
|
|
45
|
+
)
|
|
44
46
|
min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
|
|
45
47
|
joiner: str = Field(default="\n")
|
|
46
48
|
use_angle_cls: bool = Field(default=True)
|
|
@@ -70,7 +72,7 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
70
72
|
:type config: dict[str, Any]
|
|
71
73
|
:return: Parsed configuration model.
|
|
72
74
|
:rtype: PaddleOcrVlExtractorConfig
|
|
73
|
-
:raises
|
|
75
|
+
:raises ExtractionSnapshotFatalError: If required dependencies are missing.
|
|
74
76
|
"""
|
|
75
77
|
import json
|
|
76
78
|
|
|
@@ -86,26 +88,26 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
86
88
|
|
|
87
89
|
parsed = PaddleOcrVlExtractorConfig.model_validate(parsed_config)
|
|
88
90
|
|
|
89
|
-
if parsed.
|
|
91
|
+
if parsed.retriever.mode == InferenceBackendMode.LOCAL:
|
|
90
92
|
try:
|
|
91
93
|
from paddleocr import PaddleOCR # noqa: F401
|
|
92
94
|
except ImportError as import_error:
|
|
93
|
-
raise
|
|
95
|
+
raise ExtractionSnapshotFatalError(
|
|
94
96
|
"PaddleOCR-VL extractor (local mode) requires paddleocr. "
|
|
95
97
|
'Install it with pip install "biblicus[paddleocr]".'
|
|
96
98
|
) from import_error
|
|
97
99
|
else:
|
|
98
100
|
# api_provider is guaranteed to be set by InferenceBackendConfig validator
|
|
99
101
|
api_key = resolve_api_key(
|
|
100
|
-
parsed.
|
|
101
|
-
config_override=parsed.
|
|
102
|
+
parsed.retriever.api_provider,
|
|
103
|
+
config_override=parsed.retriever.api_key,
|
|
102
104
|
)
|
|
103
105
|
if api_key is None:
|
|
104
|
-
provider_name = parsed.
|
|
105
|
-
raise
|
|
106
|
+
provider_name = parsed.retriever.api_provider.value.upper()
|
|
107
|
+
raise ExtractionSnapshotFatalError(
|
|
106
108
|
f"PaddleOCR-VL extractor (API mode) requires an API key for {provider_name}. "
|
|
107
109
|
f"Set {provider_name}_API_KEY environment variable or configure "
|
|
108
|
-
f"{parsed.
|
|
110
|
+
f"{parsed.retriever.api_provider.value} in user config."
|
|
109
111
|
)
|
|
110
112
|
|
|
111
113
|
return parsed
|
|
@@ -145,12 +147,12 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
145
147
|
|
|
146
148
|
source_path = corpus.root / item.relpath
|
|
147
149
|
|
|
148
|
-
if parsed_config.
|
|
150
|
+
if parsed_config.retriever.mode == InferenceBackendMode.LOCAL:
|
|
149
151
|
text, confidence = self._extract_local(source_path, parsed_config)
|
|
150
152
|
else:
|
|
151
153
|
api_key = resolve_api_key(
|
|
152
|
-
parsed_config.
|
|
153
|
-
config_override=parsed_config.
|
|
154
|
+
parsed_config.retriever.api_provider,
|
|
155
|
+
config_override=parsed_config.retriever.api_key,
|
|
154
156
|
)
|
|
155
157
|
text, confidence = self._extract_via_api(source_path, parsed_config, api_key)
|
|
156
158
|
|
|
@@ -228,7 +230,7 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
228
230
|
:return: Tuple of extracted text and confidence score.
|
|
229
231
|
:rtype: tuple[str, float or None]
|
|
230
232
|
"""
|
|
231
|
-
if config.
|
|
233
|
+
if config.retriever.api_provider == ApiProvider.HUGGINGFACE:
|
|
232
234
|
return self._extract_via_huggingface_api(source_path, config, api_key)
|
|
233
235
|
else:
|
|
234
236
|
return "", None
|
|
@@ -257,7 +259,7 @@ class PaddleOcrVlExtractor(TextExtractor):
|
|
|
257
259
|
|
|
258
260
|
headers = {"Authorization": f"Bearer {api_key}"}
|
|
259
261
|
|
|
260
|
-
model_id = config.
|
|
262
|
+
model_id = config.retriever.model_id or "PaddlePaddle/PaddleOCR-VL"
|
|
261
263
|
api_url = f"https://api-inference.huggingface.co/models/{model_id}"
|
|
262
264
|
response = requests.post(
|
|
263
265
|
api_url,
|
biblicus/extractors/pipeline.py
CHANGED
|
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional
|
|
|
9
9
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
10
|
|
|
11
11
|
from ..corpus import Corpus
|
|
12
|
-
from ..errors import
|
|
12
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
13
13
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
14
14
|
from .base import TextExtractor
|
|
15
15
|
|
|
@@ -20,14 +20,14 @@ class PipelineStepSpec(BaseModel):
|
|
|
20
20
|
|
|
21
21
|
:ivar extractor_id: Extractor plugin identifier.
|
|
22
22
|
:vartype extractor_id: str
|
|
23
|
-
:ivar
|
|
24
|
-
:vartype
|
|
23
|
+
:ivar configuration: Extractor configuration mapping.
|
|
24
|
+
:vartype configuration: dict[str, Any]
|
|
25
25
|
"""
|
|
26
26
|
|
|
27
|
-
model_config = ConfigDict(extra="forbid")
|
|
27
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
28
28
|
|
|
29
29
|
extractor_id: str = Field(min_length=1)
|
|
30
|
-
|
|
30
|
+
configuration: Dict[str, Any] = Field(default_factory=dict, alias="config")
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
class PipelineExtractorConfig(BaseModel):
|
|
@@ -92,7 +92,7 @@ class PipelineExtractor(TextExtractor):
|
|
|
92
92
|
:type config: PipelineExtractorConfig
|
|
93
93
|
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
94
94
|
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
95
|
-
:raises
|
|
95
|
+
:raises ExtractionSnapshotFatalError: Always, because the pipeline is executed by the runner.
|
|
96
96
|
:return: None.
|
|
97
97
|
:rtype: None
|
|
98
98
|
"""
|
|
@@ -100,6 +100,6 @@ class PipelineExtractor(TextExtractor):
|
|
|
100
100
|
_ = item
|
|
101
101
|
_ = config
|
|
102
102
|
_ = previous_extractions
|
|
103
|
-
raise
|
|
104
|
-
"Pipeline extractor must be executed by the extraction
|
|
103
|
+
raise ExtractionSnapshotFatalError(
|
|
104
|
+
"Pipeline extractor must be executed by the extraction snapshotner."
|
|
105
105
|
)
|
|
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
|
|
|
12
12
|
from pydantic import BaseModel, ConfigDict, Field
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..errors import
|
|
15
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
16
16
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
17
17
|
from .base import TextExtractor
|
|
18
18
|
|
|
@@ -54,12 +54,12 @@ class RapidOcrExtractor(TextExtractor):
|
|
|
54
54
|
:type config: dict[str, Any]
|
|
55
55
|
:return: Parsed configuration model.
|
|
56
56
|
:rtype: RapidOcrExtractorConfig
|
|
57
|
-
:raises
|
|
57
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is missing.
|
|
58
58
|
"""
|
|
59
59
|
try:
|
|
60
60
|
from rapidocr_onnxruntime import RapidOCR # noqa: F401
|
|
61
61
|
except ImportError as import_error:
|
|
62
|
-
raise
|
|
62
|
+
raise ExtractionSnapshotFatalError(
|
|
63
63
|
"RapidOCR extractor requires an optional dependency. "
|
|
64
64
|
'Install it with pip install "biblicus[ocr]".'
|
|
65
65
|
) from import_error
|
|
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict
|
|
12
12
|
|
|
13
13
|
from ..corpus import Corpus
|
|
14
|
-
from ..errors import
|
|
14
|
+
from ..errors import ExtractionSnapshotFatalError
|
|
15
15
|
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
16
|
from .base import TextExtractor
|
|
17
17
|
|
|
@@ -48,12 +48,12 @@ class UnstructuredExtractor(TextExtractor):
|
|
|
48
48
|
:type config: dict[str, Any]
|
|
49
49
|
:return: Parsed config.
|
|
50
50
|
:rtype: UnstructuredExtractorConfig
|
|
51
|
-
:raises
|
|
51
|
+
:raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
|
|
52
52
|
"""
|
|
53
53
|
try:
|
|
54
54
|
from unstructured.partition.auto import partition # noqa: F401
|
|
55
55
|
except ImportError as import_error:
|
|
56
|
-
raise
|
|
56
|
+
raise ExtractionSnapshotFatalError(
|
|
57
57
|
"Unstructured extractor requires an optional dependency. "
|
|
58
58
|
'Install it with pip install "biblicus[unstructured]".'
|
|
59
59
|
) from import_error
|
biblicus/hooks.py
CHANGED
|
@@ -18,8 +18,8 @@ class HookPoint(str, Enum):
|
|
|
18
18
|
:cvar after_ingest: Called after an item is ingested and indexed.
|
|
19
19
|
:cvar before_reindex: Called before a catalog rebuild starts.
|
|
20
20
|
:cvar after_reindex: Called after a catalog rebuild completes.
|
|
21
|
-
:cvar
|
|
22
|
-
:cvar
|
|
21
|
+
:cvar before_build_snapshot: Called before a retriever snapshot build starts.
|
|
22
|
+
:cvar after_build_snapshot: Called after a retriever snapshot build completes.
|
|
23
23
|
:cvar before_query: Called before a query is executed.
|
|
24
24
|
:cvar after_query: Called after a query completes.
|
|
25
25
|
:cvar before_evaluate_run: Called before an evaluation starts.
|
|
@@ -30,8 +30,8 @@ class HookPoint(str, Enum):
|
|
|
30
30
|
after_ingest = "after_ingest"
|
|
31
31
|
before_reindex = "before_reindex"
|
|
32
32
|
after_reindex = "after_reindex"
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
before_build_snapshot = "before_build_snapshot"
|
|
34
|
+
after_build_snapshot = "after_build_snapshot"
|
|
35
35
|
before_query = "before_query"
|
|
36
36
|
after_query = "after_query"
|
|
37
37
|
before_evaluate_run = "before_evaluate_run"
|