biblicus 0.16.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. biblicus/__init__.py +25 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +248 -191
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context.py +27 -12
  12. biblicus/context_engine/__init__.py +53 -0
  13. biblicus/context_engine/assembler.py +1090 -0
  14. biblicus/context_engine/compaction.py +110 -0
  15. biblicus/context_engine/models.py +423 -0
  16. biblicus/context_engine/retrieval.py +133 -0
  17. biblicus/corpus.py +233 -124
  18. biblicus/errors.py +27 -3
  19. biblicus/evaluation.py +27 -25
  20. biblicus/extraction.py +103 -98
  21. biblicus/extraction_evaluation.py +26 -26
  22. biblicus/extractors/deepgram_stt.py +7 -7
  23. biblicus/extractors/docling_granite_text.py +11 -11
  24. biblicus/extractors/docling_smol_text.py +11 -11
  25. biblicus/extractors/markitdown_text.py +4 -4
  26. biblicus/extractors/openai_stt.py +7 -7
  27. biblicus/extractors/paddleocr_vl_text.py +20 -18
  28. biblicus/extractors/pipeline.py +8 -8
  29. biblicus/extractors/rapidocr_text.py +3 -3
  30. biblicus/extractors/unstructured_text.py +3 -3
  31. biblicus/hooks.py +4 -4
  32. biblicus/knowledge_base.py +34 -32
  33. biblicus/models.py +84 -81
  34. biblicus/retrieval.py +49 -42
  35. biblicus/retrievers/__init__.py +50 -0
  36. biblicus/retrievers/base.py +65 -0
  37. biblicus/{backends → retrievers}/embedding_index_common.py +80 -44
  38. biblicus/{backends → retrievers}/embedding_index_file.py +96 -61
  39. biblicus/{backends → retrievers}/embedding_index_inmemory.py +100 -69
  40. biblicus/retrievers/hybrid.py +301 -0
  41. biblicus/{backends → retrievers}/scan.py +84 -73
  42. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  43. biblicus/{backends → retrievers}/tf_vector.py +103 -100
  44. biblicus/sources.py +46 -11
  45. biblicus/text/link.py +6 -0
  46. biblicus/text/prompts.py +18 -8
  47. biblicus/text/tool_loop.py +63 -5
  48. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +32 -23
  49. biblicus-1.1.0.dist-info/RECORD +91 -0
  50. biblicus/backends/__init__.py +0 -50
  51. biblicus/backends/base.py +0 -65
  52. biblicus/backends/hybrid.py +0 -291
  53. biblicus-0.16.0.dist-info/RECORD +0 -86
  54. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
  55. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
  56. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
  57. {biblicus-0.16.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
  from pydantic import BaseModel, ConfigDict, Field
12
12
 
13
13
  from ..corpus import Corpus
14
- from ..errors import ExtractionRunFatalError
14
+ from ..errors import ExtractionSnapshotFatalError
15
15
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
16
  from ..user_config import resolve_deepgram_api_key
17
17
  from .base import TextExtractor
@@ -66,19 +66,19 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
66
66
  :type config: dict[str, Any]
67
67
  :return: Parsed configuration model.
68
68
  :rtype: DeepgramSpeechToTextExtractorConfig
69
- :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
69
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
70
70
  """
71
71
  try:
72
72
  from deepgram import DeepgramClient # noqa: F401
73
73
  except ImportError as import_error:
74
- raise ExtractionRunFatalError(
74
+ raise ExtractionSnapshotFatalError(
75
75
  "Deepgram speech to text extractor requires an optional dependency. "
76
76
  'Install it with pip install "biblicus[deepgram]".'
77
77
  ) from import_error
78
78
 
79
79
  api_key = resolve_deepgram_api_key()
80
80
  if api_key is None:
81
- raise ExtractionRunFatalError(
81
+ raise ExtractionSnapshotFatalError(
82
82
  "Deepgram speech to text extractor requires a Deepgram API key. "
83
83
  "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
84
84
  "deepgram.api_key."
@@ -107,7 +107,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
107
107
  :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
108
108
  :return: Extracted text payload, or None when the item is not audio.
109
109
  :rtype: ExtractedText or None
110
- :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
110
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
111
111
  """
112
112
  _ = previous_extractions
113
113
  if not item.media_type.startswith("audio/"):
@@ -121,7 +121,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
121
121
 
122
122
  api_key = resolve_deepgram_api_key()
123
123
  if api_key is None:
124
- raise ExtractionRunFatalError(
124
+ raise ExtractionSnapshotFatalError(
125
125
  "Deepgram speech to text extractor requires a Deepgram API key. "
126
126
  "Set DEEPGRAM_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
127
127
  "deepgram.api_key."
@@ -130,7 +130,7 @@ class DeepgramSpeechToTextExtractor(TextExtractor):
130
130
  try:
131
131
  from deepgram import DeepgramClient
132
132
  except ImportError as import_error:
133
- raise ExtractionRunFatalError(
133
+ raise ExtractionSnapshotFatalError(
134
134
  "Deepgram speech to text extractor requires an optional dependency. "
135
135
  'Install it with pip install "biblicus[deepgram]".'
136
136
  ) from import_error
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -40,14 +40,14 @@ class DoclingGraniteExtractorConfig(BaseModel):
40
40
 
41
41
  :ivar output_format: Output format for extracted content (markdown, text, or html).
42
42
  :vartype output_format: str
43
- :ivar backend: Inference backend (mlx or transformers).
44
- :vartype backend: str
43
+ :ivar retriever: Inference retriever (mlx or transformers).
44
+ :vartype retriever: str
45
45
  """
46
46
 
47
- model_config = ConfigDict(extra="forbid")
47
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
48
48
 
49
49
  output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
50
- backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
50
+ retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
51
51
 
52
52
 
53
53
  class DoclingGraniteExtractor(TextExtractor):
@@ -71,7 +71,7 @@ class DoclingGraniteExtractor(TextExtractor):
71
71
  :type config: dict[str, Any]
72
72
  :return: Parsed config.
73
73
  :rtype: DoclingGraniteExtractorConfig
74
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
74
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
75
75
  """
76
76
  parsed = DoclingGraniteExtractorConfig.model_validate(config)
77
77
 
@@ -82,19 +82,19 @@ class DoclingGraniteExtractor(TextExtractor):
82
82
  vlm_model_specs,
83
83
  )
84
84
  except ImportError as import_error:
85
- raise ExtractionRunFatalError(
85
+ raise ExtractionSnapshotFatalError(
86
86
  "DoclingGranite extractor requires an optional dependency. "
87
87
  'Install it with pip install "biblicus[docling]".'
88
88
  ) from import_error
89
89
 
90
- if parsed.backend == "mlx":
90
+ if parsed.retriever == "mlx":
91
91
  try:
92
92
  from docling.pipeline_options import vlm_model_specs
93
93
 
94
94
  _ = vlm_model_specs.GRANITE_DOCLING_MLX
95
95
  except (ImportError, AttributeError) as exc:
96
- raise ExtractionRunFatalError(
97
- "DoclingGranite extractor with MLX backend requires MLX support. "
96
+ raise ExtractionSnapshotFatalError(
97
+ "DoclingGranite extractor with MLX retriever requires MLX support. "
98
98
  'Install it with pip install "biblicus[docling-mlx]".'
99
99
  ) from exc
100
100
 
@@ -167,7 +167,7 @@ class DoclingGraniteExtractor(TextExtractor):
167
167
  from docling.format_options import InputFormat, PdfFormatOption
168
168
  from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
169
169
 
170
- if config.backend == "mlx":
170
+ if config.retriever == "mlx":
171
171
  vlm_options = vlm_model_specs.GRANITE_DOCLING_MLX
172
172
  else:
173
173
  vlm_options = vlm_model_specs.GRANITE_DOCLING_TRANSFORMERS
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -40,14 +40,14 @@ class DoclingSmolExtractorConfig(BaseModel):
40
40
 
41
41
  :ivar output_format: Output format for extracted content (markdown, text, or html).
42
42
  :vartype output_format: str
43
- :ivar backend: Inference backend (mlx or transformers).
44
- :vartype backend: str
43
+ :ivar retriever: Inference retriever (mlx or transformers).
44
+ :vartype retriever: str
45
45
  """
46
46
 
47
- model_config = ConfigDict(extra="forbid")
47
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
48
48
 
49
49
  output_format: str = Field(default="markdown", pattern="^(markdown|text|html)$")
50
- backend: str = Field(default="mlx", pattern="^(mlx|transformers)$")
50
+ retriever: str = Field(default="mlx", pattern="^(mlx|transformers)$", alias="backend")
51
51
 
52
52
 
53
53
  class DoclingSmolExtractor(TextExtractor):
@@ -71,7 +71,7 @@ class DoclingSmolExtractor(TextExtractor):
71
71
  :type config: dict[str, Any]
72
72
  :return: Parsed config.
73
73
  :rtype: DoclingSmolExtractorConfig
74
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
74
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
75
75
  """
76
76
  parsed = DoclingSmolExtractorConfig.model_validate(config)
77
77
 
@@ -82,19 +82,19 @@ class DoclingSmolExtractor(TextExtractor):
82
82
  vlm_model_specs,
83
83
  )
84
84
  except ImportError as import_error:
85
- raise ExtractionRunFatalError(
85
+ raise ExtractionSnapshotFatalError(
86
86
  "DoclingSmol extractor requires an optional dependency. "
87
87
  'Install it with pip install "biblicus[docling]".'
88
88
  ) from import_error
89
89
 
90
- if parsed.backend == "mlx":
90
+ if parsed.retriever == "mlx":
91
91
  try:
92
92
  from docling.pipeline_options import vlm_model_specs
93
93
 
94
94
  _ = vlm_model_specs.SMOLDOCLING_MLX
95
95
  except (ImportError, AttributeError) as exc:
96
- raise ExtractionRunFatalError(
97
- "DoclingSmol extractor with MLX backend requires MLX support. "
96
+ raise ExtractionSnapshotFatalError(
97
+ "DoclingSmol extractor with MLX retriever requires MLX support. "
98
98
  'Install it with pip install "biblicus[docling-mlx]".'
99
99
  ) from exc
100
100
 
@@ -167,7 +167,7 @@ class DoclingSmolExtractor(TextExtractor):
167
167
  from docling.format_options import InputFormat, PdfFormatOption
168
168
  from docling.pipeline_options import VlmPipelineOptions, vlm_model_specs
169
169
 
170
- if config.backend == "mlx":
170
+ if config.retriever == "mlx":
171
171
  vlm_options = vlm_model_specs.SMOLDOCLING_MLX
172
172
  else:
173
173
  vlm_options = vlm_model_specs.SMOLDOCLING_TRANSFORMERS
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -52,18 +52,18 @@ class MarkItDownExtractor(TextExtractor):
52
52
  :type config: dict[str, Any]
53
53
  :return: Parsed config.
54
54
  :rtype: MarkItDownExtractorConfig
55
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
55
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
56
56
  """
57
57
  try:
58
58
  import markitdown
59
59
  from markitdown import MarkItDown # noqa: F401
60
60
  except ImportError as import_error:
61
- raise ExtractionRunFatalError(
61
+ raise ExtractionSnapshotFatalError(
62
62
  "MarkItDown extractor requires an optional dependency. "
63
63
  'Install it with pip install "biblicus[markitdown]".'
64
64
  ) from import_error
65
65
  if sys.version_info < (3, 10) and not getattr(markitdown, "__biblicus_fake__", False):
66
- raise ExtractionRunFatalError(
66
+ raise ExtractionSnapshotFatalError(
67
67
  "MarkItDown requires Python 3.10 or higher. "
68
68
  "Upgrade your interpreter or use a compatible extractor."
69
69
  )
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
  from pydantic import BaseModel, ConfigDict, Field, model_validator
12
12
 
13
13
  from ..corpus import Corpus
14
- from ..errors import ExtractionRunFatalError
14
+ from ..errors import ExtractionSnapshotFatalError
15
15
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
16
  from ..user_config import resolve_openai_api_key
17
17
  from .base import TextExtractor
@@ -74,19 +74,19 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
74
74
  :type config: dict[str, Any]
75
75
  :return: Parsed configuration model.
76
76
  :rtype: OpenAiSpeechToTextExtractorConfig
77
- :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
77
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required environment is missing.
78
78
  """
79
79
  try:
80
80
  from openai import OpenAI # noqa: F401
81
81
  except ImportError as import_error:
82
- raise ExtractionRunFatalError(
82
+ raise ExtractionSnapshotFatalError(
83
83
  "OpenAI speech to text extractor requires an optional dependency. "
84
84
  'Install it with pip install "biblicus[openai]".'
85
85
  ) from import_error
86
86
 
87
87
  api_key = resolve_openai_api_key()
88
88
  if api_key is None:
89
- raise ExtractionRunFatalError(
89
+ raise ExtractionSnapshotFatalError(
90
90
  "OpenAI speech to text extractor requires an OpenAI API key. "
91
91
  "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
92
92
  "openai.api_key."
@@ -115,7 +115,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
115
115
  :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
116
116
  :return: Extracted text payload, or None when the item is not audio.
117
117
  :rtype: ExtractedText or None
118
- :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
118
+ :raises ExtractionSnapshotFatalError: If the optional dependency or required configuration is missing.
119
119
  """
120
120
  _ = previous_extractions
121
121
  if not item.media_type.startswith("audio/"):
@@ -129,7 +129,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
129
129
 
130
130
  api_key = resolve_openai_api_key()
131
131
  if api_key is None:
132
- raise ExtractionRunFatalError(
132
+ raise ExtractionSnapshotFatalError(
133
133
  "OpenAI speech to text extractor requires an OpenAI API key. "
134
134
  "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
135
135
  "openai.api_key."
@@ -138,7 +138,7 @@ class OpenAiSpeechToTextExtractor(TextExtractor):
138
138
  try:
139
139
  from openai import OpenAI
140
140
  except ImportError as import_error:
141
- raise ExtractionRunFatalError(
141
+ raise ExtractionSnapshotFatalError(
142
142
  "OpenAI speech to text extractor requires an optional dependency. "
143
143
  'Install it with pip install "biblicus[openai]".'
144
144
  ) from import_error
@@ -16,7 +16,7 @@ from typing import Any, ClassVar, Dict, List, Optional, Tuple
16
16
  from pydantic import BaseModel, ConfigDict, Field
17
17
 
18
18
  from ..corpus import Corpus
19
- from ..errors import ExtractionRunFatalError
19
+ from ..errors import ExtractionSnapshotFatalError
20
20
  from ..inference import ApiProvider, InferenceBackendConfig, InferenceBackendMode, resolve_api_key
21
21
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
22
22
  from .base import TextExtractor
@@ -26,8 +26,8 @@ class PaddleOcrVlExtractorConfig(BaseModel):
26
26
  """
27
27
  Configuration for the PaddleOCR-VL extractor.
28
28
 
29
- :ivar backend: Inference backend configuration for local or application programming interface execution.
30
- :vartype backend: InferenceBackendConfig
29
+ :ivar retriever: Inference retriever configuration for local or application programming interface execution.
30
+ :vartype retriever: InferenceBackendConfig
31
31
  :ivar min_confidence: Minimum confidence threshold for including text.
32
32
  :vartype min_confidence: float
33
33
  :ivar joiner: String used to join recognized text lines.
@@ -38,9 +38,11 @@ class PaddleOcrVlExtractorConfig(BaseModel):
38
38
  :vartype lang: str
39
39
  """
40
40
 
41
- model_config = ConfigDict(extra="forbid")
41
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
42
42
 
43
- backend: InferenceBackendConfig = Field(default_factory=InferenceBackendConfig)
43
+ retriever: InferenceBackendConfig = Field(
44
+ default_factory=InferenceBackendConfig, alias="backend"
45
+ )
44
46
  min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
45
47
  joiner: str = Field(default="\n")
46
48
  use_angle_cls: bool = Field(default=True)
@@ -70,7 +72,7 @@ class PaddleOcrVlExtractor(TextExtractor):
70
72
  :type config: dict[str, Any]
71
73
  :return: Parsed configuration model.
72
74
  :rtype: PaddleOcrVlExtractorConfig
73
- :raises ExtractionRunFatalError: If required dependencies are missing.
75
+ :raises ExtractionSnapshotFatalError: If required dependencies are missing.
74
76
  """
75
77
  import json
76
78
 
@@ -86,26 +88,26 @@ class PaddleOcrVlExtractor(TextExtractor):
86
88
 
87
89
  parsed = PaddleOcrVlExtractorConfig.model_validate(parsed_config)
88
90
 
89
- if parsed.backend.mode == InferenceBackendMode.LOCAL:
91
+ if parsed.retriever.mode == InferenceBackendMode.LOCAL:
90
92
  try:
91
93
  from paddleocr import PaddleOCR # noqa: F401
92
94
  except ImportError as import_error:
93
- raise ExtractionRunFatalError(
95
+ raise ExtractionSnapshotFatalError(
94
96
  "PaddleOCR-VL extractor (local mode) requires paddleocr. "
95
97
  'Install it with pip install "biblicus[paddleocr]".'
96
98
  ) from import_error
97
99
  else:
98
100
  # api_provider is guaranteed to be set by InferenceBackendConfig validator
99
101
  api_key = resolve_api_key(
100
- parsed.backend.api_provider,
101
- config_override=parsed.backend.api_key,
102
+ parsed.retriever.api_provider,
103
+ config_override=parsed.retriever.api_key,
102
104
  )
103
105
  if api_key is None:
104
- provider_name = parsed.backend.api_provider.value.upper()
105
- raise ExtractionRunFatalError(
106
+ provider_name = parsed.retriever.api_provider.value.upper()
107
+ raise ExtractionSnapshotFatalError(
106
108
  f"PaddleOCR-VL extractor (API mode) requires an API key for {provider_name}. "
107
109
  f"Set {provider_name}_API_KEY environment variable or configure "
108
- f"{parsed.backend.api_provider.value} in user config."
110
+ f"{parsed.retriever.api_provider.value} in user config."
109
111
  )
110
112
 
111
113
  return parsed
@@ -145,12 +147,12 @@ class PaddleOcrVlExtractor(TextExtractor):
145
147
 
146
148
  source_path = corpus.root / item.relpath
147
149
 
148
- if parsed_config.backend.mode == InferenceBackendMode.LOCAL:
150
+ if parsed_config.retriever.mode == InferenceBackendMode.LOCAL:
149
151
  text, confidence = self._extract_local(source_path, parsed_config)
150
152
  else:
151
153
  api_key = resolve_api_key(
152
- parsed_config.backend.api_provider,
153
- config_override=parsed_config.backend.api_key,
154
+ parsed_config.retriever.api_provider,
155
+ config_override=parsed_config.retriever.api_key,
154
156
  )
155
157
  text, confidence = self._extract_via_api(source_path, parsed_config, api_key)
156
158
 
@@ -228,7 +230,7 @@ class PaddleOcrVlExtractor(TextExtractor):
228
230
  :return: Tuple of extracted text and confidence score.
229
231
  :rtype: tuple[str, float or None]
230
232
  """
231
- if config.backend.api_provider == ApiProvider.HUGGINGFACE:
233
+ if config.retriever.api_provider == ApiProvider.HUGGINGFACE:
232
234
  return self._extract_via_huggingface_api(source_path, config, api_key)
233
235
  else:
234
236
  return "", None
@@ -257,7 +259,7 @@ class PaddleOcrVlExtractor(TextExtractor):
257
259
 
258
260
  headers = {"Authorization": f"Bearer {api_key}"}
259
261
 
260
- model_id = config.backend.model_id or "PaddlePaddle/PaddleOCR-VL"
262
+ model_id = config.retriever.model_id or "PaddlePaddle/PaddleOCR-VL"
261
263
  api_url = f"https://api-inference.huggingface.co/models/{model_id}"
262
264
  response = requests.post(
263
265
  api_url,
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional
9
9
  from pydantic import BaseModel, ConfigDict, Field, model_validator
10
10
 
11
11
  from ..corpus import Corpus
12
- from ..errors import ExtractionRunFatalError
12
+ from ..errors import ExtractionSnapshotFatalError
13
13
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
14
14
  from .base import TextExtractor
15
15
 
@@ -20,14 +20,14 @@ class PipelineStepSpec(BaseModel):
20
20
 
21
21
  :ivar extractor_id: Extractor plugin identifier.
22
22
  :vartype extractor_id: str
23
- :ivar config: Extractor configuration mapping.
24
- :vartype config: dict[str, Any]
23
+ :ivar configuration: Extractor configuration mapping.
24
+ :vartype configuration: dict[str, Any]
25
25
  """
26
26
 
27
- model_config = ConfigDict(extra="forbid")
27
+ model_config = ConfigDict(extra="forbid", populate_by_name=True)
28
28
 
29
29
  extractor_id: str = Field(min_length=1)
30
- config: Dict[str, Any] = Field(default_factory=dict)
30
+ configuration: Dict[str, Any] = Field(default_factory=dict, alias="config")
31
31
 
32
32
 
33
33
  class PipelineExtractorConfig(BaseModel):
@@ -92,7 +92,7 @@ class PipelineExtractor(TextExtractor):
92
92
  :type config: PipelineExtractorConfig
93
93
  :param previous_extractions: Prior step outputs for this item within the pipeline.
94
94
  :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
95
- :raises ExtractionRunFatalError: Always, because the pipeline is executed by the runner.
95
+ :raises ExtractionSnapshotFatalError: Always, because the pipeline is executed by the runner.
96
96
  :return: None.
97
97
  :rtype: None
98
98
  """
@@ -100,6 +100,6 @@ class PipelineExtractor(TextExtractor):
100
100
  _ = item
101
101
  _ = config
102
102
  _ = previous_extractions
103
- raise ExtractionRunFatalError(
104
- "Pipeline extractor must be executed by the extraction runner."
103
+ raise ExtractionSnapshotFatalError(
104
+ "Pipeline extractor must be executed by the extraction snapshotner."
105
105
  )
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional
12
12
  from pydantic import BaseModel, ConfigDict, Field
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..errors import ExtractionRunFatalError
15
+ from ..errors import ExtractionSnapshotFatalError
16
16
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
17
  from .base import TextExtractor
18
18
 
@@ -54,12 +54,12 @@ class RapidOcrExtractor(TextExtractor):
54
54
  :type config: dict[str, Any]
55
55
  :return: Parsed configuration model.
56
56
  :rtype: RapidOcrExtractorConfig
57
- :raises ExtractionRunFatalError: If the optional dependency is missing.
57
+ :raises ExtractionSnapshotFatalError: If the optional dependency is missing.
58
58
  """
59
59
  try:
60
60
  from rapidocr_onnxruntime import RapidOCR # noqa: F401
61
61
  except ImportError as import_error:
62
- raise ExtractionRunFatalError(
62
+ raise ExtractionSnapshotFatalError(
63
63
  "RapidOCR extractor requires an optional dependency. "
64
64
  'Install it with pip install "biblicus[ocr]".'
65
65
  ) from import_error
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
  from pydantic import BaseModel, ConfigDict
12
12
 
13
13
  from ..corpus import Corpus
14
- from ..errors import ExtractionRunFatalError
14
+ from ..errors import ExtractionSnapshotFatalError
15
15
  from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
16
  from .base import TextExtractor
17
17
 
@@ -48,12 +48,12 @@ class UnstructuredExtractor(TextExtractor):
48
48
  :type config: dict[str, Any]
49
49
  :return: Parsed config.
50
50
  :rtype: UnstructuredExtractorConfig
51
- :raises ExtractionRunFatalError: If the optional dependency is not installed.
51
+ :raises ExtractionSnapshotFatalError: If the optional dependency is not installed.
52
52
  """
53
53
  try:
54
54
  from unstructured.partition.auto import partition # noqa: F401
55
55
  except ImportError as import_error:
56
- raise ExtractionRunFatalError(
56
+ raise ExtractionSnapshotFatalError(
57
57
  "Unstructured extractor requires an optional dependency. "
58
58
  'Install it with pip install "biblicus[unstructured]".'
59
59
  ) from import_error
biblicus/hooks.py CHANGED
@@ -18,8 +18,8 @@ class HookPoint(str, Enum):
18
18
  :cvar after_ingest: Called after an item is ingested and indexed.
19
19
  :cvar before_reindex: Called before a catalog rebuild starts.
20
20
  :cvar after_reindex: Called after a catalog rebuild completes.
21
- :cvar before_build_run: Called before a backend run build starts.
22
- :cvar after_build_run: Called after a backend run build completes.
21
+ :cvar before_build_snapshot: Called before a retriever snapshot build starts.
22
+ :cvar after_build_snapshot: Called after a retriever snapshot build completes.
23
23
  :cvar before_query: Called before a query is executed.
24
24
  :cvar after_query: Called after a query completes.
25
25
  :cvar before_evaluate_run: Called before an evaluation starts.
@@ -30,8 +30,8 @@ class HookPoint(str, Enum):
30
30
  after_ingest = "after_ingest"
31
31
  before_reindex = "before_reindex"
32
32
  after_reindex = "after_reindex"
33
- before_build_run = "before_build_run"
34
- after_build_run = "after_build_run"
33
+ before_build_snapshot = "before_build_snapshot"
34
+ after_build_snapshot = "after_build_snapshot"
35
35
  before_query = "before_query"
36
36
  after_query = "after_query"
37
37
  before_evaluate_run = "before_evaluate_run"