biblicus 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -109,6 +109,7 @@ class RapidOcrExtractor(TextExtractor):
109
109
  return ExtractedText(text="", producer_extractor_id=self.extractor_id)
110
110
 
111
111
  lines: list[str] = []
112
+ confidences: list[float] = []
112
113
  for entry in result:
113
114
  if not isinstance(entry, list) or len(entry) < 3:
114
115
  continue
@@ -124,6 +125,12 @@ class RapidOcrExtractor(TextExtractor):
124
125
  cleaned = text_value.strip()
125
126
  if cleaned:
126
127
  lines.append(cleaned)
128
+ confidences.append(confidence)
127
129
 
128
130
  text = parsed_config.joiner.join(lines).strip()
129
- return ExtractedText(text=text, producer_extractor_id=self.extractor_id)
131
+ avg_confidence = sum(confidences) / len(confidences) if confidences else None
132
+ return ExtractedText(
133
+ text=text,
134
+ producer_extractor_id=self.extractor_id,
135
+ confidence=avg_confidence,
136
+ )
@@ -0,0 +1,121 @@
1
+ """
2
+ Simple override selection extractor that always uses the last extraction for matching types.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import fnmatch
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from pydantic import BaseModel, ConfigDict, Field
11
+
12
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
13
+ from .base import TextExtractor
14
+
15
+
16
class SelectOverrideConfig(BaseModel):
    """
    Settings for the ``select-override`` extractor.

    :ivar media_type_patterns: Media type patterns for which the override applies.
        Defaults to ``["*/*"]``.
    :vartype media_type_patterns: list[str]
    :ivar fallback_to_first: When True, items matching no pattern take the first
        extraction instead of the last.
    :vartype fallback_to_first: bool
    """

    model_config = ConfigDict(extra="forbid")

    media_type_patterns: List[str] = Field(default_factory=lambda: ["*/*"])
    fallback_to_first: bool = False
30
+
31
+
32
class SelectOverrideExtractor(TextExtractor):
    """
    Selection extractor that forwards one of the earlier pipeline outputs.

    The most recent extraction that carries text is chosen whenever the item's media
    type matches one of the configured patterns. Items matching no pattern also get the
    most recent extraction, unless ``fallback_to_first`` is enabled, in which case the
    earliest extraction with text is used.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-override"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Build a :class:`SelectOverrideConfig` from a raw configuration mapping.

        String values starting with ``[`` are decoded as JSON first, because values
        arriving in CLI string format carry lists that way. Strings that fail to
        decode are passed through unchanged and left for model validation to judge.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectOverrideConfig
        """
        import json

        def decode(raw: Any) -> Any:
            # Only strings that look like a JSON list are candidates for decoding.
            if not (isinstance(raw, str) and raw.startswith("[")):
                return raw
            try:
                return json.loads(raw)
            except json.JSONDecodeError:
                return raw

        normalized = {name: decode(raw) for name, raw in config.items()}
        return SelectOverrideConfig.model_validate(normalized)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Pick one prior extraction according to the override rules.

        :param corpus: Corpus containing the item bytes (unused here).
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectOverrideConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload, or None when no prior step produced text.
        :rtype: ExtractedText or None
        """
        _ = corpus
        if isinstance(config, SelectOverrideConfig):
            settings = config
        else:
            settings = SelectOverrideConfig.model_validate(config)

        # Steps whose text is None never take part in the selection.
        with_text = [step for step in previous_extractions if step.text is not None]
        if not with_text:
            return None

        # NOTE(review): fnmatch.fnmatch case-normalizes via os.path.normcase, so case
        # sensitivity follows the host OS — confirm that is intended for media types.
        override_applies = False
        for pattern in settings.media_type_patterns:
            if fnmatch.fnmatch(item.media_type, pattern):
                override_applies = True
                break

        # The first extraction is used only for non-matching items with the fallback on.
        take_first = settings.fallback_to_first and not override_applies
        chosen = with_text[0] if take_first else with_text[-1]

        origin = chosen.producer_extractor_id or chosen.extractor_id
        return ExtractedText(
            text=chosen.text or "",
            producer_extractor_id=origin,
            source_step_index=chosen.step_index,
            confidence=chosen.confidence,
        )
@@ -0,0 +1,187 @@
1
+ """
2
+ Smart override selection extractor that intelligently chooses between extraction results.
3
+
4
+ This extractor implements the smart override behavior where it compares the most recent
5
+ extraction against previous ones and makes an intelligent choice based on content quality
6
+ and confidence scores.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import fnmatch
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from pydantic import BaseModel, ConfigDict, Field
15
+
16
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
+ from .base import TextExtractor
18
+
19
+
20
class SelectSmartOverrideConfig(BaseModel):
    """
    Settings for the ``select-smart-override`` extractor.

    :ivar media_type_patterns: Media type patterns the smart override applies to
        (e.g., image/*). Defaults to ``["*/*"]``.
    :vartype media_type_patterns: list[str]
    :ivar min_confidence_threshold: Lowest confidence, between 0.0 and 1.0, at which an
        extraction still counts as good. Defaults to 0.7.
    :vartype min_confidence_threshold: float
    :ivar min_text_length: Shortest text length, zero or more, at which an extraction
        still counts as meaningful. Defaults to 10.
    :vartype min_text_length: int
    """

    model_config = ConfigDict(extra="forbid")

    media_type_patterns: List[str] = Field(default_factory=lambda: ["*/*"])
    min_confidence_threshold: float = Field(0.7, ge=0.0, le=1.0)
    min_text_length: int = Field(10, ge=0)
37
+
38
+
39
class SelectSmartOverrideExtractor(TextExtractor):
    """
    Selection extractor that prefers the latest extraction unless it looks poor.

    For items whose media type matches a configured pattern the rules are:

    1. A non-matching media type always yields the last extraction.
    2. A last extraction with meaningful content is used as-is.
    3. Otherwise the meaningful earlier extraction with the highest confidence is
       used (a missing confidence counts as 0.0; ties go to the earliest step).
    4. When no earlier extraction is meaningful, the last extraction is used.

    Meaningful content means stripped text length >= min_text_length AND (confidence
    not below min_confidence_threshold OR confidence not available).

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-smart-override"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Build a :class:`SelectSmartOverrideConfig` from a raw configuration mapping.

        String values starting with ``[`` are decoded as JSON first, because values
        arriving in CLI string format carry lists that way. Strings that fail to
        decode are passed through unchanged and left for model validation to judge.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectSmartOverrideConfig
        """
        import json

        def decode(raw: Any) -> Any:
            # Only strings that look like a JSON list are candidates for decoding.
            if not (isinstance(raw, str) and raw.startswith("[")):
                return raw
            try:
                return json.loads(raw)
            except json.JSONDecodeError:
                return raw

        normalized = {name: decode(raw) for name, raw in config.items()}
        return SelectSmartOverrideConfig.model_validate(normalized)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Pick one prior extraction according to the smart override rules.

        :param corpus: Corpus containing the item bytes (unused here).
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectSmartOverrideConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload, or None when no prior step produced text.
        :rtype: ExtractedText or None
        """
        _ = corpus
        if isinstance(config, SelectSmartOverrideConfig):
            settings = config
        else:
            settings = SelectSmartOverrideConfig.model_validate(config)

        # NOTE(review): fnmatch.fnmatch case-normalizes via os.path.normcase, so case
        # sensitivity follows the host OS — confirm that is intended for media types.
        smart_rules_apply = False
        for pattern in settings.media_type_patterns:
            if fnmatch.fnmatch(item.media_type, pattern):
                smart_rules_apply = True
                break

        # Steps whose text is None never take part in the selection.
        with_text = [step for step in previous_extractions if step.text is not None]
        if not with_text:
            return None

        latest = with_text[-1]
        if not smart_rules_apply:
            return self._extraction_to_result(latest)
        if self._is_meaningful(latest, settings):
            return self._extraction_to_result(latest)

        # The latest output is weak: look for the strongest meaningful earlier one.
        winner: Optional[ExtractionStepOutput] = None
        winner_score = -1.0
        for earlier in with_text[:-1]:
            if not self._is_meaningful(earlier, settings):
                continue
            score = 0.0 if earlier.confidence is None else earlier.confidence
            # Strict comparison keeps the earliest step when scores tie.
            if score > winner_score:
                winner, winner_score = earlier, score

        return self._extraction_to_result(latest if winner is None else winner)

    def _is_meaningful(
        self, extraction: ExtractionStepOutput, config: SelectSmartOverrideConfig
    ) -> bool:
        """
        Decide whether an extraction carries usable content.

        :param extraction: Extraction step output to check.
        :type extraction: ExtractionStepOutput
        :param config: Parsed configuration.
        :type config: SelectSmartOverrideConfig
        :return: True when the stripped text is long enough and the confidence, if
            present, is not below the configured threshold.
        :rtype: bool
        """
        stripped = (extraction.text or "").strip()
        if len(stripped) < config.min_text_length:
            return False

        score = extraction.confidence
        # A missing confidence is not held against the extraction.
        return score is None or not score < config.min_confidence_threshold

    def _extraction_to_result(self, extraction: ExtractionStepOutput) -> ExtractedText:
        """
        Wrap an ExtractionStepOutput as the ExtractedText returned by this extractor.

        :param extraction: Extraction step output to convert.
        :type extraction: ExtractionStepOutput
        :return: Extracted text result attributed to the step that produced it.
        :rtype: ExtractedText
        """
        origin = extraction.producer_extractor_id or extraction.extractor_id
        return ExtractedText(
            text=extraction.text or "",
            producer_extractor_id=origin,
            source_step_index=extraction.step_index,
            confidence=extraction.confidence,
        )
biblicus/inference.py ADDED
@@ -0,0 +1,104 @@
1
+ """
2
+ Inference backend abstraction for machine learning powered components.
3
+
4
+ This module provides reusable configuration and credential resolution patterns for components
5
+ that can execute locally or via API providers.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from enum import Enum
12
+ from typing import Optional
13
+
14
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
15
+
16
+
17
class InferenceBackendMode(str, Enum):
    """
    Where an inference backend executes.

    Members are strings, so they compare equal to their plain values.
    """

    LOCAL = "local"
    API = "api"
22
+
23
+
24
class ApiProvider(str, Enum):
    """
    API providers that inference can be delegated to.

    Members are strings, so they compare equal to their plain values.
    """

    HUGGINGFACE = "huggingface"
    OPENAI = "openai"
29
+
30
+
31
class InferenceBackendConfig(BaseModel):
    """
    Embeddable settings that describe where inference runs.

    Extractor or transformer configurations can nest this model to expose the same
    local-versus-API switch everywhere.

    :ivar mode: Execution mode, either local or API. Defaults to local.
    :vartype mode: InferenceBackendMode
    :ivar api_provider: API provider; required when ``mode`` is ``api``.
    :vartype api_provider: ApiProvider or None
    :ivar api_key: Optional per-config API key override.
    :vartype api_key: str or None
    :ivar model_id: Optional model identifier for API requests.
    :vartype model_id: str or None
    """

    model_config = ConfigDict(extra="forbid")

    mode: InferenceBackendMode = InferenceBackendMode.LOCAL
    api_provider: Optional[ApiProvider] = None
    api_key: Optional[str] = None
    model_id: Optional[str] = None

    @model_validator(mode="after")
    def _validate_api_provider_required(self) -> "InferenceBackendConfig":
        """Reject API mode when no provider has been chosen."""
        provider_missing = self.api_provider is None
        if provider_missing and self.mode == InferenceBackendMode.API:
            raise ValueError("api_provider is required when mode is 'api'")
        return self
60
+
61
+
62
def resolve_api_key(
    provider: ApiProvider,
    *,
    config_override: Optional[str] = None,
) -> Optional[str]:
    """
    Resolve an API key for ``provider`` using fixed precedence rules.

    Precedence order (highest to lowest):

    1. Explicit ``config_override`` (ignored when empty)
    2. Environment variable for the provider
    3. User configuration file

    :param provider: API provider to resolve a key for. A plain provider value
        such as ``"openai"`` is accepted as well.
    :type provider: ApiProvider
    :param config_override: Optional explicit key from configuration.
    :type config_override: str or None
    :return: Resolved API key, or None if unavailable or the provider is unknown.
    :rtype: str or None
    """
    # An empty override is treated like a missing one, matching how an empty
    # environment variable is handled below; an empty string is never a usable key.
    if config_override:
        return config_override

    # provider value -> (environment variable, attribute on the user config model)
    sources = {
        "huggingface": ("HUGGINGFACE_API_KEY", "huggingface"),
        "openai": ("OPENAI_API_KEY", "openai"),
    }
    provider_name = getattr(provider, "value", provider)
    source = sources.get(provider_name)
    if source is None:
        return None
    env_var, config_attr = source

    env_key = os.environ.get(env_var)
    if env_key:
        return env_key

    # Imported lazily, and only once the cheaper sources are exhausted.
    from .user_config import load_user_config

    provider_section = getattr(load_user_config(), config_attr)
    if provider_section is None:
        return None
    return provider_section.api_key
biblicus/models.py CHANGED
@@ -399,6 +399,8 @@ class ExtractedText(BaseModel):
399
399
  :vartype producer_extractor_id: str
400
400
  :ivar source_step_index: Optional pipeline step index where this text originated.
401
401
  :vartype source_step_index: int or None
402
+ :ivar confidence: Optional confidence score from 0.0 to 1.0.
403
+ :vartype confidence: float or None
402
404
  """
403
405
 
404
406
  model_config = ConfigDict(extra="forbid")
@@ -406,6 +408,7 @@ class ExtractedText(BaseModel):
406
408
  text: str
407
409
  producer_extractor_id: str = Field(min_length=1)
408
410
  source_step_index: Optional[int] = Field(default=None, ge=1)
411
+ confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
409
412
 
410
413
 
411
414
  class ExtractionStepOutput(BaseModel):
@@ -426,6 +429,8 @@ class ExtractionStepOutput(BaseModel):
426
429
  :vartype producer_extractor_id: str or None
427
430
  :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
428
431
  :vartype source_step_index: int or None
432
+ :ivar confidence: Optional confidence score from 0.0 to 1.0.
433
+ :vartype confidence: float or None
429
434
  :ivar error_type: Optional error type name for errored steps.
430
435
  :vartype error_type: str or None
431
436
  :ivar error_message: Optional error message for errored steps.
@@ -441,5 +446,6 @@ class ExtractionStepOutput(BaseModel):
441
446
  text_characters: int = Field(default=0, ge=0)
442
447
  producer_extractor_id: Optional[str] = None
443
448
  source_step_index: Optional[int] = Field(default=None, ge=1)
449
+ confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
444
450
  error_type: Optional[str] = None
445
451
  error_message: Optional[str] = None
biblicus/user_config.py CHANGED
@@ -29,17 +29,49 @@ class OpenAiUserConfig(BaseModel):
29
29
  api_key: str = Field(min_length=1)
30
30
 
31
31
 
32
class HuggingFaceUserConfig(BaseModel):
    """
    User configuration section for HuggingFace.

    :ivar api_key: Non-empty HuggingFace API key used for authenticated requests.
    :vartype api_key: str
    """

    model_config = ConfigDict(extra="forbid")

    api_key: str = Field(min_length=1)
43
+
44
+
45
class DeepgramUserConfig(BaseModel):
    """
    User configuration section for Deepgram.

    :ivar api_key: Non-empty Deepgram API key used for authenticated requests.
    :vartype api_key: str
    """

    model_config = ConfigDict(extra="forbid")

    api_key: str = Field(min_length=1)
56
+
57
+
32
58
class BiblicusUserConfig(BaseModel):
    """
    Top-level user configuration model for Biblicus.

    Every provider section is optional and defaults to None.

    :ivar openai: Optional OpenAI configuration.
    :vartype openai: OpenAiUserConfig or None
    :ivar huggingface: Optional HuggingFace configuration.
    :vartype huggingface: HuggingFaceUserConfig or None
    :ivar deepgram: Optional Deepgram configuration.
    :vartype deepgram: DeepgramUserConfig or None
    """

    model_config = ConfigDict(extra="forbid")

    openai: Optional[OpenAiUserConfig] = Field(default=None)
    huggingface: Optional[HuggingFaceUserConfig] = Field(default=None)
    deepgram: Optional[DeepgramUserConfig] = Field(default=None)
43
75
 
44
76
 
45
77
  def default_user_config_paths(
@@ -136,3 +168,47 @@ def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Op
136
168
  if loaded.openai is None:
137
169
  return None
138
170
  return loaded.openai.api_key
171
+
172
+
173
def resolve_huggingface_api_key(
    *, config: Optional[BiblicusUserConfig] = None
) -> Optional[str]:
    """
    Look up the HuggingFace API key.

    A non-empty ``HUGGINGFACE_API_KEY`` environment variable wins; otherwise the key
    comes from the user configuration, which is loaded only when ``config`` is not given.

    :param config: Optional pre-loaded user configuration.
    :type config: BiblicusUserConfig or None
    :return: API key string, or None when no key is available.
    :rtype: str or None
    """
    from_environment = os.environ.get("HUGGINGFACE_API_KEY")
    if from_environment:
        return from_environment

    settings = config or load_user_config()
    section = settings.huggingface
    return None if section is None else section.api_key
193
+
194
+
195
def resolve_deepgram_api_key(
    *, config: Optional[BiblicusUserConfig] = None
) -> Optional[str]:
    """
    Look up the Deepgram API key.

    A non-empty ``DEEPGRAM_API_KEY`` environment variable wins; otherwise the key
    comes from the user configuration, which is loaded only when ``config`` is not given.

    :param config: Optional pre-loaded user configuration.
    :type config: BiblicusUserConfig or None
    :return: API key string, or None when no key is available.
    :rtype: str or None
    """
    from_environment = os.environ.get("DEEPGRAM_API_KEY")
    if from_environment:
        return from_environment

    settings = config or load_user_config()
    section = settings.deepgram
    return None if section is None else section.api_key