biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +33 -49
  11. biblicus/corpus.py +39 -58
  12. biblicus/errors.py +15 -0
  13. biblicus/evaluation.py +4 -8
  14. biblicus/extraction.py +276 -77
  15. biblicus/extractors/__init__.py +14 -3
  16. biblicus/extractors/base.py +12 -5
  17. biblicus/extractors/metadata_text.py +13 -5
  18. biblicus/extractors/openai_stt.py +180 -0
  19. biblicus/extractors/pass_through_text.py +16 -6
  20. biblicus/extractors/pdf_text.py +100 -0
  21. biblicus/extractors/pipeline.py +105 -0
  22. biblicus/extractors/rapidocr_text.py +129 -0
  23. biblicus/extractors/select_longest_text.py +105 -0
  24. biblicus/extractors/select_text.py +100 -0
  25. biblicus/extractors/unstructured_text.py +100 -0
  26. biblicus/frontmatter.py +0 -3
  27. biblicus/hook_logging.py +0 -5
  28. biblicus/hook_manager.py +3 -5
  29. biblicus/hooks.py +3 -7
  30. biblicus/ignore.py +0 -3
  31. biblicus/models.py +87 -0
  32. biblicus/retrieval.py +0 -4
  33. biblicus/sources.py +44 -9
  34. biblicus/time.py +0 -1
  35. biblicus/uris.py +3 -4
  36. biblicus/user_config.py +138 -0
  37. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
  38. biblicus-0.3.0.dist-info/RECORD +44 -0
  39. biblicus/extractors/cascade.py +0 -101
  40. biblicus-0.2.0.dist-info/RECORD +0 -32
  41. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
  42. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,180 @@
1
+ """
2
+ OpenAI-backed speech to text extractor plugin.
3
+
4
+ This extractor is implemented as an optional dependency so the core installation stays small.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
12
+
13
+ from ..corpus import Corpus
14
+ from ..errors import ExtractionRunFatalError
15
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
+ from ..user_config import resolve_openai_api_key
17
+ from .base import TextExtractor
18
+
19
+
20
class OpenAiSpeechToTextExtractorConfig(BaseModel):
    """
    Settings accepted by the OpenAI speech to text extractor.

    Unknown keys are rejected because the model is configured with ``extra="forbid"``.

    :ivar model: OpenAI transcription model identifier.
    :vartype model: str
    :ivar response_format: OpenAI transcription response format.
    :vartype response_format: str
    :ivar language: Optional language code hint for transcription.
    :vartype language: str or None
    :ivar prompt: Optional prompt text to guide transcription.
    :vartype prompt: str or None
    :ivar no_speech_probability_threshold: Optional threshold, between 0.0 and 1.0, for
        suppressing hallucinated transcripts.
    :vartype no_speech_probability_threshold: float or None
    """

    model_config = ConfigDict(extra="forbid")

    model: str = Field(default="whisper-1", min_length=1)
    response_format: str = Field(default="json", min_length=1)
    language: Optional[str] = Field(default=None, min_length=1)
    prompt: Optional[str] = Field(default=None, min_length=1)
    no_speech_probability_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_no_speech_threshold(self) -> "OpenAiSpeechToTextExtractorConfig":
        """
        Reject a no-speech threshold unless the response format is ``verbose_json``.

        :return: The validated configuration instance.
        :rtype: OpenAiSpeechToTextExtractorConfig
        :raises ValueError: If a threshold is set with any other response format.
        """
        threshold_requested = self.no_speech_probability_threshold is not None
        if threshold_requested and self.response_format != "verbose_json":
            raise ValueError(
                "no_speech_probability_threshold requires response_format='verbose_json' "
                "so the transcription API returns per-segment no-speech probabilities"
            )
        return self
54
+
55
+
56
class OpenAiSpeechToTextExtractor(TextExtractor):
    """
    Extractor plugin that transcribes audio items using the OpenAI API.

    This extractor is intended as a practical, hosted speech to text implementation.
    It skips non-audio items.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "stt-openai"

    @staticmethod
    def _require_openai_client_class() -> Any:
        """
        Import and return the ``OpenAI`` client class.

        :return: The ``openai.OpenAI`` class.
        :rtype: Any
        :raises ExtractionRunFatalError: If the optional ``openai`` dependency is missing.
        """
        try:
            from openai import OpenAI
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an optional dependency. "
                'Install it with pip install "biblicus[openai]".'
            ) from import_error
        return OpenAI

    @staticmethod
    def _require_api_key() -> Any:
        """
        Resolve the OpenAI API key or fail the run.

        :return: The resolved API key value.
        :rtype: Any
        :raises ExtractionRunFatalError: If no API key can be resolved.
        """
        api_key = resolve_openai_api_key()
        if api_key is None:
            raise ExtractionRunFatalError(
                "OpenAI speech to text extractor requires an OpenAI API key. "
                "Set OPENAI_API_KEY or configure it in ~/.biblicus/config.yml or ./.biblicus/config.yml under "
                "openai.api_key."
            )
        return api_key

    @staticmethod
    def _read_field(container: Any, name: str) -> Any:
        """
        Read ``name`` from either a mapping or an attribute-style response object.

        :param container: Mapping or object to read from.
        :type container: Any
        :param name: Key or attribute name.
        :type name: str
        :return: The stored value, or None when it is absent.
        :rtype: Any
        """
        if isinstance(container, dict):
            return container.get(name)
        return getattr(container, name, None)

    @classmethod
    def _max_no_speech_probability(cls, segments: Any) -> Optional[float]:
        """
        Return the highest per-segment no-speech probability, if any is reported.

        :param segments: Segment collection taken from the transcription result.
        :type segments: Any
        :return: Largest numeric probability found, or None when there is none.
        :rtype: float or None
        """
        if not isinstance(segments, (list, tuple)):
            return None
        probabilities: List[float] = []
        for entry in segments:
            value = cls._read_field(entry, "no_speech_prob")
            if value is None:
                value = cls._read_field(entry, "no_speech_probability")
            if isinstance(value, (int, float)):
                probabilities.append(float(value))
        return max(probabilities, default=None)

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: OpenAiSpeechToTextExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency or required environment is missing.
        """
        # Order matters for which error surfaces first: dependency, then key, then config.
        self._require_openai_client_class()
        self._require_api_key()
        return OpenAiSpeechToTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Transcribe an audio item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: OpenAiSpeechToTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not audio.
        :rtype: ExtractedText or None
        :raises ExtractionRunFatalError: If the optional dependency or required configuration is missing.
        """
        _ = previous_extractions
        if not item.media_type.startswith("audio/"):
            return None

        parsed_config = (
            config
            if isinstance(config, OpenAiSpeechToTextExtractorConfig)
            else OpenAiSpeechToTextExtractorConfig.model_validate(config)
        )

        api_key = self._require_api_key()
        client_class = self._require_openai_client_class()

        client = client_class(api_key=api_key)
        source_path = corpus.root / item.relpath
        with source_path.open("rb") as audio_handle:
            result = client.audio.transcriptions.create(
                file=audio_handle,
                model=parsed_config.model,
                response_format=parsed_config.response_format,
                language=parsed_config.language,
                prompt=parsed_config.prompt,
            )

        # The result may be a plain string (text-like response formats), a mapping,
        # or an attribute-style response object. All three must yield the transcript;
        # previously a string result produced an empty transcript.
        if isinstance(result, str):
            transcript_text = result
            segments = None
        else:
            transcript_text = str(self._read_field(result, "text") or "")
            segments = self._read_field(result, "segments")

        # Apply the threshold to object results as well as mappings; previously it
        # was silently ignored unless the result was a dict.
        threshold = parsed_config.no_speech_probability_threshold
        if threshold is not None:
            highest = self._max_no_speech_probability(segments)
            if highest is not None and highest >= threshold:
                transcript_text = ""

        return ExtractedText(text=transcript_text.strip(), producer_extractor_id=self.extractor_id)
@@ -4,13 +4,13 @@ Pass-through extractor for text items.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- from typing import Any, Dict, Optional
7
+ from typing import Any, Dict, List, Optional
8
8
 
9
9
  from pydantic import BaseModel, ConfigDict
10
10
 
11
11
  from ..corpus import Corpus
12
12
  from ..frontmatter import parse_front_matter
13
- from ..models import CatalogItem, ExtractedText
13
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
14
14
  from .base import TextExtractor
15
15
 
16
16
 
@@ -45,10 +45,16 @@ class PassThroughTextExtractor(TextExtractor):
45
45
  :return: Parsed config.
46
46
  :rtype: PassThroughTextExtractorConfig
47
47
  """
48
-
49
48
  return PassThroughTextExtractorConfig.model_validate(config)
50
49
 
51
- def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
50
+ def extract_text(
51
+ self,
52
+ *,
53
+ corpus: Corpus,
54
+ item: CatalogItem,
55
+ config: BaseModel,
56
+ previous_extractions: List[ExtractionStepOutput],
57
+ ) -> Optional[ExtractedText]:
52
58
  """
53
59
  Extract text by reading the raw item content from the corpus.
54
60
 
@@ -58,11 +64,13 @@ class PassThroughTextExtractor(TextExtractor):
58
64
  :type item: CatalogItem
59
65
  :param config: Parsed configuration model.
60
66
  :type config: PassThroughTextExtractorConfig
67
+ :param previous_extractions: Prior step outputs for this item within the pipeline.
68
+ :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
61
69
  :return: Extracted text payload, or None if the item is not text.
62
70
  :rtype: ExtractedText or None
63
71
  """
64
-
65
72
  _ = config
73
+ _ = previous_extractions
66
74
  media_type = item.media_type
67
75
  if media_type != "text/markdown" and not media_type.startswith("text/"):
68
76
  return None
@@ -71,4 +79,6 @@ class PassThroughTextExtractor(TextExtractor):
71
79
  markdown_text = raw_bytes.decode("utf-8")
72
80
  parsed_document = parse_front_matter(markdown_text)
73
81
  return ExtractedText(text=parsed_document.body, producer_extractor_id=self.extractor_id)
74
- return ExtractedText(text=raw_bytes.decode("utf-8"), producer_extractor_id=self.extractor_id)
82
+ return ExtractedText(
83
+ text=raw_bytes.decode("utf-8"), producer_extractor_id=self.extractor_id
84
+ )
@@ -0,0 +1,100 @@
1
+ """
2
+ Portable Document Format text extractor plugin.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from io import BytesIO
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from pydantic import BaseModel, ConfigDict, Field
11
+ from pypdf import PdfReader
12
+
13
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
14
+ from .base import TextExtractor
15
+
16
+
17
class PortableDocumentFormatTextExtractorConfig(BaseModel):
    """
    Settings accepted by the Portable Document Format text extractor.

    Unknown keys are rejected because the model is configured with ``extra="forbid"``.

    :ivar max_pages: Optional maximum number of pages to process; must be at least 1 when set.
    :vartype max_pages: int or None
    """

    model_config = ConfigDict(extra="forbid")

    max_pages: Optional[int] = Field(default=None, ge=1)
28
+
29
+
30
class PortableDocumentFormatTextExtractor(TextExtractor):
    """
    Extractor plugin that pulls embedded text out of Portable Document Format items.

    Only items whose media type is `application/pdf` are handled.
    Every other media type is skipped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pdf-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate the extractor configuration mapping.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: PortableDocumentFormatTextExtractorConfig
        """
        return PortableDocumentFormatTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Read a Portable Document Format item and join the text of its pages.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: PortableDocumentFormatTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not a Portable Document Format item.
        :rtype: ExtractedText or None
        """
        _ = previous_extractions
        if item.media_type != "application/pdf":
            return None

        if isinstance(config, PortableDocumentFormatTextExtractorConfig):
            settings = config
        else:
            settings = PortableDocumentFormatTextExtractorConfig.model_validate(config)

        document_bytes = (corpus.root / item.relpath).read_bytes()
        document = PdfReader(BytesIO(document_bytes))

        selected_pages = list(document.pages)
        page_limit = settings.max_pages
        if page_limit is not None:
            selected_pages = selected_pages[: int(page_limit)]

        # A page with no extractable text contributes an empty string.
        page_texts = [(page.extract_text() or "") for page in selected_pages]

        return ExtractedText(
            text="\n".join(page_texts).strip(),
            producer_extractor_id=self.extractor_id,
        )
@@ -0,0 +1,105 @@
1
+ """
2
+ Pipeline extractor configuration and validation.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
10
+
11
+ from ..corpus import Corpus
12
+ from ..errors import ExtractionRunFatalError
13
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
14
+ from .base import TextExtractor
15
+
16
+
17
class PipelineStepSpec(BaseModel):
    """
    Description of one extractor step inside a pipeline.

    Unknown keys are rejected because the model is configured with ``extra="forbid"``.

    :ivar extractor_id: Extractor plugin identifier; must be non-empty.
    :vartype extractor_id: str
    :ivar config: Extractor configuration mapping; defaults to an empty mapping.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    config: Dict[str, Any] = Field(default_factory=dict)
31
+
32
+
33
class PipelineExtractorConfig(BaseModel):
    """
    Settings accepted by the pipeline extractor.

    Unknown keys are rejected because the model is configured with ``extra="forbid"``.

    :ivar steps: Ordered list of extractor steps to run; at least one step is required.
    :vartype steps: list[PipelineStepSpec]
    """

    model_config = ConfigDict(extra="forbid")

    steps: List[PipelineStepSpec] = Field(min_length=1)

    @model_validator(mode="after")
    def _forbid_pipeline_step(self) -> "PipelineExtractorConfig":
        """
        Reject configurations in which a step names the pipeline extractor itself.

        :return: The validated configuration instance.
        :rtype: PipelineExtractorConfig
        :raises ValueError: If any step uses the ``pipeline`` extractor identifier.
        """
        for step in self.steps:
            if step.extractor_id == "pipeline":
                raise ValueError("Pipeline steps cannot include the pipeline extractor itself")
        return self
50
+
51
+
52
class PipelineExtractor(TextExtractor):
    """
    Configuration-only shim for the pipeline extractor.

    The extraction engine runs the pipeline itself so it can persist per-step
    artifacts. This class validates configuration and refuses direct execution.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pipeline"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate the pipeline configuration mapping.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: PipelineExtractorConfig
        """
        return PipelineExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Refuse direct execution; the pipeline is run by the extraction runner.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: PipelineExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :raises ExtractionRunFatalError: Always, because the pipeline is executed by the runner.
        :return: None.
        :rtype: None
        """
        # All arguments are intentionally unused.
        _ = (corpus, item, config, previous_extractions)
        message = "Pipeline extractor must be executed by the extraction runner."
        raise ExtractionRunFatalError(message)
@@ -0,0 +1,129 @@
1
+ """
2
+ RapidOCR-backed optical character recognition extractor plugin.
3
+
4
+ This extractor is an optional dependency. It exists as a practical default for extracting text
5
+ from image items without requiring a separate daemon.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from ..corpus import Corpus
15
+ from ..errors import ExtractionRunFatalError
16
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
17
+ from .base import TextExtractor
18
+
19
+
20
class RapidOcrExtractorConfig(BaseModel):
    """
    Settings accepted by the RapidOCR extractor.

    Unknown keys are rejected because the model is configured with ``extra="forbid"``.

    :ivar min_confidence: Minimum per-line confidence, between 0.0 and 1.0, to include in output.
    :vartype min_confidence: float
    :ivar joiner: String placed between recognized lines when they are combined.
    :vartype joiner: str
    """

    model_config = ConfigDict(extra="forbid")

    min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    joiner: str = Field(default="\n")
34
+
35
+
36
class RapidOcrExtractor(TextExtractor):
    """
    Extractor plugin that performs optical character recognition on image items using RapidOCR.

    This extractor handles items whose media type starts with ``image/``.
    It returns an empty extracted text artifact when the image is handled but no text is recognized.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "ocr-rapidocr"

    @staticmethod
    def _load_engine_class() -> Any:
        """
        Import and return the ``RapidOCR`` engine class.

        Both ``validate_config`` and ``extract_text`` use this helper, so a missing
        dependency is always reported as an ``ExtractionRunFatalError``.

        :return: The ``rapidocr_onnxruntime.RapidOCR`` class.
        :rtype: Any
        :raises ExtractionRunFatalError: If the optional dependency is missing.
        """
        try:
            from rapidocr_onnxruntime import RapidOCR
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "RapidOCR extractor requires an optional dependency. "
                'Install it with pip install "biblicus[ocr]".'
            ) from import_error
        return RapidOCR

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: RapidOcrExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is missing.
        """
        self._load_engine_class()
        return RapidOcrExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text from an image item using optical character recognition.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: RapidOcrExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not an image.
        :rtype: ExtractedText or None
        :raises ExtractionRunFatalError: If the optional dependency is missing.
        """
        _ = previous_extractions
        if not item.media_type.startswith("image/"):
            return None

        parsed_config = (
            config
            if isinstance(config, RapidOcrExtractorConfig)
            else RapidOcrExtractorConfig.model_validate(config)
        )

        engine_class = self._load_engine_class()

        source_path = corpus.root / item.relpath
        ocr = engine_class()
        result, _elapsed = ocr(str(source_path))

        if result is None:
            return ExtractedText(text="", producer_extractor_id=self.extractor_id)

        lines: List[str] = []
        for entry in result:
            # Each usable entry carries the text at index 1 and its confidence at index 2.
            if not isinstance(entry, (list, tuple)) or len(entry) < 3:
                continue
            text_value, confidence_value = entry[1], entry[2]
            if not isinstance(text_value, str):
                continue
            # Coerce with float() so confidences that are not builtin numbers
            # (for example NumPy scalars or numeric strings) are not discarded.
            try:
                confidence = float(confidence_value)
            except (TypeError, ValueError):
                continue
            if confidence < parsed_config.min_confidence:
                continue
            cleaned = text_value.strip()
            if cleaned:
                lines.append(cleaned)

        text = parsed_config.joiner.join(lines).strip()
        return ExtractedText(text=text, producer_extractor_id=self.extractor_id)
@@ -0,0 +1,105 @@
1
+ """
2
+ Selection extractor that chooses the longest available text from previous pipeline outputs.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict
10
+
11
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
12
+ from .base import TextExtractor
13
+
14
+
15
class SelectLongestTextExtractorConfig(BaseModel):
    """
    Settings accepted by the longest text selection extractor.

    Version zero exposes no options for this extractor, and unknown keys are
    rejected because the model is configured with ``extra="forbid"``.
    """

    model_config = ConfigDict(extra="forbid")
23
+
24
+
25
class SelectLongestTextExtractor(TextExtractor):
    """
    Extractor plugin that picks the longest text among previous pipeline outputs.

    No semantic quality scoring is attempted. This is a deterministic selection
    policy for pipelines where several steps can produce usable text for the
    same item.

    The selection rules are:

    - If any prior extracted text is non-empty after stripping whitespace, choose the one
      with the greatest stripped character count.
    - Ties are broken by earliest pipeline step index.
    - If no prior extracted text is usable but prior extracted texts exist, select the
      earliest extracted text even if it is empty.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-longest-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate the selection extractor configuration mapping.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectLongestTextExtractorConfig
        """
        return SelectLongestTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Pick the longest extracted text from previous pipeline outputs.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectLongestTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload or None when no prior outputs exist.
        :rtype: ExtractedText or None
        """
        _ = (corpus, item, config)

        with_text = [output for output in previous_extractions if output.text is not None]
        if not with_text:
            return None

        non_blank = [output for output in with_text if output.text.strip()]
        if non_blank:
            # Longest stripped text wins; the earliest step index breaks ties.
            chosen = min(
                non_blank,
                key=lambda output: (-len(output.text.strip()), int(output.step_index)),
            )
        else:
            chosen = min(with_text, key=lambda output: int(output.step_index))

        return ExtractedText(
            text=chosen.text or "",
            producer_extractor_id=chosen.producer_extractor_id or chosen.extractor_id,
            source_step_index=chosen.step_index,
        )