biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +33 -49
- biblicus/corpus.py +39 -58
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +276 -77
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +87 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Selection extractor that chooses text from previous pipeline outputs.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
|
+
|
|
11
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
12
|
+
from .base import TextExtractor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SelectTextExtractorConfig(BaseModel):
|
|
16
|
+
"""
|
|
17
|
+
Configuration for the selection extractor.
|
|
18
|
+
|
|
19
|
+
The selection extractor is intentionally minimal and requires no configuration.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
model_config = ConfigDict(extra="forbid")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SelectTextExtractor(TextExtractor):
|
|
26
|
+
"""
|
|
27
|
+
Extractor plugin that selects from previous pipeline outputs.
|
|
28
|
+
|
|
29
|
+
This extractor is used as a final step when you want to make an explicit choice among
|
|
30
|
+
multiple extraction outputs in the same pipeline.
|
|
31
|
+
|
|
32
|
+
It selects the first usable extracted text in pipeline order. Usable means the text is
|
|
33
|
+
non-empty after stripping whitespace. If no usable text exists but prior extracted text
|
|
34
|
+
exists, it selects the first extracted text even if it is empty.
|
|
35
|
+
|
|
36
|
+
:ivar extractor_id: Extractor identifier.
|
|
37
|
+
:vartype extractor_id: str
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
extractor_id = "select-text"
|
|
41
|
+
|
|
42
|
+
def validate_config(self, config: Dict[str, Any]) -> BaseModel:
|
|
43
|
+
"""
|
|
44
|
+
Validate selection extractor configuration.
|
|
45
|
+
|
|
46
|
+
:param config: Configuration mapping.
|
|
47
|
+
:type config: dict[str, Any]
|
|
48
|
+
:return: Parsed configuration.
|
|
49
|
+
:rtype: SelectTextExtractorConfig
|
|
50
|
+
"""
|
|
51
|
+
return SelectTextExtractorConfig.model_validate(config)
|
|
52
|
+
|
|
53
|
+
def extract_text(
|
|
54
|
+
self,
|
|
55
|
+
*,
|
|
56
|
+
corpus,
|
|
57
|
+
item: CatalogItem,
|
|
58
|
+
config: BaseModel,
|
|
59
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
60
|
+
) -> Optional[ExtractedText]:
|
|
61
|
+
"""
|
|
62
|
+
Select extracted text from previous pipeline outputs.
|
|
63
|
+
|
|
64
|
+
:param corpus: Corpus containing the item bytes.
|
|
65
|
+
:type corpus: Corpus
|
|
66
|
+
:param item: Catalog item being processed.
|
|
67
|
+
:type item: CatalogItem
|
|
68
|
+
:param config: Parsed configuration model.
|
|
69
|
+
:type config: SelectTextExtractorConfig
|
|
70
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
71
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
72
|
+
:return: Selected extracted text payload or None when no prior outputs exist.
|
|
73
|
+
:rtype: ExtractedText or None
|
|
74
|
+
"""
|
|
75
|
+
_ = corpus
|
|
76
|
+
_ = item
|
|
77
|
+
_ = config
|
|
78
|
+
|
|
79
|
+
extracted_candidates = [entry for entry in previous_extractions if entry.text is not None]
|
|
80
|
+
usable_candidates = [entry for entry in extracted_candidates if entry.text.strip()]
|
|
81
|
+
|
|
82
|
+
if usable_candidates:
|
|
83
|
+
candidate = usable_candidates[0]
|
|
84
|
+
producer = candidate.producer_extractor_id or candidate.extractor_id
|
|
85
|
+
return ExtractedText(
|
|
86
|
+
text=candidate.text or "",
|
|
87
|
+
producer_extractor_id=producer,
|
|
88
|
+
source_step_index=candidate.step_index,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
if extracted_candidates:
|
|
92
|
+
candidate = extracted_candidates[0]
|
|
93
|
+
producer = candidate.producer_extractor_id or candidate.extractor_id
|
|
94
|
+
return ExtractedText(
|
|
95
|
+
text=candidate.text or "",
|
|
96
|
+
producer_extractor_id=producer,
|
|
97
|
+
source_step_index=candidate.step_index,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
return None
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unstructured-based text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor is implemented as an optional dependency so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict
|
|
12
|
+
|
|
13
|
+
from ..corpus import Corpus
|
|
14
|
+
from ..errors import ExtractionRunFatalError
|
|
15
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
|
+
from .base import TextExtractor
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class UnstructuredExtractorConfig(BaseModel):
|
|
20
|
+
"""
|
|
21
|
+
Configuration for the Unstructured extractor.
|
|
22
|
+
|
|
23
|
+
Version zero does not expose any configuration for this extractor.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
model_config = ConfigDict(extra="forbid")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class UnstructuredExtractor(TextExtractor):
|
|
30
|
+
"""
|
|
31
|
+
Extractor plugin backed by the `unstructured` library.
|
|
32
|
+
|
|
33
|
+
The intent is broad format coverage as a last-resort extractor. This extractor skips items
|
|
34
|
+
that are already text so the pass-through extractor remains the canonical choice for text
|
|
35
|
+
items and Markdown front matter handling.
|
|
36
|
+
|
|
37
|
+
:ivar extractor_id: Extractor identifier.
|
|
38
|
+
:vartype extractor_id: str
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
extractor_id = "unstructured"
|
|
42
|
+
|
|
43
|
+
def validate_config(self, config: Dict[str, Any]) -> BaseModel:
|
|
44
|
+
"""
|
|
45
|
+
Validate extractor configuration and ensure the dependency is installed.
|
|
46
|
+
|
|
47
|
+
:param config: Configuration mapping.
|
|
48
|
+
:type config: dict[str, Any]
|
|
49
|
+
:return: Parsed config.
|
|
50
|
+
:rtype: UnstructuredExtractorConfig
|
|
51
|
+
:raises ExtractionRunFatalError: If the optional dependency is not installed.
|
|
52
|
+
"""
|
|
53
|
+
try:
|
|
54
|
+
from unstructured.partition.auto import partition # noqa: F401
|
|
55
|
+
except ImportError as import_error:
|
|
56
|
+
raise ExtractionRunFatalError(
|
|
57
|
+
"Unstructured extractor requires an optional dependency. "
|
|
58
|
+
'Install it with pip install "biblicus[unstructured]".'
|
|
59
|
+
) from import_error
|
|
60
|
+
return UnstructuredExtractorConfig.model_validate(config)
|
|
61
|
+
|
|
62
|
+
def extract_text(
|
|
63
|
+
self,
|
|
64
|
+
*,
|
|
65
|
+
corpus: Corpus,
|
|
66
|
+
item: CatalogItem,
|
|
67
|
+
config: BaseModel,
|
|
68
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
69
|
+
) -> Optional[ExtractedText]:
|
|
70
|
+
"""
|
|
71
|
+
Extract text for a non-text item using Unstructured.
|
|
72
|
+
|
|
73
|
+
:param corpus: Corpus containing the item bytes.
|
|
74
|
+
:type corpus: Corpus
|
|
75
|
+
:param item: Catalog item being processed.
|
|
76
|
+
:type item: CatalogItem
|
|
77
|
+
:param config: Parsed configuration model.
|
|
78
|
+
:type config: UnstructuredExtractorConfig
|
|
79
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
80
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
81
|
+
:return: Extracted text payload, or None when the item is already text.
|
|
82
|
+
:rtype: ExtractedText or None
|
|
83
|
+
"""
|
|
84
|
+
_ = config
|
|
85
|
+
_ = previous_extractions
|
|
86
|
+
media_type = item.media_type
|
|
87
|
+
if media_type == "text/markdown" or media_type.startswith("text/"):
|
|
88
|
+
return None
|
|
89
|
+
|
|
90
|
+
from unstructured.partition.auto import partition
|
|
91
|
+
|
|
92
|
+
source_path = corpus.root / item.relpath
|
|
93
|
+
elements = partition(filename=str(source_path))
|
|
94
|
+
lines: list[str] = []
|
|
95
|
+
for element in elements or []:
|
|
96
|
+
text = getattr(element, "text", None)
|
|
97
|
+
if isinstance(text, str) and text.strip():
|
|
98
|
+
lines.append(text.strip())
|
|
99
|
+
combined_text = "\n".join(lines).strip()
|
|
100
|
+
return ExtractedText(text=combined_text, producer_extractor_id=self.extractor_id)
|
biblicus/frontmatter.py
CHANGED
|
@@ -35,7 +35,6 @@ def parse_front_matter(text: str) -> FrontMatterDocument:
|
|
|
35
35
|
:rtype: FrontMatterDocument
|
|
36
36
|
:raises ValueError: If front matter is present but not a mapping.
|
|
37
37
|
"""
|
|
38
|
-
|
|
39
38
|
if not text.startswith("---\n"):
|
|
40
39
|
return FrontMatterDocument(metadata={}, body=text)
|
|
41
40
|
|
|
@@ -64,7 +63,6 @@ def render_front_matter(metadata: Dict[str, Any], body: str) -> str:
|
|
|
64
63
|
:return: Markdown with Yet Another Markup Language front matter.
|
|
65
64
|
:rtype: str
|
|
66
65
|
"""
|
|
67
|
-
|
|
68
66
|
if not metadata:
|
|
69
67
|
return body
|
|
70
68
|
|
|
@@ -87,6 +85,5 @@ def split_markdown_front_matter(path_text: str) -> Tuple[Dict[str, Any], str]:
|
|
|
87
85
|
:return: Metadata mapping and body text.
|
|
88
86
|
:rtype: tuple[dict[str, Any], str]
|
|
89
87
|
"""
|
|
90
|
-
|
|
91
88
|
parsed_document = parse_front_matter(path_text)
|
|
92
89
|
return parsed_document.metadata, parsed_document.body
|
biblicus/hook_logging.py
CHANGED
|
@@ -23,7 +23,6 @@ def new_operation_id() -> str:
|
|
|
23
23
|
:return: Operation identifier.
|
|
24
24
|
:rtype: str
|
|
25
25
|
"""
|
|
26
|
-
|
|
27
26
|
return str(uuid.uuid4())
|
|
28
27
|
|
|
29
28
|
|
|
@@ -36,7 +35,6 @@ def redact_source_uri(source_uri: str) -> str:
|
|
|
36
35
|
:return: Redacted source uniform resource identifier.
|
|
37
36
|
:rtype: str
|
|
38
37
|
"""
|
|
39
|
-
|
|
40
38
|
parsed = urlparse(source_uri)
|
|
41
39
|
|
|
42
40
|
if not parsed.scheme:
|
|
@@ -117,7 +115,6 @@ class HookLogger:
|
|
|
117
115
|
:param operation_id: Operation identifier for grouping records.
|
|
118
116
|
:type operation_id: str
|
|
119
117
|
"""
|
|
120
|
-
|
|
121
118
|
self.log_dir = log_dir
|
|
122
119
|
self.operation_id = operation_id
|
|
123
120
|
|
|
@@ -129,7 +126,6 @@ class HookLogger:
|
|
|
129
126
|
:return: Log file path.
|
|
130
127
|
:rtype: Path
|
|
131
128
|
"""
|
|
132
|
-
|
|
133
129
|
return self.log_dir / f"{self.operation_id}.jsonl"
|
|
134
130
|
|
|
135
131
|
def record(
|
|
@@ -166,7 +162,6 @@ class HookLogger:
|
|
|
166
162
|
:return: None.
|
|
167
163
|
:rtype: None
|
|
168
164
|
"""
|
|
169
|
-
|
|
170
165
|
self.log_dir.mkdir(parents=True, exist_ok=True)
|
|
171
166
|
entry = HookLogEntry(
|
|
172
167
|
operation_id=self.operation_id,
|
biblicus/hook_manager.py
CHANGED
|
@@ -55,7 +55,6 @@ class HookManager:
|
|
|
55
55
|
:param operation_id: Optional operation identifier override.
|
|
56
56
|
:type operation_id: str or None
|
|
57
57
|
"""
|
|
58
|
-
|
|
59
58
|
self.corpus_uri = corpus_uri
|
|
60
59
|
self.log_dir = log_dir
|
|
61
60
|
self.operation_id = operation_id or new_operation_id()
|
|
@@ -63,7 +62,9 @@ class HookManager:
|
|
|
63
62
|
self._logger = HookLogger(log_dir=self.log_dir, operation_id=self.operation_id)
|
|
64
63
|
|
|
65
64
|
@classmethod
|
|
66
|
-
def from_config(
|
|
65
|
+
def from_config(
|
|
66
|
+
cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]
|
|
67
|
+
) -> "HookManager":
|
|
67
68
|
"""
|
|
68
69
|
Build a hook manager from config data.
|
|
69
70
|
|
|
@@ -77,7 +78,6 @@ class HookManager:
|
|
|
77
78
|
:rtype: HookManager
|
|
78
79
|
:raises KeyError: If a hook identifier is unknown.
|
|
79
80
|
"""
|
|
80
|
-
|
|
81
81
|
log_dir = corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME
|
|
82
82
|
hooks: List[LifecycleHook] = []
|
|
83
83
|
|
|
@@ -124,7 +124,6 @@ class HookManager:
|
|
|
124
124
|
:rtype: IngestMutation
|
|
125
125
|
:raises ValueError: If ingestion is denied by a hook.
|
|
126
126
|
"""
|
|
127
|
-
|
|
128
127
|
context = IngestHookContext(
|
|
129
128
|
hook_point=hook_point,
|
|
130
129
|
operation_id=self.operation_id,
|
|
@@ -195,7 +194,6 @@ class HookManager:
|
|
|
195
194
|
:rtype: dict[str, Any]
|
|
196
195
|
:raises ValueError: If a hook raises an exception.
|
|
197
196
|
"""
|
|
198
|
-
|
|
199
197
|
try:
|
|
200
198
|
result = hook.run(context)
|
|
201
199
|
except Exception as exc:
|
biblicus/hooks.py
CHANGED
|
@@ -164,7 +164,6 @@ class LifecycleHook:
|
|
|
164
164
|
:rtype: HookResult
|
|
165
165
|
:raises NotImplementedError: If the hook does not implement run.
|
|
166
166
|
"""
|
|
167
|
-
|
|
168
167
|
_ = context
|
|
169
168
|
raise NotImplementedError("LifecycleHook.run must be implemented by concrete hooks")
|
|
170
169
|
|
|
@@ -192,7 +191,6 @@ class AddTagsHook:
|
|
|
192
191
|
:param tags: Tags to add.
|
|
193
192
|
:type tags: Sequence[str]
|
|
194
193
|
"""
|
|
195
|
-
|
|
196
194
|
self.hook_points = list(hook_points)
|
|
197
195
|
self.tags = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
|
|
198
196
|
|
|
@@ -205,7 +203,6 @@ class AddTagsHook:
|
|
|
205
203
|
:return: Ingest mutation result.
|
|
206
204
|
:rtype: HookResult
|
|
207
205
|
"""
|
|
208
|
-
|
|
209
206
|
_ = context
|
|
210
207
|
return IngestMutation(add_tags=list(self.tags))
|
|
211
208
|
|
|
@@ -229,7 +226,6 @@ class DenyAllHook:
|
|
|
229
226
|
:param hook_points: Hook points where the hook runs.
|
|
230
227
|
:type hook_points: Sequence[HookPoint]
|
|
231
228
|
"""
|
|
232
|
-
|
|
233
229
|
self.hook_points = list(hook_points)
|
|
234
230
|
|
|
235
231
|
def run(self, context: HookContext) -> HookResult:
|
|
@@ -241,7 +237,6 @@ class DenyAllHook:
|
|
|
241
237
|
:return: Ingest denial result.
|
|
242
238
|
:rtype: HookResult
|
|
243
239
|
"""
|
|
244
|
-
|
|
245
240
|
_ = context
|
|
246
241
|
return IngestMutation(deny=True, deny_reason="Ingest denied by deny-all hook")
|
|
247
242
|
|
|
@@ -256,10 +251,11 @@ def build_builtin_hook(spec: HookSpec) -> LifecycleHook:
|
|
|
256
251
|
:rtype: LifecycleHook
|
|
257
252
|
:raises KeyError: If the hook identifier is unknown.
|
|
258
253
|
"""
|
|
259
|
-
|
|
260
254
|
if spec.hook_id == AddTagsHook.hook_id:
|
|
261
255
|
tags = spec.config.get("tags") or []
|
|
262
|
-
return AddTagsHook(
|
|
256
|
+
return AddTagsHook(
|
|
257
|
+
hook_points=spec.hook_points, tags=tags if isinstance(tags, list) else []
|
|
258
|
+
)
|
|
263
259
|
if spec.hook_id == DenyAllHook.hook_id:
|
|
264
260
|
return DenyAllHook(hook_points=spec.hook_points)
|
|
265
261
|
raise KeyError(f"Unknown hook_id {spec.hook_id!r}")
|
biblicus/ignore.py
CHANGED
|
@@ -34,7 +34,6 @@ class CorpusIgnoreSpec(BaseModel):
|
|
|
34
34
|
:return: True if the path should be ignored.
|
|
35
35
|
:rtype: bool
|
|
36
36
|
"""
|
|
37
|
-
|
|
38
37
|
normalized = relpath.replace("\\", "/").lstrip("/")
|
|
39
38
|
return any(fnmatch.fnmatch(normalized, pattern) for pattern in self.patterns)
|
|
40
39
|
|
|
@@ -50,7 +49,6 @@ def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
|
|
|
50
49
|
:return: Parsed ignore specification.
|
|
51
50
|
:rtype: CorpusIgnoreSpec
|
|
52
51
|
"""
|
|
53
|
-
|
|
54
52
|
ignore_path = corpus_root / ".biblicusignore"
|
|
55
53
|
if not ignore_path.is_file():
|
|
56
54
|
return CorpusIgnoreSpec(patterns=[])
|
|
@@ -64,4 +62,3 @@ def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
|
|
|
64
62
|
continue
|
|
65
63
|
patterns.append(line)
|
|
66
64
|
return CorpusIgnoreSpec(patterns=patterns)
|
|
67
|
-
|
biblicus/models.py
CHANGED
|
@@ -142,6 +142,53 @@ class CorpusCatalog(BaseModel):
|
|
|
142
142
|
return self
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
class ExtractionRunReference(BaseModel):
    """
    Reference to an extraction run.

    :ivar extractor_id: Extractor plugin identifier.
    :vartype extractor_id: str
    :ivar run_id: Extraction run identifier.
    :vartype run_id: str
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)

    def as_string(self) -> str:
        """
        Serialize the reference as a single string.

        :return: Reference in the form extractor_id:run_id.
        :rtype: str
        """
        return ":".join((self.extractor_id, self.run_id))
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
    """
    Parse an extraction run reference in the form extractor_id:run_id.

    :param value: Raw reference string.
    :type value: str
    :return: Parsed extraction run reference.
    :rtype: ExtractionRunReference
    :raises ValueError: If the reference is not well formed.
    """
    if ":" not in value:
        raise ValueError("Extraction run reference must be extractor_id:run_id")
    # Split on the first colon only; run identifiers may themselves contain colons.
    head, _, tail = value.partition(":")
    head = head.strip()
    tail = tail.strip()
    if not (head and tail):
        raise ValueError(
            "Extraction run reference must be extractor_id:run_id with non-empty parts"
        )
    return ExtractionRunReference(extractor_id=head, run_id=tail)
|
|
190
|
+
|
|
191
|
+
|
|
145
192
|
class QueryBudget(BaseModel):
|
|
146
193
|
"""
|
|
147
194
|
Evidence selection budget for retrieval.
|
|
@@ -319,9 +366,49 @@ class ExtractedText(BaseModel):
|
|
|
319
366
|
:vartype text: str
|
|
320
367
|
:ivar producer_extractor_id: Extractor identifier that produced this text.
|
|
321
368
|
:vartype producer_extractor_id: str
|
|
369
|
+
:ivar source_step_index: Optional pipeline step index where this text originated.
|
|
370
|
+
:vartype source_step_index: int or None
|
|
322
371
|
"""
|
|
323
372
|
|
|
324
373
|
model_config = ConfigDict(extra="forbid")
|
|
325
374
|
|
|
326
375
|
text: str
|
|
327
376
|
producer_extractor_id: str = Field(min_length=1)
|
|
377
|
+
source_step_index: Optional[int] = Field(default=None, ge=1)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
class ExtractionStepOutput(BaseModel):
    """
    In-memory representation of a pipeline step output for a single item.

    :ivar step_index: One-based pipeline step index.
    :vartype step_index: int
    :ivar extractor_id: Extractor identifier for the step.
    :vartype extractor_id: str
    :ivar status: Step status, extracted, skipped, or errored.
    :vartype status: str
    :ivar text: Extracted text content, when produced.
    :vartype text: str or None
    :ivar text_characters: Character count of the extracted text.
    :vartype text_characters: int
    :ivar producer_extractor_id: Extractor identifier that produced the text content.
    :vartype producer_extractor_id: str or None
    :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
    :vartype source_step_index: int or None
    :ivar error_type: Optional error type name for errored steps.
    :vartype error_type: str or None
    :ivar error_message: Optional error message for errored steps.
    :vartype error_message: str or None
    """

    # Unknown keys are rejected outright (extra="forbid") so step records stay strict.
    model_config = ConfigDict(extra="forbid")

    step_index: int = Field(ge=1)  # one-based, matching source_step_index's ge=1 bound
    extractor_id: str
    status: str
    text: Optional[str] = None
    text_characters: int = Field(default=0, ge=0)
    producer_extractor_id: Optional[str] = None
    source_step_index: Optional[int] = Field(default=None, ge=1)
    error_type: Optional[str] = None
    error_message: Optional[str] = None
|
biblicus/retrieval.py
CHANGED
|
@@ -34,7 +34,6 @@ def create_recipe_manifest(
|
|
|
34
34
|
:return: Deterministic recipe manifest.
|
|
35
35
|
:rtype: RecipeManifest
|
|
36
36
|
"""
|
|
37
|
-
|
|
38
37
|
config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
|
|
39
38
|
recipe_seed = f"{backend_id}:{config_json}"
|
|
40
39
|
recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
|
|
@@ -69,7 +68,6 @@ def create_run_manifest(
|
|
|
69
68
|
:return: Run manifest.
|
|
70
69
|
:rtype: RetrievalRun
|
|
71
70
|
"""
|
|
72
|
-
|
|
73
71
|
catalog = corpus.load_catalog()
|
|
74
72
|
created_at = utc_now_iso()
|
|
75
73
|
run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
|
|
@@ -93,7 +91,6 @@ def hash_text(text: str) -> str:
|
|
|
93
91
|
:return: Secure Hash Algorithm 256 hex digest.
|
|
94
92
|
:rtype: str
|
|
95
93
|
"""
|
|
96
|
-
|
|
97
94
|
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
98
95
|
|
|
99
96
|
|
|
@@ -108,7 +105,6 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
|
|
|
108
105
|
:return: Evidence list respecting the budget.
|
|
109
106
|
:rtype: list[Evidence]
|
|
110
107
|
"""
|
|
111
|
-
|
|
112
108
|
selected_evidence: List[Evidence] = []
|
|
113
109
|
source_counts: Dict[str, int] = {}
|
|
114
110
|
total_characters = 0
|
biblicus/sources.py
CHANGED
|
@@ -21,7 +21,6 @@ def _looks_like_uri(value: str) -> bool:
|
|
|
21
21
|
:return: True if the string has a valid uniform resource identifier scheme prefix.
|
|
22
22
|
:rtype: bool
|
|
23
23
|
"""
|
|
24
|
-
|
|
25
24
|
return "://" in value and value.split("://", 1)[0].isidentifier()
|
|
26
25
|
|
|
27
26
|
|
|
@@ -34,7 +33,6 @@ def _filename_from_url_path(path: str) -> str:
|
|
|
34
33
|
:return: Filename or a fallback name.
|
|
35
34
|
:rtype: str
|
|
36
35
|
"""
|
|
37
|
-
|
|
38
36
|
filename = Path(unquote(path)).name
|
|
39
37
|
return filename or "download"
|
|
40
38
|
|
|
@@ -48,7 +46,6 @@ def _media_type_from_filename(name: str) -> str:
|
|
|
48
46
|
:return: Guessed media type or application/octet-stream.
|
|
49
47
|
:rtype: str
|
|
50
48
|
"""
|
|
51
|
-
|
|
52
49
|
media_type, _ = mimetypes.guess_type(name)
|
|
53
50
|
return media_type or "application/octet-stream"
|
|
54
51
|
|
|
@@ -62,7 +59,6 @@ def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
|
|
|
62
59
|
:return: Detected media type or None.
|
|
63
60
|
:rtype: str or None
|
|
64
61
|
"""
|
|
65
|
-
|
|
66
62
|
prefix = data[:32]
|
|
67
63
|
if prefix.startswith(b"%PDF-"):
|
|
68
64
|
return "application/pdf"
|
|
@@ -70,11 +66,46 @@ def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
|
|
|
70
66
|
return "image/png"
|
|
71
67
|
if prefix[:3] == b"\xff\xd8\xff":
|
|
72
68
|
return "image/jpeg"
|
|
73
|
-
if prefix.
|
|
69
|
+
if prefix.startswith(b"RIFF") and prefix[8:12] == b"WAVE":
|
|
70
|
+
return "audio/x-wav"
|
|
71
|
+
if prefix.startswith(b"ID3") or (
|
|
72
|
+
len(prefix) >= 2 and prefix[0] == 0xFF and (prefix[1] & 0xE0) == 0xE0
|
|
73
|
+
):
|
|
74
|
+
return "audio/mpeg"
|
|
75
|
+
if prefix.startswith(b"OggS"):
|
|
76
|
+
return "audio/ogg"
|
|
77
|
+
if prefix.lstrip().lower().startswith(b"<!doctype html") or prefix.lstrip().lower().startswith(
|
|
78
|
+
b"<html"
|
|
79
|
+
):
|
|
74
80
|
return "text/html"
|
|
75
81
|
return None
|
|
76
82
|
|
|
77
83
|
|
|
84
|
+
def _normalize_media_type(*, filename: str, media_type: str) -> str:
|
|
85
|
+
"""
|
|
86
|
+
Normalize media types that are commonly mislabelled by upstream sources.
|
|
87
|
+
|
|
88
|
+
This function exists to keep the corpus usable for humans. When a source provides a filename
|
|
89
|
+
extension that users recognize (for example, ``.ogg``), Biblicus prefers a matching media type
|
|
90
|
+
so that downstream processing can make reasonable decisions.
|
|
91
|
+
|
|
92
|
+
:param filename: Filename associated with the payload.
|
|
93
|
+
:type filename: str
|
|
94
|
+
:param media_type: Media type reported or guessed for the payload.
|
|
95
|
+
:type media_type: str
|
|
96
|
+
:return: Normalized media type.
|
|
97
|
+
:rtype: str
|
|
98
|
+
"""
|
|
99
|
+
suffix = Path(filename).suffix.lower()
|
|
100
|
+
if media_type in {"application/ogg", "application/x-ogg"} and suffix in {
|
|
101
|
+
".ogg",
|
|
102
|
+
".oga",
|
|
103
|
+
".ogx",
|
|
104
|
+
}:
|
|
105
|
+
return "audio/ogg"
|
|
106
|
+
return media_type
|
|
107
|
+
|
|
108
|
+
|
|
78
109
|
def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
79
110
|
"""
|
|
80
111
|
Ensure the filename has a usable extension for the media type.
|
|
@@ -86,10 +117,12 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
|
86
117
|
:return: Filename with extension.
|
|
87
118
|
:rtype: str
|
|
88
119
|
"""
|
|
89
|
-
|
|
90
120
|
if Path(filename).suffix:
|
|
91
121
|
return filename
|
|
92
|
-
|
|
122
|
+
if media_type == "audio/ogg":
|
|
123
|
+
ext = ".ogg"
|
|
124
|
+
else:
|
|
125
|
+
ext = mimetypes.guess_extension(media_type) or ""
|
|
93
126
|
return filename + ext if ext else filename
|
|
94
127
|
|
|
95
128
|
|
|
@@ -127,7 +160,6 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
127
160
|
:raises ValueError: If a file:// uniform resource identifier has a non-local host.
|
|
128
161
|
:raises NotImplementedError: If the uniform resource identifier scheme is unsupported.
|
|
129
162
|
"""
|
|
130
|
-
|
|
131
163
|
if isinstance(source, Path):
|
|
132
164
|
path = source.resolve()
|
|
133
165
|
media_type = _media_type_from_filename(path.name)
|
|
@@ -144,7 +176,9 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
144
176
|
parsed = urlparse(source)
|
|
145
177
|
if parsed.scheme == "file":
|
|
146
178
|
if parsed.netloc not in ("", "localhost"):
|
|
147
|
-
raise ValueError(
|
|
179
|
+
raise ValueError(
|
|
180
|
+
f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
|
|
181
|
+
)
|
|
148
182
|
path = Path(unquote(parsed.path)).resolve()
|
|
149
183
|
return load_source(path, source_uri=source_uri or source)
|
|
150
184
|
|
|
@@ -160,6 +194,7 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
160
194
|
if sniffed:
|
|
161
195
|
media_type = sniffed
|
|
162
196
|
filename = _ensure_extension_for_media_type(filename, media_type)
|
|
197
|
+
media_type = _normalize_media_type(filename=filename, media_type=media_type)
|
|
163
198
|
if Path(filename).suffix.lower() in {".md", ".markdown"}:
|
|
164
199
|
media_type = "text/markdown"
|
|
165
200
|
return SourcePayload(
|
biblicus/time.py
CHANGED
biblicus/uris.py
CHANGED
|
@@ -18,7 +18,6 @@ def _looks_like_uri(value: str) -> bool:
|
|
|
18
18
|
:return: True if the string has a valid uniform resource identifier scheme prefix.
|
|
19
19
|
:rtype: bool
|
|
20
20
|
"""
|
|
21
|
-
|
|
22
21
|
return "://" in value and value.split("://", 1)[0].isidentifier()
|
|
23
22
|
|
|
24
23
|
|
|
@@ -33,7 +32,6 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
|
|
|
33
32
|
:raises NotImplementedError: If a non-file uniform resource identifier scheme is used.
|
|
34
33
|
:raises ValueError: If a file:// uniform resource identifier has a non-local host.
|
|
35
34
|
"""
|
|
36
|
-
|
|
37
35
|
if isinstance(ref, Path):
|
|
38
36
|
return ref.resolve()
|
|
39
37
|
|
|
@@ -45,7 +43,9 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
|
|
|
45
43
|
f"(got {parsed.scheme}://)"
|
|
46
44
|
)
|
|
47
45
|
if parsed.netloc not in ("", "localhost"):
|
|
48
|
-
raise ValueError(
|
|
46
|
+
raise ValueError(
|
|
47
|
+
f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
|
|
48
|
+
)
|
|
49
49
|
return Path(unquote(parsed.path)).resolve()
|
|
50
50
|
|
|
51
51
|
return Path(ref).resolve()
|
|
@@ -60,5 +60,4 @@ def normalize_corpus_uri(ref: Union[str, Path]) -> str:
|
|
|
60
60
|
:return: Canonical file:// uniform resource identifier.
|
|
61
61
|
:rtype: str
|
|
62
62
|
"""
|
|
63
|
-
|
|
64
63
|
return corpus_ref_to_path(ref).as_uri()
|