biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. biblicus/__init__.py +2 -2
  2. biblicus/_vendor/dotyaml/__init__.py +14 -0
  3. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  4. biblicus/_vendor/dotyaml/loader.py +181 -0
  5. biblicus/_vendor/dotyaml/transformer.py +135 -0
  6. biblicus/backends/__init__.py +0 -2
  7. biblicus/backends/base.py +3 -3
  8. biblicus/backends/scan.py +21 -15
  9. biblicus/backends/sqlite_full_text_search.py +14 -15
  10. biblicus/cli.py +33 -49
  11. biblicus/corpus.py +39 -58
  12. biblicus/errors.py +15 -0
  13. biblicus/evaluation.py +4 -8
  14. biblicus/extraction.py +276 -77
  15. biblicus/extractors/__init__.py +14 -3
  16. biblicus/extractors/base.py +12 -5
  17. biblicus/extractors/metadata_text.py +13 -5
  18. biblicus/extractors/openai_stt.py +180 -0
  19. biblicus/extractors/pass_through_text.py +16 -6
  20. biblicus/extractors/pdf_text.py +100 -0
  21. biblicus/extractors/pipeline.py +105 -0
  22. biblicus/extractors/rapidocr_text.py +129 -0
  23. biblicus/extractors/select_longest_text.py +105 -0
  24. biblicus/extractors/select_text.py +100 -0
  25. biblicus/extractors/unstructured_text.py +100 -0
  26. biblicus/frontmatter.py +0 -3
  27. biblicus/hook_logging.py +0 -5
  28. biblicus/hook_manager.py +3 -5
  29. biblicus/hooks.py +3 -7
  30. biblicus/ignore.py +0 -3
  31. biblicus/models.py +87 -0
  32. biblicus/retrieval.py +0 -4
  33. biblicus/sources.py +44 -9
  34. biblicus/time.py +0 -1
  35. biblicus/uris.py +3 -4
  36. biblicus/user_config.py +138 -0
  37. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
  38. biblicus-0.3.0.dist-info/RECORD +44 -0
  39. biblicus/extractors/cascade.py +0 -101
  40. biblicus-0.2.0.dist-info/RECORD +0 -32
  41. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
  42. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
  43. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
  44. {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,100 @@
1
+ """
2
+ Selection extractor that chooses text from previous pipeline outputs.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict
10
+
11
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
12
+ from .base import TextExtractor
13
+
14
+
15
class SelectTextExtractorConfig(BaseModel):
    """
    Configuration model for the selection extractor.

    The model declares no fields, and extra keys are forbidden, so only an empty
    configuration mapping validates.
    """

    model_config = ConfigDict(extra="forbid")
23
+
24
+
25
class SelectTextExtractor(TextExtractor):
    """
    Extractor plugin that selects from previous pipeline outputs.

    Use this extractor as a final pipeline step when you want to make an explicit choice
    among multiple extraction outputs in the same pipeline.

    It selects the first usable extracted text, in the order of ``previous_extractions``.
    Usable means the text is non-empty after stripping whitespace. If no usable text exists
    but some prior step did produce text, the first produced text is selected even if it is
    empty.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate selection extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectTextExtractorConfig
        """
        return SelectTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Select extracted text from previous pipeline outputs.

        :param corpus: Corpus containing the item bytes (unused by this extractor).
        :type corpus: Corpus
        :param item: Catalog item being processed (unused by this extractor).
        :type item: CatalogItem
        :param config: Parsed configuration model (unused by this extractor).
        :type config: SelectTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload, or None when no prior step produced text.
        :rtype: ExtractedText or None
        """
        _ = corpus
        _ = item
        _ = config

        # Only steps that actually produced text (possibly blank) can be selected.
        extracted_candidates = [
            entry for entry in previous_extractions if entry.text is not None
        ]
        if not extracted_candidates:
            return None

        # Prefer the first non-blank text; otherwise fall back to the first produced text.
        candidate = next(
            (entry for entry in extracted_candidates if entry.text.strip()),
            extracted_candidates[0],
        )
        # A selection step may itself have forwarded text, so keep the original producer
        # when one is recorded and fall back to the step's own extractor otherwise.
        producer = candidate.producer_extractor_id or candidate.extractor_id
        return ExtractedText(
            text=candidate.text or "",
            producer_extractor_id=producer,
            source_step_index=candidate.step_index,
        )
@@ -0,0 +1,100 @@
1
+ """
2
+ Unstructured-based text extraction plugin.
3
+
4
+ This extractor is implemented as an optional dependency so the core installation stays small.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from pydantic import BaseModel, ConfigDict
12
+
13
+ from ..corpus import Corpus
14
+ from ..errors import ExtractionRunFatalError
15
+ from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
16
+ from .base import TextExtractor
17
+
18
+
19
class UnstructuredExtractorConfig(BaseModel):
    """
    Configuration model for the Unstructured extractor.

    Version zero exposes no options for this extractor. Extra keys are forbidden, so only
    an empty configuration mapping validates.
    """

    model_config = ConfigDict(extra="forbid")
27
+
28
+
29
class UnstructuredExtractor(TextExtractor):
    """
    Extractor plugin backed by the `unstructured` library.

    The intent is broad format coverage as a last-resort extractor. This extractor skips items
    that are already text so the pass-through extractor remains the canonical choice for text
    items and Markdown front matter handling.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "unstructured"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is installed.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: UnstructuredExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        # Import eagerly here so a missing optional dependency fails the run at
        # validation time instead of on the first item.
        try:
            from unstructured.partition.auto import partition  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "Unstructured extractor requires an optional dependency. "
                'Install it with pip install "biblicus[unstructured]".'
            ) from import_error
        return UnstructuredExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a non-text item using Unstructured.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model (unused by this extractor).
        :type config: UnstructuredExtractorConfig
        :param previous_extractions: Prior step outputs for this item (unused by this extractor).
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is already text.
        :rtype: ExtractedText or None
        """
        _ = config
        _ = previous_extractions

        # Every text/* media type, text/markdown included, is left to other extractors.
        if item.media_type.startswith("text/"):
            return None

        # Imported lazily so the core installation does not require the optional dependency.
        from unstructured.partition.auto import partition

        source_path = corpus.root / item.relpath
        elements = partition(filename=str(source_path))

        # Keep only elements that carry non-blank string text, one stripped line per element.
        lines = [
            text.strip()
            for text in (getattr(element, "text", None) for element in elements or [])
            if isinstance(text, str) and text.strip()
        ]
        combined_text = "\n".join(lines).strip()
        return ExtractedText(text=combined_text, producer_extractor_id=self.extractor_id)
biblicus/frontmatter.py CHANGED
@@ -35,7 +35,6 @@ def parse_front_matter(text: str) -> FrontMatterDocument:
35
35
  :rtype: FrontMatterDocument
36
36
  :raises ValueError: If front matter is present but not a mapping.
37
37
  """
38
-
39
38
  if not text.startswith("---\n"):
40
39
  return FrontMatterDocument(metadata={}, body=text)
41
40
 
@@ -64,7 +63,6 @@ def render_front_matter(metadata: Dict[str, Any], body: str) -> str:
64
63
  :return: Markdown with Yet Another Markup Language front matter.
65
64
  :rtype: str
66
65
  """
67
-
68
66
  if not metadata:
69
67
  return body
70
68
 
@@ -87,6 +85,5 @@ def split_markdown_front_matter(path_text: str) -> Tuple[Dict[str, Any], str]:
87
85
  :return: Metadata mapping and body text.
88
86
  :rtype: tuple[dict[str, Any], str]
89
87
  """
90
-
91
88
  parsed_document = parse_front_matter(path_text)
92
89
  return parsed_document.metadata, parsed_document.body
biblicus/hook_logging.py CHANGED
@@ -23,7 +23,6 @@ def new_operation_id() -> str:
23
23
  :return: Operation identifier.
24
24
  :rtype: str
25
25
  """
26
-
27
26
  return str(uuid.uuid4())
28
27
 
29
28
 
@@ -36,7 +35,6 @@ def redact_source_uri(source_uri: str) -> str:
36
35
  :return: Redacted source uniform resource identifier.
37
36
  :rtype: str
38
37
  """
39
-
40
38
  parsed = urlparse(source_uri)
41
39
 
42
40
  if not parsed.scheme:
@@ -117,7 +115,6 @@ class HookLogger:
117
115
  :param operation_id: Operation identifier for grouping records.
118
116
  :type operation_id: str
119
117
  """
120
-
121
118
  self.log_dir = log_dir
122
119
  self.operation_id = operation_id
123
120
 
@@ -129,7 +126,6 @@ class HookLogger:
129
126
  :return: Log file path.
130
127
  :rtype: Path
131
128
  """
132
-
133
129
  return self.log_dir / f"{self.operation_id}.jsonl"
134
130
 
135
131
  def record(
@@ -166,7 +162,6 @@ class HookLogger:
166
162
  :return: None.
167
163
  :rtype: None
168
164
  """
169
-
170
165
  self.log_dir.mkdir(parents=True, exist_ok=True)
171
166
  entry = HookLogEntry(
172
167
  operation_id=self.operation_id,
biblicus/hook_manager.py CHANGED
@@ -55,7 +55,6 @@ class HookManager:
55
55
  :param operation_id: Optional operation identifier override.
56
56
  :type operation_id: str or None
57
57
  """
58
-
59
58
  self.corpus_uri = corpus_uri
60
59
  self.log_dir = log_dir
61
60
  self.operation_id = operation_id or new_operation_id()
@@ -63,7 +62,9 @@ class HookManager:
63
62
  self._logger = HookLogger(log_dir=self.log_dir, operation_id=self.operation_id)
64
63
 
65
64
  @classmethod
66
- def from_config(cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]) -> "HookManager":
65
+ def from_config(
66
+ cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]
67
+ ) -> "HookManager":
67
68
  """
68
69
  Build a hook manager from config data.
69
70
 
@@ -77,7 +78,6 @@ class HookManager:
77
78
  :rtype: HookManager
78
79
  :raises KeyError: If a hook identifier is unknown.
79
80
  """
80
-
81
81
  log_dir = corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME
82
82
  hooks: List[LifecycleHook] = []
83
83
 
@@ -124,7 +124,6 @@ class HookManager:
124
124
  :rtype: IngestMutation
125
125
  :raises ValueError: If ingestion is denied by a hook.
126
126
  """
127
-
128
127
  context = IngestHookContext(
129
128
  hook_point=hook_point,
130
129
  operation_id=self.operation_id,
@@ -195,7 +194,6 @@ class HookManager:
195
194
  :rtype: dict[str, Any]
196
195
  :raises ValueError: If a hook raises an exception.
197
196
  """
198
-
199
197
  try:
200
198
  result = hook.run(context)
201
199
  except Exception as exc:
biblicus/hooks.py CHANGED
@@ -164,7 +164,6 @@ class LifecycleHook:
164
164
  :rtype: HookResult
165
165
  :raises NotImplementedError: If the hook does not implement run.
166
166
  """
167
-
168
167
  _ = context
169
168
  raise NotImplementedError("LifecycleHook.run must be implemented by concrete hooks")
170
169
 
@@ -192,7 +191,6 @@ class AddTagsHook:
192
191
  :param tags: Tags to add.
193
192
  :type tags: Sequence[str]
194
193
  """
195
-
196
194
  self.hook_points = list(hook_points)
197
195
  self.tags = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
198
196
 
@@ -205,7 +203,6 @@ class AddTagsHook:
205
203
  :return: Ingest mutation result.
206
204
  :rtype: HookResult
207
205
  """
208
-
209
206
  _ = context
210
207
  return IngestMutation(add_tags=list(self.tags))
211
208
 
@@ -229,7 +226,6 @@ class DenyAllHook:
229
226
  :param hook_points: Hook points where the hook runs.
230
227
  :type hook_points: Sequence[HookPoint]
231
228
  """
232
-
233
229
  self.hook_points = list(hook_points)
234
230
 
235
231
  def run(self, context: HookContext) -> HookResult:
@@ -241,7 +237,6 @@ class DenyAllHook:
241
237
  :return: Ingest denial result.
242
238
  :rtype: HookResult
243
239
  """
244
-
245
240
  _ = context
246
241
  return IngestMutation(deny=True, deny_reason="Ingest denied by deny-all hook")
247
242
 
@@ -256,10 +251,11 @@ def build_builtin_hook(spec: HookSpec) -> LifecycleHook:
256
251
  :rtype: LifecycleHook
257
252
  :raises KeyError: If the hook identifier is unknown.
258
253
  """
259
-
260
254
  if spec.hook_id == AddTagsHook.hook_id:
261
255
  tags = spec.config.get("tags") or []
262
- return AddTagsHook(hook_points=spec.hook_points, tags=tags if isinstance(tags, list) else [])
256
+ return AddTagsHook(
257
+ hook_points=spec.hook_points, tags=tags if isinstance(tags, list) else []
258
+ )
263
259
  if spec.hook_id == DenyAllHook.hook_id:
264
260
  return DenyAllHook(hook_points=spec.hook_points)
265
261
  raise KeyError(f"Unknown hook_id {spec.hook_id!r}")
biblicus/ignore.py CHANGED
@@ -34,7 +34,6 @@ class CorpusIgnoreSpec(BaseModel):
34
34
  :return: True if the path should be ignored.
35
35
  :rtype: bool
36
36
  """
37
-
38
37
  normalized = relpath.replace("\\", "/").lstrip("/")
39
38
  return any(fnmatch.fnmatch(normalized, pattern) for pattern in self.patterns)
40
39
 
@@ -50,7 +49,6 @@ def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
50
49
  :return: Parsed ignore specification.
51
50
  :rtype: CorpusIgnoreSpec
52
51
  """
53
-
54
52
  ignore_path = corpus_root / ".biblicusignore"
55
53
  if not ignore_path.is_file():
56
54
  return CorpusIgnoreSpec(patterns=[])
@@ -64,4 +62,3 @@ def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
64
62
  continue
65
63
  patterns.append(line)
66
64
  return CorpusIgnoreSpec(patterns=patterns)
67
-
biblicus/models.py CHANGED
@@ -142,6 +142,53 @@ class CorpusCatalog(BaseModel):
142
142
  return self
143
143
 
144
144
 
145
class ExtractionRunReference(BaseModel):
    """
    Pointer to a specific extraction run.

    :ivar extractor_id: Extractor plugin identifier (non-empty).
    :vartype extractor_id: str
    :ivar run_id: Extraction run identifier (non-empty).
    :vartype run_id: str
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)

    def as_string(self) -> str:
        """
        Render the reference as one colon-separated string.

        :return: Reference in the form extractor_id:run_id.
        :rtype: str
        """
        return ":".join((self.extractor_id, self.run_id))
168
+
169
+
170
def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
    """
    Build an extraction run reference from an ``extractor_id:run_id`` string.

    The string is split at the first colon only, so any later colons stay in the run
    identifier. Both parts have surrounding whitespace removed.

    :param value: Raw reference string.
    :type value: str
    :return: Parsed extraction run reference.
    :rtype: ExtractionRunReference
    :raises ValueError: If the separator is missing or either part is blank.
    """
    extractor_part, separator, run_part = value.partition(":")
    if not separator:
        raise ValueError("Extraction run reference must be extractor_id:run_id")

    extractor_id = extractor_part.strip()
    run_id = run_part.strip()
    if not (extractor_id and run_id):
        raise ValueError(
            "Extraction run reference must be extractor_id:run_id with non-empty parts"
        )
    return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
190
+
191
+
145
192
  class QueryBudget(BaseModel):
146
193
  """
147
194
  Evidence selection budget for retrieval.
@@ -319,9 +366,49 @@ class ExtractedText(BaseModel):
319
366
  :vartype text: str
320
367
  :ivar producer_extractor_id: Extractor identifier that produced this text.
321
368
  :vartype producer_extractor_id: str
369
+ :ivar source_step_index: Optional pipeline step index where this text originated.
370
+ :vartype source_step_index: int or None
322
371
  """
323
372
 
324
373
  model_config = ConfigDict(extra="forbid")
325
374
 
326
375
  text: str
327
376
  producer_extractor_id: str = Field(min_length=1)
377
+ source_step_index: Optional[int] = Field(default=None, ge=1)
378
+
379
+
380
class ExtractionStepOutput(BaseModel):
    """
    Output of one pipeline step for one item, held in memory.

    :ivar step_index: One-based pipeline step index.
    :vartype step_index: int
    :ivar extractor_id: Identifier of the extractor that ran in this step.
    :vartype extractor_id: str
    :ivar status: Step status, extracted, skipped, or errored.
    :vartype status: str
    :ivar text: Extracted text content, when the step produced any.
    :vartype text: str or None
    :ivar text_characters: Character count of the extracted text (defaults to zero).
    :vartype text_characters: int
    :ivar producer_extractor_id: Identifier of the extractor that produced the text content.
    :vartype producer_extractor_id: str or None
    :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
    :vartype source_step_index: int or None
    :ivar error_type: Optional error type name for errored steps.
    :vartype error_type: str or None
    :ivar error_message: Optional error message for errored steps.
    :vartype error_message: str or None
    """

    model_config = ConfigDict(extra="forbid")

    step_index: int = Field(ge=1)
    extractor_id: str
    status: str
    text: Optional[str] = None
    text_characters: int = Field(default=0, ge=0)
    producer_extractor_id: Optional[str] = None
    source_step_index: Optional[int] = Field(default=None, ge=1)
    error_type: Optional[str] = None
    error_message: Optional[str] = None
biblicus/retrieval.py CHANGED
@@ -34,7 +34,6 @@ def create_recipe_manifest(
34
34
  :return: Deterministic recipe manifest.
35
35
  :rtype: RecipeManifest
36
36
  """
37
-
38
37
  config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
39
38
  recipe_seed = f"{backend_id}:{config_json}"
40
39
  recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
@@ -69,7 +68,6 @@ def create_run_manifest(
69
68
  :return: Run manifest.
70
69
  :rtype: RetrievalRun
71
70
  """
72
-
73
71
  catalog = corpus.load_catalog()
74
72
  created_at = utc_now_iso()
75
73
  run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
@@ -93,7 +91,6 @@ def hash_text(text: str) -> str:
93
91
  :return: Secure Hash Algorithm 256 hex digest.
94
92
  :rtype: str
95
93
  """
96
-
97
94
  return hashlib.sha256(text.encode("utf-8")).hexdigest()
98
95
 
99
96
 
@@ -108,7 +105,6 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
108
105
  :return: Evidence list respecting the budget.
109
106
  :rtype: list[Evidence]
110
107
  """
111
-
112
108
  selected_evidence: List[Evidence] = []
113
109
  source_counts: Dict[str, int] = {}
114
110
  total_characters = 0
biblicus/sources.py CHANGED
@@ -21,7 +21,6 @@ def _looks_like_uri(value: str) -> bool:
21
21
  :return: True if the string has a valid uniform resource identifier scheme prefix.
22
22
  :rtype: bool
23
23
  """
24
-
25
24
  return "://" in value and value.split("://", 1)[0].isidentifier()
26
25
 
27
26
 
@@ -34,7 +33,6 @@ def _filename_from_url_path(path: str) -> str:
34
33
  :return: Filename or a fallback name.
35
34
  :rtype: str
36
35
  """
37
-
38
36
  filename = Path(unquote(path)).name
39
37
  return filename or "download"
40
38
 
@@ -48,7 +46,6 @@ def _media_type_from_filename(name: str) -> str:
48
46
  :return: Guessed media type or application/octet-stream.
49
47
  :rtype: str
50
48
  """
51
-
52
49
  media_type, _ = mimetypes.guess_type(name)
53
50
  return media_type or "application/octet-stream"
54
51
 
@@ -62,7 +59,6 @@ def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
62
59
  :return: Detected media type or None.
63
60
  :rtype: str or None
64
61
  """
65
-
66
62
  prefix = data[:32]
67
63
  if prefix.startswith(b"%PDF-"):
68
64
  return "application/pdf"
@@ -70,11 +66,46 @@ def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
70
66
  return "image/png"
71
67
  if prefix[:3] == b"\xff\xd8\xff":
72
68
  return "image/jpeg"
73
- if prefix.lstrip().lower().startswith(b"<!doctype html") or prefix.lstrip().lower().startswith(b"<html"):
69
+ if prefix.startswith(b"RIFF") and prefix[8:12] == b"WAVE":
70
+ return "audio/x-wav"
71
+ if prefix.startswith(b"ID3") or (
72
+ len(prefix) >= 2 and prefix[0] == 0xFF and (prefix[1] & 0xE0) == 0xE0
73
+ ):
74
+ return "audio/mpeg"
75
+ if prefix.startswith(b"OggS"):
76
+ return "audio/ogg"
77
+ if prefix.lstrip().lower().startswith(b"<!doctype html") or prefix.lstrip().lower().startswith(
78
+ b"<html"
79
+ ):
74
80
  return "text/html"
75
81
  return None
76
82
 
77
83
 
84
+ def _normalize_media_type(*, filename: str, media_type: str) -> str:
85
+ """
86
+ Normalize media types that are commonly mislabelled by upstream sources.
87
+
88
+ This function exists to keep the corpus usable for humans. When a source provides a filename
89
+ extension that users recognize (for example, ``.ogg``), Biblicus prefers a matching media type
90
+ so that downstream processing can make reasonable decisions.
91
+
92
+ :param filename: Filename associated with the payload.
93
+ :type filename: str
94
+ :param media_type: Media type reported or guessed for the payload.
95
+ :type media_type: str
96
+ :return: Normalized media type.
97
+ :rtype: str
98
+ """
99
+ suffix = Path(filename).suffix.lower()
100
+ if media_type in {"application/ogg", "application/x-ogg"} and suffix in {
101
+ ".ogg",
102
+ ".oga",
103
+ ".ogx",
104
+ }:
105
+ return "audio/ogg"
106
+ return media_type
107
+
108
+
78
109
  def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
79
110
  """
80
111
  Ensure the filename has a usable extension for the media type.
@@ -86,10 +117,12 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
86
117
  :return: Filename with extension.
87
118
  :rtype: str
88
119
  """
89
-
90
120
  if Path(filename).suffix:
91
121
  return filename
92
- ext = mimetypes.guess_extension(media_type) or ""
122
+ if media_type == "audio/ogg":
123
+ ext = ".ogg"
124
+ else:
125
+ ext = mimetypes.guess_extension(media_type) or ""
93
126
  return filename + ext if ext else filename
94
127
 
95
128
 
@@ -127,7 +160,6 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
127
160
  :raises ValueError: If a file:// uniform resource identifier has a non-local host.
128
161
  :raises NotImplementedError: If the uniform resource identifier scheme is unsupported.
129
162
  """
130
-
131
163
  if isinstance(source, Path):
132
164
  path = source.resolve()
133
165
  media_type = _media_type_from_filename(path.name)
@@ -144,7 +176,9 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
144
176
  parsed = urlparse(source)
145
177
  if parsed.scheme == "file":
146
178
  if parsed.netloc not in ("", "localhost"):
147
- raise ValueError(f"Unsupported file uniform resource identifier host: {parsed.netloc!r}")
179
+ raise ValueError(
180
+ f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
181
+ )
148
182
  path = Path(unquote(parsed.path)).resolve()
149
183
  return load_source(path, source_uri=source_uri or source)
150
184
 
@@ -160,6 +194,7 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
160
194
  if sniffed:
161
195
  media_type = sniffed
162
196
  filename = _ensure_extension_for_media_type(filename, media_type)
197
+ media_type = _normalize_media_type(filename=filename, media_type=media_type)
163
198
  if Path(filename).suffix.lower() in {".md", ".markdown"}:
164
199
  media_type = "text/markdown"
165
200
  return SourcePayload(
biblicus/time.py CHANGED
@@ -14,5 +14,4 @@ def utc_now_iso() -> str:
14
14
  :return: Current Coordinated Universal Time timestamp in International Organization for Standardization 8601 format.
15
15
  :rtype: str
16
16
  """
17
-
18
17
  return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
biblicus/uris.py CHANGED
@@ -18,7 +18,6 @@ def _looks_like_uri(value: str) -> bool:
18
18
  :return: True if the string has a valid uniform resource identifier scheme prefix.
19
19
  :rtype: bool
20
20
  """
21
-
22
21
  return "://" in value and value.split("://", 1)[0].isidentifier()
23
22
 
24
23
 
@@ -33,7 +32,6 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
33
32
  :raises NotImplementedError: If a non-file uniform resource identifier scheme is used.
34
33
  :raises ValueError: If a file:// uniform resource identifier has a non-local host.
35
34
  """
36
-
37
35
  if isinstance(ref, Path):
38
36
  return ref.resolve()
39
37
 
@@ -45,7 +43,9 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
45
43
  f"(got {parsed.scheme}://)"
46
44
  )
47
45
  if parsed.netloc not in ("", "localhost"):
48
- raise ValueError(f"Unsupported file uniform resource identifier host: {parsed.netloc!r}")
46
+ raise ValueError(
47
+ f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
48
+ )
49
49
  return Path(unquote(parsed.path)).resolve()
50
50
 
51
51
  return Path(ref).resolve()
@@ -60,5 +60,4 @@ def normalize_corpus_uri(ref: Union[str, Path]) -> str:
60
60
  :return: Canonical file:// uniform resource identifier.
61
61
  :rtype: str
62
62
  """
63
-
64
63
  return corpus_ref_to_path(ref).as_uri()