biblicus 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +2 -2
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +0 -2
- biblicus/backends/base.py +3 -3
- biblicus/backends/scan.py +21 -15
- biblicus/backends/sqlite_full_text_search.py +14 -15
- biblicus/cli.py +33 -49
- biblicus/corpus.py +39 -58
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +4 -8
- biblicus/extraction.py +276 -77
- biblicus/extractors/__init__.py +14 -3
- biblicus/extractors/base.py +12 -5
- biblicus/extractors/metadata_text.py +13 -5
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +16 -6
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +0 -3
- biblicus/hook_logging.py +0 -5
- biblicus/hook_manager.py +3 -5
- biblicus/hooks.py +3 -7
- biblicus/ignore.py +0 -3
- biblicus/models.py +87 -0
- biblicus/retrieval.py +0 -4
- biblicus/sources.py +44 -9
- biblicus/time.py +0 -1
- biblicus/uris.py +3 -4
- biblicus/user_config.py +138 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/METADATA +78 -16
- biblicus-0.3.0.dist-info/RECORD +44 -0
- biblicus/extractors/cascade.py +0 -101
- biblicus-0.2.0.dist-info/RECORD +0 -32
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/WHEEL +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.2.0.dist-info → biblicus-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Selection extractor that chooses text from previous pipeline outputs.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
|
+
|
|
11
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
12
|
+
from .base import TextExtractor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SelectTextExtractorConfig(BaseModel):
|
|
16
|
+
"""
|
|
17
|
+
Configuration for the selection extractor.
|
|
18
|
+
|
|
19
|
+
The selection extractor is intentionally minimal and requires no configuration.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
model_config = ConfigDict(extra="forbid")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SelectTextExtractor(TextExtractor):
|
|
26
|
+
"""
|
|
27
|
+
Extractor plugin that selects from previous pipeline outputs.
|
|
28
|
+
|
|
29
|
+
This extractor is used as a final step when you want to make an explicit choice among
|
|
30
|
+
multiple extraction outputs in the same pipeline.
|
|
31
|
+
|
|
32
|
+
It selects the first usable extracted text in pipeline order. Usable means the text is
|
|
33
|
+
non-empty after stripping whitespace. If no usable text exists but prior extracted text
|
|
34
|
+
exists, it selects the first extracted text even if it is empty.
|
|
35
|
+
|
|
36
|
+
:ivar extractor_id: Extractor identifier.
|
|
37
|
+
:vartype extractor_id: str
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
extractor_id = "select-text"
|
|
41
|
+
|
|
42
|
+
def validate_config(self, config: Dict[str, Any]) -> BaseModel:
|
|
43
|
+
"""
|
|
44
|
+
Validate selection extractor configuration.
|
|
45
|
+
|
|
46
|
+
:param config: Configuration mapping.
|
|
47
|
+
:type config: dict[str, Any]
|
|
48
|
+
:return: Parsed configuration.
|
|
49
|
+
:rtype: SelectTextExtractorConfig
|
|
50
|
+
"""
|
|
51
|
+
return SelectTextExtractorConfig.model_validate(config)
|
|
52
|
+
|
|
53
|
+
def extract_text(
|
|
54
|
+
self,
|
|
55
|
+
*,
|
|
56
|
+
corpus,
|
|
57
|
+
item: CatalogItem,
|
|
58
|
+
config: BaseModel,
|
|
59
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
60
|
+
) -> Optional[ExtractedText]:
|
|
61
|
+
"""
|
|
62
|
+
Select extracted text from previous pipeline outputs.
|
|
63
|
+
|
|
64
|
+
:param corpus: Corpus containing the item bytes.
|
|
65
|
+
:type corpus: Corpus
|
|
66
|
+
:param item: Catalog item being processed.
|
|
67
|
+
:type item: CatalogItem
|
|
68
|
+
:param config: Parsed configuration model.
|
|
69
|
+
:type config: SelectTextExtractorConfig
|
|
70
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
71
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
72
|
+
:return: Selected extracted text payload or None when no prior outputs exist.
|
|
73
|
+
:rtype: ExtractedText or None
|
|
74
|
+
"""
|
|
75
|
+
_ = corpus
|
|
76
|
+
_ = item
|
|
77
|
+
_ = config
|
|
78
|
+
|
|
79
|
+
extracted_candidates = [entry for entry in previous_extractions if entry.text is not None]
|
|
80
|
+
usable_candidates = [entry for entry in extracted_candidates if entry.text.strip()]
|
|
81
|
+
|
|
82
|
+
if usable_candidates:
|
|
83
|
+
candidate = usable_candidates[0]
|
|
84
|
+
producer = candidate.producer_extractor_id or candidate.extractor_id
|
|
85
|
+
return ExtractedText(
|
|
86
|
+
text=candidate.text or "",
|
|
87
|
+
producer_extractor_id=producer,
|
|
88
|
+
source_step_index=candidate.step_index,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
if extracted_candidates:
|
|
92
|
+
candidate = extracted_candidates[0]
|
|
93
|
+
producer = candidate.producer_extractor_id or candidate.extractor_id
|
|
94
|
+
return ExtractedText(
|
|
95
|
+
text=candidate.text or "",
|
|
96
|
+
producer_extractor_id=producer,
|
|
97
|
+
source_step_index=candidate.step_index,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
return None
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unstructured-based text extraction plugin.
|
|
3
|
+
|
|
4
|
+
This extractor is implemented as an optional dependency so the core installation stays small.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict
|
|
12
|
+
|
|
13
|
+
from ..corpus import Corpus
|
|
14
|
+
from ..errors import ExtractionRunFatalError
|
|
15
|
+
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
|
|
16
|
+
from .base import TextExtractor
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class UnstructuredExtractorConfig(BaseModel):
|
|
20
|
+
"""
|
|
21
|
+
Configuration for the Unstructured extractor.
|
|
22
|
+
|
|
23
|
+
Version zero does not expose any configuration for this extractor.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
model_config = ConfigDict(extra="forbid")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class UnstructuredExtractor(TextExtractor):
|
|
30
|
+
"""
|
|
31
|
+
Extractor plugin backed by the `unstructured` library.
|
|
32
|
+
|
|
33
|
+
The intent is broad format coverage as a last-resort extractor. This extractor skips items
|
|
34
|
+
that are already text so the pass-through extractor remains the canonical choice for text
|
|
35
|
+
items and Markdown front matter handling.
|
|
36
|
+
|
|
37
|
+
:ivar extractor_id: Extractor identifier.
|
|
38
|
+
:vartype extractor_id: str
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
extractor_id = "unstructured"
|
|
42
|
+
|
|
43
|
+
def validate_config(self, config: Dict[str, Any]) -> BaseModel:
|
|
44
|
+
"""
|
|
45
|
+
Validate extractor configuration and ensure the dependency is installed.
|
|
46
|
+
|
|
47
|
+
:param config: Configuration mapping.
|
|
48
|
+
:type config: dict[str, Any]
|
|
49
|
+
:return: Parsed config.
|
|
50
|
+
:rtype: UnstructuredExtractorConfig
|
|
51
|
+
:raises ExtractionRunFatalError: If the optional dependency is not installed.
|
|
52
|
+
"""
|
|
53
|
+
try:
|
|
54
|
+
from unstructured.partition.auto import partition # noqa: F401
|
|
55
|
+
except ImportError as import_error:
|
|
56
|
+
raise ExtractionRunFatalError(
|
|
57
|
+
"Unstructured extractor requires an optional dependency. "
|
|
58
|
+
'Install it with pip install "biblicus[unstructured]".'
|
|
59
|
+
) from import_error
|
|
60
|
+
return UnstructuredExtractorConfig.model_validate(config)
|
|
61
|
+
|
|
62
|
+
def extract_text(
|
|
63
|
+
self,
|
|
64
|
+
*,
|
|
65
|
+
corpus: Corpus,
|
|
66
|
+
item: CatalogItem,
|
|
67
|
+
config: BaseModel,
|
|
68
|
+
previous_extractions: List[ExtractionStepOutput],
|
|
69
|
+
) -> Optional[ExtractedText]:
|
|
70
|
+
"""
|
|
71
|
+
Extract text for a non-text item using Unstructured.
|
|
72
|
+
|
|
73
|
+
:param corpus: Corpus containing the item bytes.
|
|
74
|
+
:type corpus: Corpus
|
|
75
|
+
:param item: Catalog item being processed.
|
|
76
|
+
:type item: CatalogItem
|
|
77
|
+
:param config: Parsed configuration model.
|
|
78
|
+
:type config: UnstructuredExtractorConfig
|
|
79
|
+
:param previous_extractions: Prior step outputs for this item within the pipeline.
|
|
80
|
+
:type previous_extractions: list[biblicus.models.ExtractionStepOutput]
|
|
81
|
+
:return: Extracted text payload, or None when the item is already text.
|
|
82
|
+
:rtype: ExtractedText or None
|
|
83
|
+
"""
|
|
84
|
+
_ = config
|
|
85
|
+
_ = previous_extractions
|
|
86
|
+
media_type = item.media_type
|
|
87
|
+
if media_type == "text/markdown" or media_type.startswith("text/"):
|
|
88
|
+
return None
|
|
89
|
+
|
|
90
|
+
from unstructured.partition.auto import partition
|
|
91
|
+
|
|
92
|
+
source_path = corpus.root / item.relpath
|
|
93
|
+
elements = partition(filename=str(source_path))
|
|
94
|
+
lines: list[str] = []
|
|
95
|
+
for element in elements or []:
|
|
96
|
+
text = getattr(element, "text", None)
|
|
97
|
+
if isinstance(text, str) and text.strip():
|
|
98
|
+
lines.append(text.strip())
|
|
99
|
+
combined_text = "\n".join(lines).strip()
|
|
100
|
+
return ExtractedText(text=combined_text, producer_extractor_id=self.extractor_id)
|
biblicus/frontmatter.py
CHANGED
|
@@ -35,7 +35,6 @@ def parse_front_matter(text: str) -> FrontMatterDocument:
|
|
|
35
35
|
:rtype: FrontMatterDocument
|
|
36
36
|
:raises ValueError: If front matter is present but not a mapping.
|
|
37
37
|
"""
|
|
38
|
-
|
|
39
38
|
if not text.startswith("---\n"):
|
|
40
39
|
return FrontMatterDocument(metadata={}, body=text)
|
|
41
40
|
|
|
@@ -64,7 +63,6 @@ def render_front_matter(metadata: Dict[str, Any], body: str) -> str:
|
|
|
64
63
|
:return: Markdown with Yet Another Markup Language front matter.
|
|
65
64
|
:rtype: str
|
|
66
65
|
"""
|
|
67
|
-
|
|
68
66
|
if not metadata:
|
|
69
67
|
return body
|
|
70
68
|
|
|
@@ -87,6 +85,5 @@ def split_markdown_front_matter(path_text: str) -> Tuple[Dict[str, Any], str]:
|
|
|
87
85
|
:return: Metadata mapping and body text.
|
|
88
86
|
:rtype: tuple[dict[str, Any], str]
|
|
89
87
|
"""
|
|
90
|
-
|
|
91
88
|
parsed_document = parse_front_matter(path_text)
|
|
92
89
|
return parsed_document.metadata, parsed_document.body
|
biblicus/hook_logging.py
CHANGED
|
@@ -23,7 +23,6 @@ def new_operation_id() -> str:
|
|
|
23
23
|
:return: Operation identifier.
|
|
24
24
|
:rtype: str
|
|
25
25
|
"""
|
|
26
|
-
|
|
27
26
|
return str(uuid.uuid4())
|
|
28
27
|
|
|
29
28
|
|
|
@@ -36,7 +35,6 @@ def redact_source_uri(source_uri: str) -> str:
|
|
|
36
35
|
:return: Redacted source uniform resource identifier.
|
|
37
36
|
:rtype: str
|
|
38
37
|
"""
|
|
39
|
-
|
|
40
38
|
parsed = urlparse(source_uri)
|
|
41
39
|
|
|
42
40
|
if not parsed.scheme:
|
|
@@ -117,7 +115,6 @@ class HookLogger:
|
|
|
117
115
|
:param operation_id: Operation identifier for grouping records.
|
|
118
116
|
:type operation_id: str
|
|
119
117
|
"""
|
|
120
|
-
|
|
121
118
|
self.log_dir = log_dir
|
|
122
119
|
self.operation_id = operation_id
|
|
123
120
|
|
|
@@ -129,7 +126,6 @@ class HookLogger:
|
|
|
129
126
|
:return: Log file path.
|
|
130
127
|
:rtype: Path
|
|
131
128
|
"""
|
|
132
|
-
|
|
133
129
|
return self.log_dir / f"{self.operation_id}.jsonl"
|
|
134
130
|
|
|
135
131
|
def record(
|
|
@@ -166,7 +162,6 @@ class HookLogger:
|
|
|
166
162
|
:return: None.
|
|
167
163
|
:rtype: None
|
|
168
164
|
"""
|
|
169
|
-
|
|
170
165
|
self.log_dir.mkdir(parents=True, exist_ok=True)
|
|
171
166
|
entry = HookLogEntry(
|
|
172
167
|
operation_id=self.operation_id,
|
biblicus/hook_manager.py
CHANGED
|
@@ -55,7 +55,6 @@ class HookManager:
|
|
|
55
55
|
:param operation_id: Optional operation identifier override.
|
|
56
56
|
:type operation_id: str or None
|
|
57
57
|
"""
|
|
58
|
-
|
|
59
58
|
self.corpus_uri = corpus_uri
|
|
60
59
|
self.log_dir = log_dir
|
|
61
60
|
self.operation_id = operation_id or new_operation_id()
|
|
@@ -63,7 +62,9 @@ class HookManager:
|
|
|
63
62
|
self._logger = HookLogger(log_dir=self.log_dir, operation_id=self.operation_id)
|
|
64
63
|
|
|
65
64
|
@classmethod
|
|
66
|
-
def from_config(
|
|
65
|
+
def from_config(
|
|
66
|
+
cls, *, corpus_root: Path, corpus_uri: str, hook_specs: Iterable[HookSpec]
|
|
67
|
+
) -> "HookManager":
|
|
67
68
|
"""
|
|
68
69
|
Build a hook manager from config data.
|
|
69
70
|
|
|
@@ -77,7 +78,6 @@ class HookManager:
|
|
|
77
78
|
:rtype: HookManager
|
|
78
79
|
:raises KeyError: If a hook identifier is unknown.
|
|
79
80
|
"""
|
|
80
|
-
|
|
81
81
|
log_dir = corpus_root / CORPUS_DIR_NAME / HOOK_LOGS_DIR_NAME
|
|
82
82
|
hooks: List[LifecycleHook] = []
|
|
83
83
|
|
|
@@ -124,7 +124,6 @@ class HookManager:
|
|
|
124
124
|
:rtype: IngestMutation
|
|
125
125
|
:raises ValueError: If ingestion is denied by a hook.
|
|
126
126
|
"""
|
|
127
|
-
|
|
128
127
|
context = IngestHookContext(
|
|
129
128
|
hook_point=hook_point,
|
|
130
129
|
operation_id=self.operation_id,
|
|
@@ -195,7 +194,6 @@ class HookManager:
|
|
|
195
194
|
:rtype: dict[str, Any]
|
|
196
195
|
:raises ValueError: If a hook raises an exception.
|
|
197
196
|
"""
|
|
198
|
-
|
|
199
197
|
try:
|
|
200
198
|
result = hook.run(context)
|
|
201
199
|
except Exception as exc:
|
biblicus/hooks.py
CHANGED
|
@@ -164,7 +164,6 @@ class LifecycleHook:
|
|
|
164
164
|
:rtype: HookResult
|
|
165
165
|
:raises NotImplementedError: If the hook does not implement run.
|
|
166
166
|
"""
|
|
167
|
-
|
|
168
167
|
_ = context
|
|
169
168
|
raise NotImplementedError("LifecycleHook.run must be implemented by concrete hooks")
|
|
170
169
|
|
|
@@ -192,7 +191,6 @@ class AddTagsHook:
|
|
|
192
191
|
:param tags: Tags to add.
|
|
193
192
|
:type tags: Sequence[str]
|
|
194
193
|
"""
|
|
195
|
-
|
|
196
194
|
self.hook_points = list(hook_points)
|
|
197
195
|
self.tags = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
|
|
198
196
|
|
|
@@ -205,7 +203,6 @@ class AddTagsHook:
|
|
|
205
203
|
:return: Ingest mutation result.
|
|
206
204
|
:rtype: HookResult
|
|
207
205
|
"""
|
|
208
|
-
|
|
209
206
|
_ = context
|
|
210
207
|
return IngestMutation(add_tags=list(self.tags))
|
|
211
208
|
|
|
@@ -229,7 +226,6 @@ class DenyAllHook:
|
|
|
229
226
|
:param hook_points: Hook points where the hook runs.
|
|
230
227
|
:type hook_points: Sequence[HookPoint]
|
|
231
228
|
"""
|
|
232
|
-
|
|
233
229
|
self.hook_points = list(hook_points)
|
|
234
230
|
|
|
235
231
|
def run(self, context: HookContext) -> HookResult:
|
|
@@ -241,7 +237,6 @@ class DenyAllHook:
|
|
|
241
237
|
:return: Ingest denial result.
|
|
242
238
|
:rtype: HookResult
|
|
243
239
|
"""
|
|
244
|
-
|
|
245
240
|
_ = context
|
|
246
241
|
return IngestMutation(deny=True, deny_reason="Ingest denied by deny-all hook")
|
|
247
242
|
|
|
@@ -256,10 +251,11 @@ def build_builtin_hook(spec: HookSpec) -> LifecycleHook:
|
|
|
256
251
|
:rtype: LifecycleHook
|
|
257
252
|
:raises KeyError: If the hook identifier is unknown.
|
|
258
253
|
"""
|
|
259
|
-
|
|
260
254
|
if spec.hook_id == AddTagsHook.hook_id:
|
|
261
255
|
tags = spec.config.get("tags") or []
|
|
262
|
-
return AddTagsHook(
|
|
256
|
+
return AddTagsHook(
|
|
257
|
+
hook_points=spec.hook_points, tags=tags if isinstance(tags, list) else []
|
|
258
|
+
)
|
|
263
259
|
if spec.hook_id == DenyAllHook.hook_id:
|
|
264
260
|
return DenyAllHook(hook_points=spec.hook_points)
|
|
265
261
|
raise KeyError(f"Unknown hook_id {spec.hook_id!r}")
|
biblicus/ignore.py
CHANGED
|
@@ -34,7 +34,6 @@ class CorpusIgnoreSpec(BaseModel):
|
|
|
34
34
|
:return: True if the path should be ignored.
|
|
35
35
|
:rtype: bool
|
|
36
36
|
"""
|
|
37
|
-
|
|
38
37
|
normalized = relpath.replace("\\", "/").lstrip("/")
|
|
39
38
|
return any(fnmatch.fnmatch(normalized, pattern) for pattern in self.patterns)
|
|
40
39
|
|
|
@@ -50,7 +49,6 @@ def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
|
|
|
50
49
|
:return: Parsed ignore specification.
|
|
51
50
|
:rtype: CorpusIgnoreSpec
|
|
52
51
|
"""
|
|
53
|
-
|
|
54
52
|
ignore_path = corpus_root / ".biblicusignore"
|
|
55
53
|
if not ignore_path.is_file():
|
|
56
54
|
return CorpusIgnoreSpec(patterns=[])
|
|
@@ -64,4 +62,3 @@ def load_corpus_ignore_spec(corpus_root: Path) -> CorpusIgnoreSpec:
|
|
|
64
62
|
continue
|
|
65
63
|
patterns.append(line)
|
|
66
64
|
return CorpusIgnoreSpec(patterns=patterns)
|
|
67
|
-
|
biblicus/models.py
CHANGED
|
@@ -142,6 +142,53 @@ class CorpusCatalog(BaseModel):
|
|
|
142
142
|
return self
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
class ExtractionRunReference(BaseModel):
    """
    Reference to an extraction run.

    :ivar extractor_id: Extractor plugin identifier.
    :vartype extractor_id: str
    :ivar run_id: Extraction run identifier.
    :vartype run_id: str
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)

    def as_string(self) -> str:
        """
        Serialize the reference as a single string.

        :return: Reference in the form extractor_id:run_id.
        :rtype: str
        """
        return ":".join((self.extractor_id, self.run_id))
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
    """
    Parse an extraction run reference in the form extractor_id:run_id.

    :param value: Raw reference string.
    :type value: str
    :return: Parsed extraction run reference.
    :rtype: ExtractionRunReference
    :raises ValueError: If the reference is not well formed.
    """
    if ":" not in value:
        raise ValueError("Extraction run reference must be extractor_id:run_id")
    # Split on the first colon only; run identifiers may themselves contain colons.
    head, _, tail = value.partition(":")
    head = head.strip()
    tail = tail.strip()
    if not (head and tail):
        raise ValueError(
            "Extraction run reference must be extractor_id:run_id with non-empty parts"
        )
    return ExtractionRunReference(extractor_id=head, run_id=tail)
|
|
190
|
+
|
|
191
|
+
|
|
145
192
|
class QueryBudget(BaseModel):
|
|
146
193
|
"""
|
|
147
194
|
Evidence selection budget for retrieval.
|
|
@@ -319,9 +366,49 @@ class ExtractedText(BaseModel):
|
|
|
319
366
|
:vartype text: str
|
|
320
367
|
:ivar producer_extractor_id: Extractor identifier that produced this text.
|
|
321
368
|
:vartype producer_extractor_id: str
|
|
369
|
+
:ivar source_step_index: Optional pipeline step index where this text originated.
|
|
370
|
+
:vartype source_step_index: int or None
|
|
322
371
|
"""
|
|
323
372
|
|
|
324
373
|
model_config = ConfigDict(extra="forbid")
|
|
325
374
|
|
|
326
375
|
text: str
|
|
327
376
|
producer_extractor_id: str = Field(min_length=1)
|
|
377
|
+
source_step_index: Optional[int] = Field(default=None, ge=1)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
class ExtractionStepOutput(BaseModel):
    """
    In-memory representation of a pipeline step output for a single item.

    :ivar step_index: One-based pipeline step index.
    :vartype step_index: int
    :ivar extractor_id: Extractor identifier for the step.
    :vartype extractor_id: str
    :ivar status: Step status, extracted, skipped, or errored.
    :vartype status: str
    :ivar text: Extracted text content, when produced.
    :vartype text: str or None
    :ivar text_characters: Character count of the extracted text.
    :vartype text_characters: int
    :ivar producer_extractor_id: Extractor identifier that produced the text content.
    :vartype producer_extractor_id: str or None
    :ivar source_step_index: Optional step index that supplied the text for selection-style extractors.
    :vartype source_step_index: int or None
    :ivar error_type: Optional error type name for errored steps.
    :vartype error_type: str or None
    :ivar error_message: Optional error message for errored steps.
    :vartype error_message: str or None
    """

    # Unknown keys are rejected outright (extra="forbid") so step records stay strict.
    model_config = ConfigDict(extra="forbid")

    step_index: int = Field(ge=1)  # one-based, matching source_step_index's ge=1 bound
    extractor_id: str
    status: str
    text: Optional[str] = None
    text_characters: int = Field(default=0, ge=0)
    producer_extractor_id: Optional[str] = None
    source_step_index: Optional[int] = Field(default=None, ge=1)
    error_type: Optional[str] = None
    error_message: Optional[str] = None
|
biblicus/retrieval.py
CHANGED
|
@@ -34,7 +34,6 @@ def create_recipe_manifest(
|
|
|
34
34
|
:return: Deterministic recipe manifest.
|
|
35
35
|
:rtype: RecipeManifest
|
|
36
36
|
"""
|
|
37
|
-
|
|
38
37
|
config_json = json.dumps(config, sort_keys=True, separators=(",", ":"))
|
|
39
38
|
recipe_seed = f"{backend_id}:{config_json}"
|
|
40
39
|
recipe_id = hashlib.sha256(recipe_seed.encode("utf-8")).hexdigest()
|
|
@@ -69,7 +68,6 @@ def create_run_manifest(
|
|
|
69
68
|
:return: Run manifest.
|
|
70
69
|
:rtype: RetrievalRun
|
|
71
70
|
"""
|
|
72
|
-
|
|
73
71
|
catalog = corpus.load_catalog()
|
|
74
72
|
created_at = utc_now_iso()
|
|
75
73
|
run_id = hashlib.sha256(f"{recipe.recipe_id}:{created_at}".encode("utf-8")).hexdigest()
|
|
@@ -93,7 +91,6 @@ def hash_text(text: str) -> str:
|
|
|
93
91
|
:return: Secure Hash Algorithm 256 hex digest.
|
|
94
92
|
:rtype: str
|
|
95
93
|
"""
|
|
96
|
-
|
|
97
94
|
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
98
95
|
|
|
99
96
|
|
|
@@ -108,7 +105,6 @@ def apply_budget(evidence: Iterable[Evidence], budget: QueryBudget) -> List[Evid
|
|
|
108
105
|
:return: Evidence list respecting the budget.
|
|
109
106
|
:rtype: list[Evidence]
|
|
110
107
|
"""
|
|
111
|
-
|
|
112
108
|
selected_evidence: List[Evidence] = []
|
|
113
109
|
source_counts: Dict[str, int] = {}
|
|
114
110
|
total_characters = 0
|
biblicus/sources.py
CHANGED
|
@@ -21,7 +21,6 @@ def _looks_like_uri(value: str) -> bool:
|
|
|
21
21
|
:return: True if the string has a valid uniform resource identifier scheme prefix.
|
|
22
22
|
:rtype: bool
|
|
23
23
|
"""
|
|
24
|
-
|
|
25
24
|
return "://" in value and value.split("://", 1)[0].isidentifier()
|
|
26
25
|
|
|
27
26
|
|
|
@@ -34,7 +33,6 @@ def _filename_from_url_path(path: str) -> str:
|
|
|
34
33
|
:return: Filename or a fallback name.
|
|
35
34
|
:rtype: str
|
|
36
35
|
"""
|
|
37
|
-
|
|
38
36
|
filename = Path(unquote(path)).name
|
|
39
37
|
return filename or "download"
|
|
40
38
|
|
|
@@ -48,7 +46,6 @@ def _media_type_from_filename(name: str) -> str:
|
|
|
48
46
|
:return: Guessed media type or application/octet-stream.
|
|
49
47
|
:rtype: str
|
|
50
48
|
"""
|
|
51
|
-
|
|
52
49
|
media_type, _ = mimetypes.guess_type(name)
|
|
53
50
|
return media_type or "application/octet-stream"
|
|
54
51
|
|
|
@@ -62,7 +59,6 @@ def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
|
|
|
62
59
|
:return: Detected media type or None.
|
|
63
60
|
:rtype: str or None
|
|
64
61
|
"""
|
|
65
|
-
|
|
66
62
|
prefix = data[:32]
|
|
67
63
|
if prefix.startswith(b"%PDF-"):
|
|
68
64
|
return "application/pdf"
|
|
@@ -70,11 +66,46 @@ def _sniff_media_type_from_bytes(data: bytes) -> Optional[str]:
|
|
|
70
66
|
return "image/png"
|
|
71
67
|
if prefix[:3] == b"\xff\xd8\xff":
|
|
72
68
|
return "image/jpeg"
|
|
73
|
-
if prefix.
|
|
69
|
+
if prefix.startswith(b"RIFF") and prefix[8:12] == b"WAVE":
|
|
70
|
+
return "audio/x-wav"
|
|
71
|
+
if prefix.startswith(b"ID3") or (
|
|
72
|
+
len(prefix) >= 2 and prefix[0] == 0xFF and (prefix[1] & 0xE0) == 0xE0
|
|
73
|
+
):
|
|
74
|
+
return "audio/mpeg"
|
|
75
|
+
if prefix.startswith(b"OggS"):
|
|
76
|
+
return "audio/ogg"
|
|
77
|
+
if prefix.lstrip().lower().startswith(b"<!doctype html") or prefix.lstrip().lower().startswith(
|
|
78
|
+
b"<html"
|
|
79
|
+
):
|
|
74
80
|
return "text/html"
|
|
75
81
|
return None
|
|
76
82
|
|
|
77
83
|
|
|
84
|
+
def _normalize_media_type(*, filename: str, media_type: str) -> str:
|
|
85
|
+
"""
|
|
86
|
+
Normalize media types that are commonly mislabelled by upstream sources.
|
|
87
|
+
|
|
88
|
+
This function exists to keep the corpus usable for humans. When a source provides a filename
|
|
89
|
+
extension that users recognize (for example, ``.ogg``), Biblicus prefers a matching media type
|
|
90
|
+
so that downstream processing can make reasonable decisions.
|
|
91
|
+
|
|
92
|
+
:param filename: Filename associated with the payload.
|
|
93
|
+
:type filename: str
|
|
94
|
+
:param media_type: Media type reported or guessed for the payload.
|
|
95
|
+
:type media_type: str
|
|
96
|
+
:return: Normalized media type.
|
|
97
|
+
:rtype: str
|
|
98
|
+
"""
|
|
99
|
+
suffix = Path(filename).suffix.lower()
|
|
100
|
+
if media_type in {"application/ogg", "application/x-ogg"} and suffix in {
|
|
101
|
+
".ogg",
|
|
102
|
+
".oga",
|
|
103
|
+
".ogx",
|
|
104
|
+
}:
|
|
105
|
+
return "audio/ogg"
|
|
106
|
+
return media_type
|
|
107
|
+
|
|
108
|
+
|
|
78
109
|
def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
79
110
|
"""
|
|
80
111
|
Ensure the filename has a usable extension for the media type.
|
|
@@ -86,10 +117,12 @@ def _ensure_extension_for_media_type(filename: str, media_type: str) -> str:
|
|
|
86
117
|
:return: Filename with extension.
|
|
87
118
|
:rtype: str
|
|
88
119
|
"""
|
|
89
|
-
|
|
90
120
|
if Path(filename).suffix:
|
|
91
121
|
return filename
|
|
92
|
-
|
|
122
|
+
if media_type == "audio/ogg":
|
|
123
|
+
ext = ".ogg"
|
|
124
|
+
else:
|
|
125
|
+
ext = mimetypes.guess_extension(media_type) or ""
|
|
93
126
|
return filename + ext if ext else filename
|
|
94
127
|
|
|
95
128
|
|
|
@@ -127,7 +160,6 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
127
160
|
:raises ValueError: If a file:// uniform resource identifier has a non-local host.
|
|
128
161
|
:raises NotImplementedError: If the uniform resource identifier scheme is unsupported.
|
|
129
162
|
"""
|
|
130
|
-
|
|
131
163
|
if isinstance(source, Path):
|
|
132
164
|
path = source.resolve()
|
|
133
165
|
media_type = _media_type_from_filename(path.name)
|
|
@@ -144,7 +176,9 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
144
176
|
parsed = urlparse(source)
|
|
145
177
|
if parsed.scheme == "file":
|
|
146
178
|
if parsed.netloc not in ("", "localhost"):
|
|
147
|
-
raise ValueError(
|
|
179
|
+
raise ValueError(
|
|
180
|
+
f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
|
|
181
|
+
)
|
|
148
182
|
path = Path(unquote(parsed.path)).resolve()
|
|
149
183
|
return load_source(path, source_uri=source_uri or source)
|
|
150
184
|
|
|
@@ -160,6 +194,7 @@ def load_source(source: str | Path, *, source_uri: Optional[str] = None) -> Sour
|
|
|
160
194
|
if sniffed:
|
|
161
195
|
media_type = sniffed
|
|
162
196
|
filename = _ensure_extension_for_media_type(filename, media_type)
|
|
197
|
+
media_type = _normalize_media_type(filename=filename, media_type=media_type)
|
|
163
198
|
if Path(filename).suffix.lower() in {".md", ".markdown"}:
|
|
164
199
|
media_type = "text/markdown"
|
|
165
200
|
return SourcePayload(
|
biblicus/time.py
CHANGED
biblicus/uris.py
CHANGED
|
@@ -18,7 +18,6 @@ def _looks_like_uri(value: str) -> bool:
|
|
|
18
18
|
:return: True if the string has a valid uniform resource identifier scheme prefix.
|
|
19
19
|
:rtype: bool
|
|
20
20
|
"""
|
|
21
|
-
|
|
22
21
|
return "://" in value and value.split("://", 1)[0].isidentifier()
|
|
23
22
|
|
|
24
23
|
|
|
@@ -33,7 +32,6 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
|
|
|
33
32
|
:raises NotImplementedError: If a non-file uniform resource identifier scheme is used.
|
|
34
33
|
:raises ValueError: If a file:// uniform resource identifier has a non-local host.
|
|
35
34
|
"""
|
|
36
|
-
|
|
37
35
|
if isinstance(ref, Path):
|
|
38
36
|
return ref.resolve()
|
|
39
37
|
|
|
@@ -45,7 +43,9 @@ def corpus_ref_to_path(ref: Union[str, Path]) -> Path:
|
|
|
45
43
|
f"(got {parsed.scheme}://)"
|
|
46
44
|
)
|
|
47
45
|
if parsed.netloc not in ("", "localhost"):
|
|
48
|
-
raise ValueError(
|
|
46
|
+
raise ValueError(
|
|
47
|
+
f"Unsupported file uniform resource identifier host: {parsed.netloc!r}"
|
|
48
|
+
)
|
|
49
49
|
return Path(unquote(parsed.path)).resolve()
|
|
50
50
|
|
|
51
51
|
return Path(ref).resolve()
|
|
@@ -60,5 +60,4 @@ def normalize_corpus_uri(ref: Union[str, Path]) -> str:
|
|
|
60
60
|
:return: Canonical file:// uniform resource identifier.
|
|
61
61
|
:rtype: str
|
|
62
62
|
"""
|
|
63
|
-
|
|
64
63
|
return corpus_ref_to_path(ref).as_uri()
|