PyPI - extract-python - Versions diffs - 0.3.1__tar.gz → 0.3.2__tar.gz - Mend

extract-python 0.3.1.tar.gz → 0.3.2.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{extract_python-0.3.1 → extract_python-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.3.1
+Version: 0.3.2
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python

{extract_python-0.3.1 → extract_python-0.3.2}/extract_python/docling_.py RENAMED Viewed

@@ -32,7 +32,7 @@ if TYPE_CHECKING:
 def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
-    from docling.datamodel.pipeline_options import PdfPipelineOptions
+    from docling.datamodel.pipeline_options import PdfPipelineOptions  # noqa: PLC0415
     if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
         msg = "generate_picture_images should be set to true"
@@ -49,11 +49,11 @@ def _validate_options(
 @cache
 def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
-    from docling.datamodel.pipeline_options import (
+    from docling.datamodel.pipeline_options import (  # noqa: PLC0415
         EasyOcrOptions,
         PdfPipelineOptions,
     )
-    from docling.document_converter import PdfFormatOption
+    from docling.document_converter import PdfFormatOption  # noqa: PLC0415
     return {
         InputFormat.PDF: PdfFormatOption(
@@ -80,23 +80,21 @@ class DoclingPipelineConfig(PipelineConfig):
     task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
     format_options: Annotated[
-        dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
+        dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
     ] = Field(default_factory=_default_format_opts)
-    _unsupported_input_formats: ClassVar[set[InputFormat]] = {
-        InputFormat.AUDIO,
-        InputFormat.METS_GBS,
-        InputFormat.VTT,
-    }
     @classmethod
     @cache
     def supported_exts(cls) -> set[SupportedExt]:
-        from docling.datamodel.base_models import FormatToExtensions, InputFormat
+        from docling.datamodel.base_models import (  # noqa: PLC0415
+            FormatToExtensions,
+            InputFormat,
+        )
+        unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
         supported = set()
         for f in InputFormat:
-            if f in cls._unsupported_input_formats:
+            if f in unsupported:
                 continue
             for ext in FormatToExtensions[f]:
                 supported.add(SupportedExt(f".{ext.lower()}"))
@@ -105,8 +103,10 @@ class DoclingPipelineConfig(PipelineConfig):
 @Pipeline.register(PipelineType.DOCLING)
 class DoclingPipeline(Pipeline):
-    def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
-        from docling.document_converter import DocumentConverter
+    def __init__(
+        self, format_options: dict["InputFormat", "FormatOption"] | None = None
+    ):
+        from docling.document_converter import DocumentConverter  # noqa: PLC0415
         allowed_format = [
             f.to_docling() for f in DoclingPipelineConfig.supported_exts()
@@ -128,7 +128,7 @@ class DoclingPipeline(Pipeline):
         return cls(config.format_options)
-def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | "DocumentStream"]:
+def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
     for d in docs:
         yield d.to_docling()
@@ -160,7 +160,7 @@ def _to_markdown_doc(
     page_sep: str = DEFAULT_MD_PAGE_SEP,
     **kwargs,
 ) -> MarkdownDoc:
-    from docling_core.types.doc import ImageRefMode
+    from docling_core.types.doc import ImageRefMode  # noqa: PLC0415
     # TODO: Should we add a hash to avoid collision between files with same names
     #  nested in the tree structured

{extract_python-0.3.1 → extract_python-0.3.2}/extract_python/marker_.py RENAMED Viewed

@@ -73,9 +73,9 @@ class MarkerPipeline(Pipeline):
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
-        from marker.config.parser import ConfigParser
-        from marker.converters.pdf import PdfConverter
-        from marker.models import create_model_dict
+        from marker.config.parser import ConfigParser  # noqa: PLC0415
+        from marker.converters.pdf import PdfConverter  # noqa: PLC0415
+        from marker.models import create_model_dict  # noqa: PLC0415
         config = deepcopy(self._marker_config)
         config["output_format"] = output_format.to_marker()
@@ -102,7 +102,7 @@ def _process_doc(
     output_format: OutputFormat,
     output_path: Path,
 ) -> Result:
-    from marker.output import text_from_rendered
+    from marker.output import text_from_rendered  # noqa: PLC0415
     rendered = converter(str(doc.path))
     content, _, images = text_from_rendered(rendered)
@@ -118,7 +118,7 @@ def _process_doc(
 def _to_markdown_doc(
     input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
 ) -> MarkdownDoc:
-    from marker.renderers.markdown import MarkdownRenderer
+    from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
     # TODO: Should we add a hash to avoid collision between files with same names
     #  nested in the tree structured

{extract_python-0.3.1 → extract_python-0.3.2}/extract_python/miner_u.py RENAMED Viewed

@@ -52,13 +52,12 @@ class MinerUConfig(BaseModel):
     @classmethod
     @cache
     def _get_default_kwargs(cls) -> dict[str, Any]:
-        from mineru.utils.enum_class import MakeMode
+        from mineru.utils.enum_class import MakeMode  # noqa: PLC0415
         return {
             "server_url": None,
-            # We don't dump md directly we process, we dump the middle json in order to be
-            # able to get page indexes
+            # We don't dump md directly we process, we dump the middle json in order
+            # to be able to get page indexes
             "parse_method": "auto",
             "dump_md": False,
             "dump_middle_json": True,
@@ -104,7 +103,7 @@ class MinerUPipeline(Pipeline):
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
-        from mineru.cli.common import aio_do_parse
+        from mineru.cli.common import aio_do_parse  # noqa: PLC0415
         docs = list(docs)
         # TODO: exclude files which are not pdf and return an error
@@ -154,13 +153,15 @@ def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
     match backend:
         case MinerUBackend.PIPELINE:
-            from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
+            from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (  # noqa: PLC0415
                 union_make,
             )
             return union_make
         case MinerUBackend.VLM:
-            from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
+            from mineru.backend.vlm.vlm_middle_json_mkcontent import (  # noqa: PLC0415
+                union_make,
+            )
             return union_make
         case _:
@@ -212,7 +213,7 @@ def _dump_md_content(
     im_dir: Path,
     md_make_mode: str | None = None,
 ) -> ConversionOutput:
-    from mineru.utils.enum_class import MakeMode
+    from mineru.utils.enum_class import MakeMode  # noqa: PLC0415
     if md_make_mode is None:
         md_make_mode = MakeMode.MM_MD