PyPI - extract-python - Versions diffs - 0.3.0__tar.gz → 0.3.2__tar.gz - Mend

extract-python 0.3.0 (tar.gz) → 0.3.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{extract_python-0.3.0 → extract_python-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.3.0
+Version: 0.3.2
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python

{extract_python-0.3.0 → extract_python-0.3.2}/extract_python/docling_.py RENAMED Viewed

@@ -3,18 +3,8 @@ import tempfile
 from collections.abc import AsyncGenerator, Iterable, Iterator
 from functools import cache
 from pathlib import Path
-from typing import Annotated, ClassVar, TypeVar
-from docling.datamodel.base_models import FormatToExtensions, InputFormat
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    PdfPipelineOptions,
-    PipelineOptions,
-)
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-from docling_core.types.doc import ImageRefMode
-from docling_core.types.io import DocumentStream
+from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
 from icij_common.registrable import FromConfig
 from pydantic import AfterValidator, Field
@@ -34,28 +24,45 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
 DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
+if TYPE_CHECKING:
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.pipeline_options import PipelineOptions
+    from docling.document_converter import ConversionResult, FormatOption
+    from docling_core.types.io import DocumentStream
+def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
+    from docling.datamodel.pipeline_options import PdfPipelineOptions  # noqa: PLC0415
-def _validate_pipeline_opts(opts: PipelineOptions) -> None:
     if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
         msg = "generate_picture_images should be set to true"
         raise ValueError(msg)
 def _validate_options(
-    data: dict[InputFormat, FormatOption],
-) -> dict[InputFormat, FormatOption]:
+    data: dict["InputFormat", "FormatOption"],
+) -> dict["InputFormat", "FormatOption"]:
     for opts in data.values():
         _validate_pipeline_opts(opts.pipeline_options)
     return data
-_DEFAULT_FORMAT_OPTS = {
-    InputFormat.PDF: PdfFormatOption(
-        pipeline_options=PdfPipelineOptions(
-            ocr_options=EasyOcrOptions(), generate_picture_images=True
-        )
-    ),
-}
+@cache
+def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
+    from docling.datamodel.pipeline_options import (  # noqa: PLC0415
+        EasyOcrOptions,
+        PdfPipelineOptions,
+    )
+    from docling.document_converter import PdfFormatOption  # noqa: PLC0415
+    return {
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_options=PdfPipelineOptions(
+                ocr_options=EasyOcrOptions(), generate_picture_images=True
+            )
+        ),
+    }
 T = TypeVar("T")
@@ -73,21 +80,21 @@ class DoclingPipelineConfig(PipelineConfig):
     task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
     format_options: Annotated[
-        dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
-    ] = _DEFAULT_FORMAT_OPTS
-    _unsupported_input_formats: ClassVar[set[InputFormat]] = {
-        InputFormat.AUDIO,
-        InputFormat.METS_GBS,
-        InputFormat.VTT,
-    }
+        dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
+    ] = Field(default_factory=_default_format_opts)
     @classmethod
     @cache
     def supported_exts(cls) -> set[SupportedExt]:
+        from docling.datamodel.base_models import (  # noqa: PLC0415
+            FormatToExtensions,
+            InputFormat,
+        )
+        unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
         supported = set()
         for f in InputFormat:
-            if f in cls._unsupported_input_formats:
+            if f in unsupported:
                 continue
             for ext in FormatToExtensions[f]:
                 supported.add(SupportedExt(f".{ext.lower()}"))
@@ -96,7 +103,11 @@ class DoclingPipelineConfig(PipelineConfig):
 @Pipeline.register(PipelineType.DOCLING)
 class DoclingPipeline(Pipeline):
-    def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
+    def __init__(
+        self, format_options: dict["InputFormat", "FormatOption"] | None = None
+    ):
+        from docling.document_converter import DocumentConverter  # noqa: PLC0415
         allowed_format = [
             f.to_docling() for f in DoclingPipelineConfig.supported_exts()
         ]
@@ -117,13 +128,13 @@ class DoclingPipeline(Pipeline):
         return cls(config.format_options)
-def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
+def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
     for d in docs:
         yield d.to_docling()
 def _to_result(
-    res: ConversionResult,
+    res: "ConversionResult",
     input_document: InputDoc,
     output_format: OutputFormat,
     output_path: Path,
@@ -144,11 +155,13 @@ def _to_result(
 def _to_markdown_doc(
-    res: ConversionResult,
+    res: "ConversionResult",
     output_path: Path,
     page_sep: str = DEFAULT_MD_PAGE_SEP,
     **kwargs,
 ) -> MarkdownDoc:
+    from docling_core.types.doc import ImageRefMode  # noqa: PLC0415
     # TODO: Should we add a hash to avoid collision between files with same names
     #  nested in the tree structured
     md_dir_name = path_to_artifacts_dirname(res.input.file)

{extract_python-0.3.0 → extract_python-0.3.2}/extract_python/marker_.py RENAMED Viewed

@@ -3,14 +3,8 @@ from collections.abc import AsyncGenerator, Iterable
 from copy import deepcopy
 from functools import cache
 from pathlib import Path
-from typing import Any, ClassVar, Self
-from marker.config.parser import ConfigParser
-from marker.converters.pdf import PdfConverter
-from marker.models import create_model_dict
-from marker.output import text_from_rendered
-from marker.renderers.markdown import MarkdownRenderer
-from PIL.Image import Image
+from typing import TYPE_CHECKING, Any, ClassVar, Self
 from pydantic import Field
 from .constants import ARTIFACTS, CPU_GROUP
@@ -26,6 +20,10 @@ from .objects import (
 from .pipeline import Pipeline, PipelineConfig, PipelineType
 from .utils import path_to_artifacts_dirname, report_recoverable_errors
+if TYPE_CHECKING:
+    from marker.converters.pdf import PdfConverter
+    from PIL import Image
 @PipelineConfig.register()
 class MarkerPipelineConfig(PipelineConfig):
@@ -75,6 +73,10 @@ class MarkerPipeline(Pipeline):
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
+        from marker.config.parser import ConfigParser  # noqa: PLC0415
+        from marker.converters.pdf import PdfConverter  # noqa: PLC0415
+        from marker.models import create_model_dict  # noqa: PLC0415
         config = deepcopy(self._marker_config)
         config["output_format"] = output_format.to_marker()
         config_parser = ConfigParser(config)
@@ -96,10 +98,12 @@ class MarkerPipeline(Pipeline):
 @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
 def _process_doc(
     doc: InputDoc,
-    converter: PdfConverter,
+    converter: "PdfConverter",
     output_format: OutputFormat,
     output_path: Path,
 ) -> Result:
+    from marker.output import text_from_rendered  # noqa: PLC0415
     rendered = converter(str(doc.path))
     content, _, images = text_from_rendered(rendered)
     match output_format:
@@ -112,8 +116,10 @@ def _process_doc(
 def _to_markdown_doc(
-    input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
+    input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
 ) -> MarkdownDoc:
+    from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
     # TODO: Should we add a hash to avoid collision between files with same names
     #  nested in the tree structured
     md_dir_name = path_to_artifacts_dirname(input_doc.path)

{extract_python-0.3.0 → extract_python-0.3.2}/extract_python/miner_u.py RENAMED Viewed

@@ -8,12 +8,6 @@ from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, ClassVar, Self
-from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
-    union_make as pipeline_union_make,
-)
-from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
-from mineru.cli.common import aio_do_parse
-from mineru.utils.enum_class import MakeMode
 from pydantic import Field
 from pydantic_extra_types.language_code import LanguageAlpha2
@@ -47,33 +41,38 @@ class MinerUConfig(BaseModel):
     # TODO: use enum or literal here
     parse_method: str = "auto"
-    default_kwargs: ClassVar[dict] = {
-        "server_url": None,
-        # We don't dump md directly we process, we dump the middle json in order to be
-        # able to get page indexes
-        "parse_method": "auto",
-        "dump_md": False,
-        "dump_middle_json": True,
-        "f_draw_layout_bbox": False,
-        "f_draw_span_bbox": False,
-        "f_dump_model_output": False,  # might be useful for debug though
-        "f_dump_orig_pdf": False,
-        "f_dump_content_list": False,  # might be useful for debug though
-        "start_page_id": 0,
-        "f_make_md_mode": MakeMode.MM_MD,
-        "image_analysis": True,
-        "end_page_id": None,
-        "client_side_output_generation": False,
-    }
     def as_parse_kwargs(self) -> dict[str, Any]:
-        kwargs = copy(self.default_kwargs)
+        kwargs = copy(self._get_default_kwargs())
         kwargs["backend"] = self.backend
         kwargs["parse_method"] = self.parse_method
         kwargs["formula_enable"] = self.enable_formula_extraction
         kwargs["table_enable"] = self.enable_table_extraction
         return kwargs
+    @classmethod
+    @cache
+    def _get_default_kwargs(cls) -> dict[str, Any]:
+        from mineru.utils.enum_class import MakeMode  # noqa: PLC0415
+        return {
+            "server_url": None,
+            # We don't dump md directly we process, we dump the middle json in order
+            # to be able to get page indexes
+            "parse_method": "auto",
+            "dump_md": False,
+            "dump_middle_json": True,
+            "f_draw_layout_bbox": False,
+            "f_draw_span_bbox": False,
+            "f_dump_model_output": False,  # might be useful for debug though
+            "f_dump_orig_pdf": False,
+            "f_dump_content_list": False,  # might be useful for debug though
+            "start_page_id": 0,
+            "f_make_md_mode": MakeMode.MM_MD,
+            "image_analysis": True,
+            "end_page_id": None,
+            "client_side_output_generation": False,
+        }
 @PipelineConfig.register()  # noqa: F821
 class MinerUPipelineConfig(PipelineConfig):  # noqa: F821
@@ -104,6 +103,8 @@ class MinerUPipeline(Pipeline):
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
+        from mineru.cli.common import aio_do_parse  # noqa: PLC0415
         docs = list(docs)
         # TODO: exclude files which are not pdf and return an error
         pdfs_bytes = [d.path.read_bytes() for d in docs]
@@ -149,11 +150,20 @@ def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
 def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
     match backend:
         case MinerUBackend.PIPELINE:
-            return pipeline_union_make
+            from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (  # noqa: PLC0415
+                union_make,
+            )
+            return union_make
         case MinerUBackend.VLM:
-            return vlm_union_make
+            from mineru.backend.vlm.vlm_middle_json_mkcontent import (  # noqa: PLC0415
+                union_make,
+            )
+            return union_make
         case _:
             raise ValueError(f"Unsupported backend: {backend}")
@@ -201,8 +211,12 @@ def _dump_md_content(
     output_path: Path,
     md_path: Path,
     im_dir: Path,
-    md_make_mode: str = MakeMode.MM_MD,
+    md_make_mode: str | None = None,
 ) -> ConversionOutput:
+    from mineru.utils.enum_class import MakeMode  # noqa: PLC0415
+    if md_make_mode is None:
+        md_make_mode = MakeMode.MM_MD
     total_length = 0
     end_indices = []
     with md_path.open("w") as f: