PyPI - extract-python - Versions diffs - 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

extract-python 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

extract_python/docling_.py +103 -59
extract_python/marker_.py +7 -9
extract_python/miner_u.py +11 -12
extract_python/pipeline.py +1 -3
{extract_python-0.3.1.dist-info → extract_python-0.4.0.dist-info}/METADATA +1 -1
extract_python-0.4.0.dist-info/RECORD +11 -0
extract_python-0.3.1.dist-info/RECORD +0 -11
{extract_python-0.3.1.dist-info → extract_python-0.4.0.dist-info}/WHEEL +0 -0

extract_python/docling_.py CHANGED Viewed

@@ -1,14 +1,34 @@
+import importlib
 import shutil
 import tempfile
 from collections.abc import AsyncGenerator, Iterable, Iterator
 from functools import cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
+from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.backend_options import BackendOptions
+# Data model imports are quick, so it's OK to leave them here
+from docling.datamodel.base_models import FormatToExtensions, InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    PipelineOptions,
+    ThreadedPdfPipelineOptions,
+)
+from docling.document_converter import DocumentConverter, FormatOption
+from docling.pipeline.base_pipeline import BasePipeline
+# TODO: this is slow to load; improve it
+from docling_core.types.doc import ImageRefMode
+from docling_core.types.io import DocumentStream
+from icij_common.pydantic_utils import to_lower_snake_case
 from icij_common.registrable import FromConfig
-from pydantic import AfterValidator, Field
+from pydantic import AfterValidator, BeforeValidator, Field, model_validator
-from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
 from .objects import (
     Error,
     InputDoc,
@@ -24,79 +44,104 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
 DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
-if TYPE_CHECKING:
-    from docling.datamodel.base_models import InputFormat
-    from docling.datamodel.pipeline_options import PipelineOptions
-    from docling.document_converter import ConversionResult, FormatOption
-    from docling_core.types.io import DocumentStream
-def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
-    from docling.datamodel.pipeline_options import PdfPipelineOptions
-    if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
+def _validate_pipeline_opts(v: "PipelineOptions") -> None:
+    if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
         msg = "generate_picture_images should be set to true"
         raise ValueError(msg)
-def _validate_options(
-    data: dict["InputFormat", "FormatOption"],
-) -> dict["InputFormat", "FormatOption"]:
-    for opts in data.values():
-        _validate_pipeline_opts(opts.pipeline_options)
-    return data
-@cache
-def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
-    from docling.datamodel.pipeline_options import (
-        EasyOcrOptions,
-        PdfPipelineOptions,
-    )
-    from docling.document_converter import PdfFormatOption
-    return {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=PdfPipelineOptions(
-                ocr_options=EasyOcrOptions(), generate_picture_images=True
-            )
-        ),
-    }
+    return v
 T = TypeVar("T")
 def _find_subcls(cls: type[T], name: str) -> type[T]:
+    # Check if the class is available
     for c in all_subclasses(cls):
         if c.__name__ == name:
             return c
+    # Then apply ad-hoc search
+    if "pipeline" in cls.__name__.lower():
+        module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
+        try:
+            module = importlib.import_module(module_name)
+            return getattr(module, name)
+        except (ModuleNotFoundError, AttributeError):
+            pass
     raise ValueError(f"unknown {cls.__name__} subclass {name}")
-@PipelineConfig.register()
-class DoclingPipelineConfig(PipelineConfig):
-    pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
-    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
+def _find_init_arg_type(cls: type[Any], arg: str) -> type:
+    hints = get_type_hints(cls.__init__)
+    return hints[arg].__class__
+def _resolve_pipeline_cls(v: Any) -> Any:
+    if isinstance(v, str):
+        return _find_subcls(BasePipeline, v)
+    return v
-    format_options: Annotated[
-        dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
-    ] = Field(default_factory=_default_format_opts)
+def _resolve_backend(v: Any) -> Any:
+    if isinstance(v, str):
+        return _find_subcls(AbstractDocumentBackend, v)
+    return v
-    _unsupported_input_formats: ClassVar[set[InputFormat]] = {
-        InputFormat.AUDIO,
-        InputFormat.METS_GBS,
-        InputFormat.VTT,
+class DoclingFormatOption(FormatOption):
+    pipeline_cls: Annotated[
+        str | type[BasePipeline], BeforeValidator(_resolve_pipeline_cls)
+    ]
+    pipeline_options: Annotated[
+        dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
+    ] = None
+    backend: Annotated[
+        str | type[AbstractDocumentBackend], BeforeValidator(_resolve_backend)
+    ]
+    backend_options: BackendOptions | None = None
+    @model_validator(mode="after")
+    def _resolve_pipeline_options(self) -> Self:
+        if isinstance(self.pipeline_options, dict):
+            option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
+            self.pipeline_options = option_cls.model_validate(self.pipeline_options)
+        return self
+@cache
+def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
+    from docling.backend.docling_parse_backend import (  # noqa: PLC0415
+        DoclingParseDocumentBackend,
+    )
+    from docling.pipeline.standard_pdf_pipeline import (  # noqa: PLC0415
+        StandardPdfPipeline,
+    )
+    return {
+        InputFormat.PDF: DoclingFormatOption(
+            pipeline_cls=StandardPdfPipeline,
+            backend=DoclingParseDocumentBackend,
+            pipeline_options=ThreadedPdfPipelineOptions(
+                ocr_options=EasyOcrOptions(), generate_picture_images=True
+            ),
+        ),
     }
+class DoclingPipelineConfig(PipelineConfig):
+    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
+    format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
+        default_factory=_default_format_opts
+    )
     @classmethod
     @cache
     def supported_exts(cls) -> set[SupportedExt]:
-        from docling.datamodel.base_models import FormatToExtensions, InputFormat
+        unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
         supported = set()
         for f in InputFormat:
-            if f in cls._unsupported_input_formats:
+            if f in unsupported:
                 continue
             for ext in FormatToExtensions[f]:
                 supported.add(SupportedExt(f".{ext.lower()}"))
@@ -105,8 +150,9 @@ class DoclingPipelineConfig(PipelineConfig):
 @Pipeline.register(PipelineType.DOCLING)
 class DoclingPipeline(Pipeline):
-    def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
-        from docling.document_converter import DocumentConverter
+    def __init__(
+        self, format_options: dict["InputFormat", "FormatOption"] | None = None
+    ):
         allowed_format = [
             f.to_docling() for f in DoclingPipelineConfig.supported_exts()
@@ -128,13 +174,13 @@ class DoclingPipeline(Pipeline):
         return cls(config.format_options)
-def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | "DocumentStream"]:
+def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
     for d in docs:
         yield d.to_docling()
 def _to_result(
-    res: "ConversionResult",
+    res: ConversionResult,
     input_document: InputDoc,
     output_format: OutputFormat,
     output_path: Path,
@@ -155,13 +201,11 @@ def _to_result(
 def _to_markdown_doc(
-    res: "ConversionResult",
+    res: ConversionResult,
     output_path: Path,
     page_sep: str = DEFAULT_MD_PAGE_SEP,
     **kwargs,
 ) -> MarkdownDoc:
-    from docling_core.types.doc import ImageRefMode
     # TODO: Should we add a hash to avoid collisions between files with the same
     #  name nested in the tree structure?
     md_dir_name = path_to_artifacts_dirname(res.input.file)

extract_python/marker_.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
 from pydantic import Field
-from .constants import ARTIFACTS, CPU_GROUP
+from .constants import ARTIFACTS
 from .objects import (
     InputDoc,
     MarkdownDoc,
@@ -25,10 +25,8 @@ if TYPE_CHECKING:
     from PIL import Image
-@PipelineConfig.register()
 class MarkerPipelineConfig(PipelineConfig):
-    pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
-    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
+    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
     config: dict[str, Any] = dict()
@@ -73,9 +71,9 @@ class MarkerPipeline(Pipeline):
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
-        from marker.config.parser import ConfigParser
-        from marker.converters.pdf import PdfConverter
-        from marker.models import create_model_dict
+        from marker.config.parser import ConfigParser  # noqa: PLC0415
+        from marker.converters.pdf import PdfConverter  # noqa: PLC0415
+        from marker.models import create_model_dict  # noqa: PLC0415
         config = deepcopy(self._marker_config)
         config["output_format"] = output_format.to_marker()
@@ -102,7 +100,7 @@ def _process_doc(
     output_format: OutputFormat,
     output_path: Path,
 ) -> Result:
-    from marker.output import text_from_rendered
+    from marker.output import text_from_rendered  # noqa: PLC0415
     rendered = converter(str(doc.path))
     content, _, images = text_from_rendered(rendered)
@@ -118,7 +116,7 @@ def _process_doc(
 def _to_markdown_doc(
     input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
 ) -> MarkdownDoc:
-    from marker.renderers.markdown import MarkdownRenderer
+    from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
     # TODO: Should we add a hash to avoid collisions between files with the same
     #  name nested in the tree structure?

extract_python/miner_u.py CHANGED Viewed

@@ -11,7 +11,7 @@ from typing import Any, ClassVar, Self
 from pydantic import Field
 from pydantic_extra_types.language_code import LanguageAlpha2
-from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP, MINER_U_GROUP
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
 from .objects import (
     BaseModel,
     ConversionOutput,
@@ -52,13 +52,12 @@ class MinerUConfig(BaseModel):
     @classmethod
     @cache
     def _get_default_kwargs(cls) -> dict[str, Any]:
-        from mineru.utils.enum_class import MakeMode
+        from mineru.utils.enum_class import MakeMode  # noqa: PLC0415
         return {
             "server_url": None,
-            # We don't dump md directly we process, we dump the middle json in order to be
-            # able to get page indexes
+            # We don't dump the md directly: we dump the middle json and process it
+            # ourselves in order to be able to get page indexes
             "parse_method": "auto",
             "dump_md": False,
             "dump_middle_json": True,
@@ -75,10 +74,8 @@ class MinerUConfig(BaseModel):
         }
-@PipelineConfig.register()  # noqa: F821
 class MinerUPipelineConfig(PipelineConfig):  # noqa: F821
-    pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
-    task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
+    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
     config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
     language: LanguageAlpha2 = Field(frozen=True, default="en")
@@ -104,7 +101,7 @@ class MinerUPipeline(Pipeline):
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
-        from mineru.cli.common import aio_do_parse
+        from mineru.cli.common import aio_do_parse  # noqa: PLC0415
         docs = list(docs)
         # TODO: exclude files which are not pdf and return an error
@@ -154,13 +151,15 @@ def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
     match backend:
         case MinerUBackend.PIPELINE:
-            from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
+            from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (  # noqa: PLC0415
                 union_make,
             )
             return union_make
         case MinerUBackend.VLM:
-            from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
+            from mineru.backend.vlm.vlm_middle_json_mkcontent import (  # noqa: PLC0415
+                union_make,
+            )
             return union_make
         case _:
@@ -212,7 +211,7 @@ def _dump_md_content(
     im_dir: Path,
     md_make_mode: str | None = None,
 ) -> ConversionOutput:
-    from mineru.utils.enum_class import MakeMode
+    from mineru.utils.enum_class import MakeMode  # noqa: PLC0415
     if md_make_mode is None:
         md_make_mode = MakeMode.MM_MD

extract_python/pipeline.py CHANGED Viewed

@@ -24,9 +24,7 @@ class PipelineConfig(RegistrableConfig, ABC):
     model_config = merge_configs(icij_config(), no_enum_values_config())
     registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
-    pipeline: PipelineType
-    task_group: ClassVar[str] = Field(frozen=True)
+    pipeline: ClassVar[PipelineType]
     @classmethod
     @abstractmethod

{extract_python-0.3.1.dist-info → extract_python-0.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.3.1
+Version: 0.4.0
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python

extract_python-0.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
+extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
+extract_python/docling_.py,sha256=ZGlOVrgQw50bDh4B4DiRiRQSv5rGX-EFi8Z51mnAHpY,8620
+extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
+extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
+extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
+extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
+extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
+extract_python-0.4.0.dist-info/METADATA,sha256=_cFyQr6erjdP5CxXtFI9lbyMIDJ8fVuU2LM-h1oyv7k,1132
+extract_python-0.4.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+extract_python-0.4.0.dist-info/RECORD,,

extract_python-0.3.1.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
-extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
-extract_python/docling_.py,sha256=lWWQ2PT5qOFUcJkeKw8ibF4JxzxQBgf93_CfvNcykDg,7041
-extract_python/marker_.py,sha256=ocRFxWX__A-M31z7Qr67OMcWRvgGO_C3tyZpiKc-bXw,5027
-extract_python/miner_u.py,sha256=hwRFTvtWGN_mRuv0p6H7nKS89dTErQxI1yOrvh6238M,8010
-extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
-extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
-extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
-extract_python-0.3.1.dist-info/METADATA,sha256=qtfZpwEIKgWzkfbxGYMVP-pNFMFAbLrZo1-hmDXcgvE,1132
-extract_python-0.3.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
-extract_python-0.3.1.dist-info/RECORD,,

{extract_python-0.3.1.dist-info → extract_python-0.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

extract-python 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

extract-python 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl