PyPI - extract-python - Versions diffs - 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

extract-python 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

extract_python/docling_.py +128 -53
extract_python/marker_.py +2 -4
extract_python/miner_u.py +2 -4
extract_python/pipeline.py +1 -3
{extract_python-0.3.2.dist-info → extract_python-0.4.1.dist-info}/METADATA +1 -1
extract_python-0.4.1.dist-info/RECORD +11 -0
extract_python-0.3.2.dist-info/RECORD +0 -11
{extract_python-0.3.2.dist-info → extract_python-0.4.1.dist-info}/WHEEL +0 -0

extract_python/docling_.py CHANGED Viewed

@@ -1,14 +1,42 @@
+import importlib
 import shutil
 import tempfile
 from collections.abc import AsyncGenerator, Iterable, Iterator
 from functools import cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
+from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
+# Data model import are quick it's ok to leave it there
+from docling.datamodel.base_models import FormatToExtensions, InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    PipelineOptions,
+    ThreadedPdfPipelineOptions,
+)
+from docling.document_converter import DocumentConverter, FormatOption
+from docling.pipeline.base_pipeline import BasePipeline
+# TODO: this is long to load improve it
+from docling_core.types.doc import ImageRefMode
+from docling_core.types.io import DocumentStream
+from icij_common.pydantic_utils import to_lower_snake_case
 from icij_common.registrable import FromConfig
-from pydantic import AfterValidator, Field
+from pydantic import (
+    AfterValidator,
+    BeforeValidator,
+    Field,
+    PlainSerializer,
+    WrapSerializer,
+    model_validator,
+)
+from pydantic_core.core_schema import SerializerFunctionWrapHandler
-from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
 from .objects import (
     Error,
     InputDoc,
@@ -24,73 +52,123 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
 DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
-if TYPE_CHECKING:
-    from docling.datamodel.base_models import InputFormat
-    from docling.datamodel.pipeline_options import PipelineOptions
-    from docling.document_converter import ConversionResult, FormatOption
-    from docling_core.types.io import DocumentStream
-def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
-    from docling.datamodel.pipeline_options import PdfPipelineOptions  # noqa: PLC0415
-    if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
+def _validate_pipeline_opts(v: "PipelineOptions") -> None:
+    if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
         msg = "generate_picture_images should be set to true"
         raise ValueError(msg)
+    return v
+T = TypeVar("T")
+def _find_subcls(cls: type[T], name: str) -> type[T]:
+    # Check if the class available
+    for c in all_subclasses(cls):
+        if c.__name__ == name:
+            return c
+    # Then apply ad-hoc search
+    if "pipeline" in cls.__name__.lower():
+        module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
+        try:
+            module = importlib.import_module(module_name)
+            return getattr(module, name)
+        except (ModuleNotFoundError, AttributeError):
+            pass
+    raise ValueError(f"unknown {cls.__name__} subclass {name}")
+def _find_init_arg_type(cls: type[Any], arg: str) -> type:
+    hints = get_type_hints(cls.__init__)
+    return hints[arg]
+def _resolve_pipeline_cls(v: Any) -> Any:
+    if isinstance(v, str):
+        return _find_subcls(BasePipeline, v)
+    return v
+def _ser_class_as_str(v: Any) -> Any:
+    if isinstance(v, type):
+        return v.__name__
+    return v
-def _validate_options(
-    data: dict["InputFormat", "FormatOption"],
-) -> dict["InputFormat", "FormatOption"]:
-    for opts in data.values():
-        _validate_pipeline_opts(opts.pipeline_options)
-    return data
+def _ser_with_backend_option_kind(
+    v: Any, handler: SerializerFunctionWrapHandler
+) -> Any:
+    serialized = handler(v)
+    if isinstance(v, BaseBackendOptions):
+        kind = getattr(v, "kind", None)
+        if kind is not None:
+            serialized["kind"] = kind
+    return serialized
+def _resolve_backend(v: Any) -> Any:
+    if isinstance(v, str):
+        return _find_subcls(AbstractDocumentBackend, v)
+    return v
+class DoclingFormatOption(FormatOption):
+    pipeline_cls: Annotated[
+        str | type[BasePipeline],
+        BeforeValidator(_resolve_pipeline_cls),
+        PlainSerializer(_ser_class_as_str),
+    ]
+    pipeline_options: Annotated[
+        dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
+    ] = None
+    backend: Annotated[
+        str | type[AbstractDocumentBackend],
+        BeforeValidator(_resolve_backend),
+        PlainSerializer(_ser_class_as_str),
+    ]
+    backend_options: Annotated[
+        BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
+    ] = None
+    @model_validator(mode="after")
+    def _resolve_pipeline_options(self) -> Self:
+        if isinstance(self.pipeline_options, dict):
+            option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
+            self.pipeline_options = option_cls.model_validate(self.pipeline_options)
+        return self
 @cache
-def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
-    from docling.datamodel.pipeline_options import (  # noqa: PLC0415
-        EasyOcrOptions,
-        PdfPipelineOptions,
+def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
+    from docling.backend.docling_parse_backend import (  # noqa: PLC0415
+        DoclingParseDocumentBackend,
+    )
+    from docling.pipeline.standard_pdf_pipeline import (  # noqa: PLC0415
+        StandardPdfPipeline,
     )
-    from docling.document_converter import PdfFormatOption  # noqa: PLC0415
     return {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=PdfPipelineOptions(
+        InputFormat.PDF: DoclingFormatOption(
+            pipeline_cls=StandardPdfPipeline,
+            backend=DoclingParseDocumentBackend,
+            pipeline_options=ThreadedPdfPipelineOptions(
                 ocr_options=EasyOcrOptions(), generate_picture_images=True
-            )
+            ),
         ),
     }
-T = TypeVar("T")
-def _find_subcls(cls: type[T], name: str) -> type[T]:
-    for c in all_subclasses(cls):
-        if c.__name__ == name:
-            return c
-    raise ValueError(f"unknown {cls.__name__} subclass {name}")
-@PipelineConfig.register()
 class DoclingPipelineConfig(PipelineConfig):
-    pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
-    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
+    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
-    format_options: Annotated[
-        dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
-    ] = Field(default_factory=_default_format_opts)
+    format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
+        default_factory=_default_format_opts
+    )
     @classmethod
     @cache
     def supported_exts(cls) -> set[SupportedExt]:
-        from docling.datamodel.base_models import (  # noqa: PLC0415
-            FormatToExtensions,
-            InputFormat,
-        )
         unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
         supported = set()
         for f in InputFormat:
@@ -106,7 +184,6 @@ class DoclingPipeline(Pipeline):
     def __init__(
         self, format_options: dict["InputFormat", "FormatOption"] | None = None
     ):
-        from docling.document_converter import DocumentConverter  # noqa: PLC0415
         allowed_format = [
             f.to_docling() for f in DoclingPipelineConfig.supported_exts()
@@ -134,7 +211,7 @@ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
 def _to_result(
-    res: "ConversionResult",
+    res: ConversionResult,
     input_document: InputDoc,
     output_format: OutputFormat,
     output_path: Path,
@@ -155,13 +232,11 @@ def _to_result(
 def _to_markdown_doc(
-    res: "ConversionResult",
+    res: ConversionResult,
     output_path: Path,
     page_sep: str = DEFAULT_MD_PAGE_SEP,
     **kwargs,
 ) -> MarkdownDoc:
-    from docling_core.types.doc import ImageRefMode  # noqa: PLC0415
     # TODO: Should we add a hash to avoid collision between files with same names
     #  nested in the tree structured
     md_dir_name = path_to_artifacts_dirname(res.input.file)

extract_python/marker_.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
 from pydantic import Field
-from .constants import ARTIFACTS, CPU_GROUP
+from .constants import ARTIFACTS
 from .objects import (
     InputDoc,
     MarkdownDoc,
@@ -25,10 +25,8 @@ if TYPE_CHECKING:
     from PIL import Image
-@PipelineConfig.register()
 class MarkerPipelineConfig(PipelineConfig):
-    pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
-    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
+    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
     config: dict[str, Any] = dict()

extract_python/miner_u.py CHANGED Viewed

@@ -11,7 +11,7 @@ from typing import Any, ClassVar, Self
 from pydantic import Field
 from pydantic_extra_types.language_code import LanguageAlpha2
-from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP, MINER_U_GROUP
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
 from .objects import (
     BaseModel,
     ConversionOutput,
@@ -74,10 +74,8 @@ class MinerUConfig(BaseModel):
         }
-@PipelineConfig.register()  # noqa: F821
 class MinerUPipelineConfig(PipelineConfig):  # noqa: F821
-    pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
-    task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
+    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
     config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
     language: LanguageAlpha2 = Field(frozen=True, default="en")

extract_python/pipeline.py CHANGED Viewed

@@ -24,9 +24,7 @@ class PipelineConfig(RegistrableConfig, ABC):
     model_config = merge_configs(icij_config(), no_enum_values_config())
     registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
-    pipeline: PipelineType
-    task_group: ClassVar[str] = Field(frozen=True)
+    pipeline: ClassVar[PipelineType]
     @classmethod
     @abstractmethod

{extract_python-0.3.2.dist-info → extract_python-0.4.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.3.2
+Version: 0.4.1
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python

extract_python-0.4.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
+extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
+extract_python/docling_.py,sha256=ys2vK4zgpWsPObIZWRFhHM4fNkojMYUa9QRevl8bd3c,9342
+extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
+extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
+extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
+extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
+extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
+extract_python-0.4.1.dist-info/METADATA,sha256=tjxWkMOJ4mhT6eF-HmZmJl_HJgNT2fluq2sZUPWfE7o,1132
+extract_python-0.4.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+extract_python-0.4.1.dist-info/RECORD,,

extract_python-0.3.2.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
-extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
-extract_python/docling_.py,sha256=JD5lLFSRo6KC7LMF6rH2MVNJaQAwsVwzFd_WIRQhEWQ,7112
-extract_python/marker_.py,sha256=GM1GB0gp8TkeyPGn7S5tCKkfEqcQdKjIu1CtYs2zt2g,5112
-extract_python/miner_u.py,sha256=i7JKcoKvU3G_fB_0ffsTaLdRYAPvuK6zwohgjOVIBTY,8127
-extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
-extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
-extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
-extract_python-0.3.2.dist-info/METADATA,sha256=BbUayvHGHkr9HZ-Pq1iUcxvtEq7QSZjCWTYS-iiWOWg,1132
-extract_python-0.3.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
-extract_python-0.3.2.dist-info/RECORD,,

{extract_python-0.3.2.dist-info → extract_python-0.4.1.dist-info}/WHEEL RENAMED Viewed

File without changes

extract-python 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

extract-python 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl