PyPI - extract-python - Versions diffs - 0.5.15__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

extract-python 0.5.15__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

extract_python/constants.py +1 -1
extract_python/docling_.py +33 -51
extract_python/marker_.py +18 -85
extract_python/miner_u.py +10 -38
extract_python/utils.py +15 -2
{extract_python-0.5.15.dist-info → extract_python-0.7.0.dist-info}/METADATA +2 -2
extract_python-0.7.0.dist-info/RECORD +9 -0
extract_python-0.5.15.dist-info/RECORD +0 -9
{extract_python-0.5.15.dist-info → extract_python-0.7.0.dist-info}/WHEEL +0 -0

extract_python/constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
 ARTIFACTS = "artifacts"
-DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
+DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'

extract_python/docling_.py CHANGED Viewed

@@ -8,7 +8,6 @@ from functools import partial
 from pathlib import Path
 from typing import Any, Self
-from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter, FormatOption
@@ -18,26 +17,23 @@ from docling_core.types.doc import ImageRefMode
 from docling_core.types.io import DocumentStream
 from extract_core import (
     BaseModel,
-    Device,
     DoclingFormatOption,
     DoclingPipelineConfig,
     Error,
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
     Status,
 )
 from icij_common.pydantic_utils import merge_configs
-from icij_common.registrable import FromConfig
 from pydantic import ConfigDict, field_serializer
 from pydantic_core.core_schema import SerializerFunctionWrapHandler
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
+from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
 logger = logging.getLogger(__name__)
@@ -46,16 +42,12 @@ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "mode
 @Pipeline.register(PipelineType.DOCLING)
 class DoclingPipeline(Pipeline):
-    def __init__(
-        self,
-        format_options: dict["InputFormat", DoclingFormatOption] | None = None,
-        *,
-        device: Device = Device.CPU,
-    ):
-        super().__init__(device)
-        format_options = dict()
-        for k, v in format_options.items():
-            format_options[k] = v.to_docling(self._device)
+    def __init__(self, config: DoclingPipelineConfig):
+        super().__init__(config)
+        format_options = {
+            k: v.to_docling(self._device)
+            for k, v in self._config.format_options.items()
+        }
         logger.info(
             "resolved format options to: %s",
             lambda: partial(json.dumps, format_options, indent=2),
@@ -81,15 +73,6 @@ class DoclingPipeline(Pipeline):
             doc = next(docs)
             yield _to_result(res, doc, output_format, output_path=output_path)
-    @classmethod
-    def _from_config(
-        cls,
-        config: DoclingPipelineConfig,
-        *,
-        device: Device = Device.CPU,
-    ) -> FromConfig:
-        return cls(config.format_options, device=device)
 def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
     for d in docs:
@@ -131,39 +114,38 @@ def _to_markdown_doc(
         raise FileExistsError(f"directory {md_dir} already exists")
     # Let's avoid issue of duplicated input file names flattened top level
     md_filename = md_dir_name + OutputFormat.MARKDOWN
-    total_length = 0
-    n_pages = len(res.pages)
     with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
         tmp_dir = Path(td)
-        page_path = Path("page.md")
-        # We do a chdir to bypass a Docling bug which only allows to maintain relative
-        # image ref when saving the markdown to a relative path
-        with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
-            end_indices = []
-            for page_i in range(n_pages):
-                res.document.save_as_markdown(
-                    page_path,
-                    page_no=page_i + 1,
-                    image_mode=ImageRefMode.REFERENCED,
-                    artifacts_dir=Path(ARTIFACTS),
-                    **kwargs,
-                )
-                content = page_path.read_text()
-                if page_i > 0:
-                    content += "\n"
-                if page_i < n_pages - 1:
-                    content += page_sep
-                total_length += len(content)
-                end_indices.append(total_length)
-                f.write(content)
-                f.flush()
-                page_path.unlink()
+        md_path = tmp_dir / md_filename
+        current_page_path = tmp_dir / "page.md"
+        with chdir(tmp_dir):
+            # We do a chdir to bypass a Docling bug which only allows to maintain
+            # relative image ref when saving the markdown to a relative path
+            pages = _docling_pages_it(res, current_page_path, **kwargs)
+            with md_path.open("wb") as f:
+                pages = write_pages(pages, page_sep, f)
+        # Clean up the tmp page file before move everything to the end destination
+        current_page_path.unlink(missing_ok=True)
         shutil.move(tmp_dir, md_dir)
-    pages = PageIndexes.from_page_end_indices(end_indices)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)
+def _docling_pages_it(
+    res: ConversionResult, output_path: Path, **kwargs
+) -> Iterable[str]:
+    n_pages = len(res.pages)
+    for page_i in range(n_pages):
+        res.document.save_as_markdown(
+            output_path,
+            page_no=page_i + 1,
+            image_mode=ImageRefMode.REFERENCED,
+            artifacts_dir=Path(ARTIFACTS),
+            **kwargs,
+        )
+        content = output_path.read_text()
+        yield content
 class SerializableFormatOptions(DoclingFormatOption):
     # Utility class to serialize Python format options into a JSON which can be
     # correctly deserialized into a docling FormatOption

extract_python/marker_.py CHANGED Viewed

@@ -2,82 +2,32 @@ import asyncio
 import gc
 from collections.abc import AsyncGenerator, Iterable
 from copy import deepcopy
-from functools import cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Self
+from typing import TYPE_CHECKING
 from extract_core import (
-    BasePipelineConfig,
-    Device,
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
     Status,
-    SupportedExt,
 )
-from pydantic import Field
-from .constants import ARTIFACTS
-from .utils import path_to_artifacts_dirname, report_recoverable_errors
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
+from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
 if TYPE_CHECKING:
     from marker.converters.pdf import PdfConverter
     from PIL import Image
-class MarkerPipelineConfig(BasePipelineConfig):
-    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
-    config: dict[str, Any] = Field(default_factory=dict)
-    @classmethod
-    @cache
-    def supported_exts(cls) -> set[SupportedExt]:
-        # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
-        return {
-            SupportedExt.PDF,
-            SupportedExt.XLS,
-            SupportedExt.XLSX,
-            SupportedExt.XLSM,
-            SupportedExt.CSV,
-            SupportedExt.ODS,
-            SupportedExt.DOC,
-            SupportedExt.DOCX,
-            SupportedExt.ODT,
-            SupportedExt.PPT,
-            SupportedExt.PPTX,
-            SupportedExt.ODP,
-            SupportedExt.HTLM,
-            SupportedExt.EPUB,
-            SupportedExt.PNG,
-            SupportedExt.JPG,
-            SupportedExt.JPEG,
-            SupportedExt.WEBP,
-            SupportedExt.GIF,
-            SupportedExt.TIFF,
-        }
 _MARKER_CONVERSION_ERRORS = tuple()
 @Pipeline.register(PipelineType.MARKER)
 class MarkerPipeline(Pipeline):
-    def __init__(
-        self,
-        marker_config: dict[str, Any] | None = None,
-        *,
-        device: Device = Device.CPU,
-    ):
-        super().__init__(device)
-        if marker_config is None:
-            marker_config = dict()
-        self._marker_config = marker_config
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
@@ -85,7 +35,7 @@ class MarkerPipeline(Pipeline):
         from marker.converters.pdf import PdfConverter  # noqa: PLC0415
         from marker.models import create_model_dict  # noqa: PLC0415
-        config = deepcopy(self._marker_config)
+        config = deepcopy(self._config.config)
         config["output_format"] = output_format.to_marker()
         config_parser = ConfigParser(config)
         renderer = config_parser.get_renderer()
@@ -98,15 +48,6 @@ class MarkerPipeline(Pipeline):
         for doc in docs:
             yield await _process_doc(doc, converter, output_format, output_path)
-    @classmethod
-    def _from_config(
-        cls,
-        config: MarkerPipelineConfig,
-        *,
-        device: Device = Device.CPU,
-    ) -> Self:
-        return cls(config.config, device=device)
 @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
 async def _process_doc(
@@ -121,7 +62,9 @@ async def _process_doc(
     content, _, images = text_from_rendered(rendered)
     match output_format:
         case OutputFormat.MARKDOWN:
-            output = _to_markdown_doc(doc, content, images, output_path)
+            output = _to_markdown_doc(
+                doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
+            )
         case _:
             raise NotImplementedError(f"unsupported output format {output_format}")
     input_doc = doc.without_content()
@@ -129,7 +72,12 @@ async def _process_doc(
 def _to_markdown_doc(
-    input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
+    input_doc: InputDoc,
+    content: str,
+    images: dict[str, "Image"],
+    output_path: Path,
+    *,
+    page_sep: str = DEFAULT_MD_PAGE_SEP,
 ) -> MarkdownDoc:
     from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
@@ -143,24 +91,9 @@ def _to_markdown_doc(
         im.save(artifacts_dir / im_name)
     del images
     gc.collect()
-    page_sep = MarkdownRenderer.page_separator
-    content = content.split(page_sep)
-    n_pages = len(content)
-    md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
-        OutputFormat.MARKDOWN.value
-    )
-    total_length = 0
-    end_indices = []
-    with md_path.open("w", encoding="utf-8") as f:
-        for page_i, page_content in enumerate(content):
-            content = page_content
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    pages = PageIndexes.from_page_end_indices(end_indices)
+    pages = content.split(MarkdownRenderer.page_separator)
+    md_path = output_path / md_dir_name / md_dir_name
+    md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)

extract_python/miner_u.py CHANGED Viewed

@@ -5,17 +5,13 @@ from collections.abc import AsyncGenerator, Callable, Iterable
 from functools import partial
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Self
 from extract_core import (
     ConversionOutput,
-    Device,
     InputDoc,
     MinerUBackend,
-    MinerUConfig,
     MinerUPipelineConfig,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -23,7 +19,7 @@ from extract_core import (
 )
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import path_to_artifacts_dirname, reset_env
+from .utils import path_to_artifacts_dirname, reset_env, write_pages
 _MINER_U_CONVERSION_ERRORS = tuple()
 MDMakeFunction = Callable[[list, str, str], str | None]
@@ -31,13 +27,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
 @Pipeline.register(PipelineType.MINER_U)
 class MinerUPipeline(Pipeline):
-    def __init__(
-        self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
-    ):
-        super().__init__(device)
-        self._config = config
-        self._language = language
-        self._md_make_fn = _parse_md_make_fn(config.backend)
+    def __init__(self, config: MinerUPipelineConfig):
+        super().__init__(config)
+        self._language = self._config.language
+        self._md_make_fn = _parse_md_make_fn(self._config.config.backend)
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
@@ -59,7 +52,7 @@ class MinerUPipeline(Pipeline):
                     pdf_file_names=pdfs_names,
                     pdf_bytes_list=pdfs_bytes,
                     p_lang_list=p_lang_list,
-                    **self._config.as_parse_kwargs(),
+                    **self._config.config.as_parse_kwargs(),
                 )
                 res_paths = [
                     _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
@@ -73,15 +66,6 @@ class MinerUPipeline(Pipeline):
                         output_path=output_path,
                     )
-    @classmethod
-    def _from_config(
-        cls,
-        config: MinerUPipelineConfig,
-        *,
-        device: Device = Device.CPU,
-    ) -> Self:
-        return cls(config.config, language=config.language, device=device)
 def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
     output_path = output_dir / pdf_filename
@@ -163,21 +147,9 @@ def _dump_md_content(
     if md_make_mode is None:
         md_make_mode = MakeMode.MM_MD
-    total_length = 0
-    end_indices = []
-    with md_path.open("w") as f:
-        n_pages = len(pdf_info)
-        for page_i, page in enumerate(pdf_info):
-            content = md_make_fn([page], md_make_mode, str(im_dir))
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    end_indices = PageIndexes.from_page_end_indices(end_indices)
+    pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     output_path = md_path.parent.relative_to(output_path)
-    output = ConversionOutput(path=output_path, pages=end_indices)
+    output = ConversionOutput(path=output_path, pages=pages)
     return output

extract_python/utils.py CHANGED Viewed

@@ -5,9 +5,9 @@ from copy import copy
 from functools import wraps
 from itertools import tee
 from pathlib import Path, PurePath
-from typing import Protocol, TypeVar
+from typing import BinaryIO, Protocol, TypeVar
-from extract_core import Error, InputDoc, Result, Status
+from extract_core import Error, InputDoc, Pages, Result, Status
 R = TypeVar("R")
 In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
     finally:
         os.environ.clear()
         os.environ.update(old_env)
+def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
+    pages_byte_sizes = []
+    pages = iter(pages)
+    content = None
+    for p in pages:
+        if content:
+            pages_byte_sizes.append(out.write((content + page_sep).encode()))
+        content = p
+    if content:
+        pages_byte_sizes.append(out.write(content.encode()))
+    return Pages.from_pages_bytes_sizes(pages_byte_sizes)

{extract_python-0.5.15.dist-info → extract_python-0.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.5.15
+Version: 0.7.0
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python
 Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
 Author-email: Clément Doumouro <cdoumouro@icij.org>
 Requires-Python: <3.14,>=3.11
-Requires-Dist: extract-core~=0.5.5
+Requires-Dist: extract-core~=0.6.0
 Requires-Dist: icij-common~=0.8.2
 Provides-Extra: benches
 Requires-Dist: html2image~=2.0.7; extra == 'benches'

extract_python-0.7.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
+extract_python/constants.py,sha256=659V40LcTWJhX3IbuJLSSvI5AsGJh9ciMrGCfzJn2zA,98
+extract_python/docling_.py,sha256=j1rVhKG7m1ef43VDsS6XGP0INPRY1Rcovzf1mjZ57tU,7352
+extract_python/marker_.py,sha256=R_SXhqk5GmEWqJrYgg3tRdXKHms7n0FueNr-aOCDvLc,3358
+extract_python/miner_u.py,sha256=MtXmnG-dFIGa3dXVrixfUU32yc88US0dhu7E3x6wQIM,5415
+extract_python/utils.py,sha256=9IWW9_VVdUPHOHhdDgkXx16R1X1FPz8-nTBNYsLCFfA,2443
+extract_python-0.7.0.dist-info/METADATA,sha256=my-lfG6yqNEat77SC6mAfFerRRmTtksQMKYwHsg8aVE,1218
+extract_python-0.7.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+extract_python-0.7.0.dist-info/RECORD,,

extract_python-0.5.15.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
-extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
-extract_python/docling_.py,sha256=1ujMmtD63RaSdR1gvWbQAm396JODj44uBWtz9M4cFyI,7864
-extract_python/marker_.py,sha256=oxN1unJ9x8YW5jds1STCc2wvQ30KzQNy3dXbCIuTuQc,5311
-extract_python/miner_u.py,sha256=Ien3H7vZXLCACVjSMP2NAiog7yvvPq7oGgLGcfLZfpA,6159
-extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
-extract_python-0.5.15.dist-info/METADATA,sha256=S3upxGMF81cp6kMaqteJJ5gMBmQ2dQe4Xcil8DGq8s0,1219
-extract_python-0.5.15.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
-extract_python-0.5.15.dist-info/RECORD,,

{extract_python-0.5.15.dist-info → extract_python-0.7.0.dist-info}/WHEEL RENAMED Viewed

File without changes

extract-python 0.5.15__py3-none-any.whl → 0.7.0__py3-none-any.whl

extract-python 0.5.15__py3-none-any.whl → 0.7.0__py3-none-any.whl