PyPI - extract-python - Versions diffs - 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

extract-python 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

extract_python/constants.py +1 -1
extract_python/docling_.py +27 -29
extract_python/marker_.py +16 -25
extract_python/miner_u.py +5 -18
extract_python/utils.py +15 -2
{extract_python-0.6.0.dist-info → extract_python-0.7.1.dist-info}/METADATA +3 -3
extract_python-0.7.1.dist-info/RECORD +9 -0
extract_python-0.6.0.dist-info/RECORD +0 -9
{extract_python-0.6.0.dist-info → extract_python-0.7.1.dist-info}/WHEEL +0 -0

extract_python/constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
 ARTIFACTS = "artifacts"
-DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
+DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'

extract_python/docling_.py CHANGED Viewed

@@ -23,7 +23,6 @@ from extract_core import (
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -34,7 +33,7 @@ from pydantic import ConfigDict, field_serializer
 from pydantic_core.core_schema import SerializerFunctionWrapHandler
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
+from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
 logger = logging.getLogger(__name__)
@@ -115,39 +114,38 @@ def _to_markdown_doc(
         raise FileExistsError(f"directory {md_dir} already exists")
     # Let's avoid issue of duplicated input file names flattened top level
     md_filename = md_dir_name + OutputFormat.MARKDOWN
-    total_length = 0
-    n_pages = len(res.pages)
     with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
         tmp_dir = Path(td)
-        page_path = Path("page.md")
-        # We do a chdir to bypass a Docling bug which only allows to maintain relative
-        # image ref when saving the markdown to a relative path
-        with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
-            end_indices = []
-            for page_i in range(n_pages):
-                res.document.save_as_markdown(
-                    page_path,
-                    page_no=page_i + 1,
-                    image_mode=ImageRefMode.REFERENCED,
-                    artifacts_dir=Path(ARTIFACTS),
-                    **kwargs,
-                )
-                content = page_path.read_text()
-                if page_i > 0:
-                    content += "\n"
-                if page_i < n_pages - 1:
-                    content += page_sep
-                total_length += len(content)
-                end_indices.append(total_length)
-                f.write(content)
-                f.flush()
-                page_path.unlink()
+        md_path = tmp_dir / md_filename
+        current_page_path = tmp_dir / "page.md"
+        with chdir(tmp_dir):
+            # We do a chdir to bypass a Docling bug which only allows to maintain
+            # relative image ref when saving the markdown to a relative path
+            pages = _docling_pages_it(res, current_page_path, **kwargs)
+            with md_path.open("wb") as f:
+                pages = write_pages(pages, page_sep, f)
+        # Clean up the tmp page file before move everything to the end destination
+        current_page_path.unlink(missing_ok=True)
         shutil.move(tmp_dir, md_dir)
-    pages = PageIndexes.from_page_end_indices(end_indices)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)
+def _docling_pages_it(
+    res: ConversionResult, output_path: Path, **kwargs
+) -> Iterable[str]:
+    n_pages = len(res.pages)
+    for page_i in range(n_pages):
+        res.document.save_as_markdown(
+            output_path,
+            page_no=page_i + 1,
+            image_mode=ImageRefMode.REFERENCED,
+            artifacts_dir=Path(ARTIFACTS),
+            **kwargs,
+        )
+        content = output_path.read_text()
+        yield content
 class SerializableFormatOptions(DoclingFormatOption):
     # Utility class to serialize Python format options into a JSON which can be
     # correctly deserialized into a docling FormatOption

extract_python/marker_.py CHANGED Viewed

@@ -9,15 +9,14 @@ from extract_core import (
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
     Status,
 )
-from .constants import ARTIFACTS
-from .utils import path_to_artifacts_dirname, report_recoverable_errors
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
+from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
 if TYPE_CHECKING:
     from marker.converters.pdf import PdfConverter
@@ -63,7 +62,9 @@ async def _process_doc(
     content, _, images = text_from_rendered(rendered)
     match output_format:
         case OutputFormat.MARKDOWN:
-            output = _to_markdown_doc(doc, content, images, output_path)
+            output = _to_markdown_doc(
+                doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
+            )
         case _:
             raise NotImplementedError(f"unsupported output format {output_format}")
     input_doc = doc.without_content()
@@ -71,7 +72,12 @@ async def _process_doc(
 def _to_markdown_doc(
-    input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
+    input_doc: InputDoc,
+    content: str,
+    images: dict[str, "Image"],
+    output_path: Path,
+    *,
+    page_sep: str = DEFAULT_MD_PAGE_SEP,
 ) -> MarkdownDoc:
     from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
@@ -85,24 +91,9 @@ def _to_markdown_doc(
         im.save(artifacts_dir / im_name)
     del images
     gc.collect()
-    page_sep = MarkdownRenderer.page_separator
-    content = content.split(page_sep)
-    n_pages = len(content)
-    md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
-        OutputFormat.MARKDOWN.value
-    )
-    total_length = 0
-    end_indices = []
-    with md_path.open("w", encoding="utf-8") as f:
-        for page_i, page_content in enumerate(content):
-            content = page_content
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    pages = PageIndexes.from_page_end_indices(end_indices)
+    pages = content.split(MarkdownRenderer.page_separator)
+    md_path = output_path / md_dir_name / md_dir_name
+    md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)

extract_python/miner_u.py CHANGED Viewed

@@ -12,7 +12,6 @@ from extract_core import (
     MinerUBackend,
     MinerUPipelineConfig,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -20,7 +19,7 @@ from extract_core import (
 )
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import path_to_artifacts_dirname, reset_env
+from .utils import path_to_artifacts_dirname, reset_env, write_pages
 _MINER_U_CONVERSION_ERRORS = tuple()
 MDMakeFunction = Callable[[list, str, str], str | None]
@@ -148,21 +147,9 @@ def _dump_md_content(
     if md_make_mode is None:
         md_make_mode = MakeMode.MM_MD
-    total_length = 0
-    end_indices = []
-    with md_path.open("w") as f:
-        n_pages = len(pdf_info)
-        for page_i, page in enumerate(pdf_info):
-            content = md_make_fn([page], md_make_mode, str(im_dir))
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    end_indices = PageIndexes.from_page_end_indices(end_indices)
+    pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     output_path = md_path.parent.relative_to(output_path)
-    output = ConversionOutput(path=output_path, pages=end_indices)
+    output = ConversionOutput(path=output_path, pages=pages)
     return output

extract_python/utils.py CHANGED Viewed

@@ -5,9 +5,9 @@ from copy import copy
 from functools import wraps
 from itertools import tee
 from pathlib import Path, PurePath
-from typing import Protocol, TypeVar
+from typing import BinaryIO, Protocol, TypeVar
-from extract_core import Error, InputDoc, Result, Status
+from extract_core import Error, InputDoc, Pages, Result, Status
 R = TypeVar("R")
 In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
     finally:
         os.environ.clear()
         os.environ.update(old_env)
+def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
+    pages_byte_sizes = []
+    pages = iter(pages)
+    content = None
+    for p in pages:
+        if content:
+            pages_byte_sizes.append(out.write((content + page_sep).encode()))
+        content = p
+    if content:
+        pages_byte_sizes.append(out.write(content.encode()))
+    return Pages.from_pages_bytes_sizes(pages_byte_sizes)

{extract_python-0.6.0.dist-info → extract_python-0.7.1.dist-info}/METADATA RENAMED Viewed

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.6.0
+Version: 0.7.1
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python
 Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
 Author-email: Clément Doumouro <cdoumouro@icij.org>
-Requires-Python: <3.14,>=3.11
-Requires-Dist: extract-core~=0.6.0
+Requires-Python: <3.15,>=3.13
+Requires-Dist: extract-core~=0.7.0
 Requires-Dist: icij-common~=0.8.2
 Provides-Extra: benches
 Requires-Dist: html2image~=2.0.7; extra == 'benches'

extract_python-0.7.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
+extract_python/constants.py,sha256=659V40LcTWJhX3IbuJLSSvI5AsGJh9ciMrGCfzJn2zA,98
+extract_python/docling_.py,sha256=j1rVhKG7m1ef43VDsS6XGP0INPRY1Rcovzf1mjZ57tU,7352
+extract_python/marker_.py,sha256=R_SXhqk5GmEWqJrYgg3tRdXKHms7n0FueNr-aOCDvLc,3358
+extract_python/miner_u.py,sha256=MtXmnG-dFIGa3dXVrixfUU32yc88US0dhu7E3x6wQIM,5415
+extract_python/utils.py,sha256=9IWW9_VVdUPHOHhdDgkXx16R1X1FPz8-nTBNYsLCFfA,2443
+extract_python-0.7.1.dist-info/METADATA,sha256=zBt-q5GlvTXtkITwZgKRgqVWfkRhJxXPcLwOpucAEiY,1218
+extract_python-0.7.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+extract_python-0.7.1.dist-info/RECORD,,

extract_python-0.6.0.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
-extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
-extract_python/docling_.py,sha256=dRNnOEuVnGKmhtZpWXZ9PhoeCJwP-eAm3JBZrDQzeQc,7425
-extract_python/marker_.py,sha256=ZXaZ11TkILnz5ChWDQP7yunBTRWZl2TgsigTTpA86v0,3697
-extract_python/miner_u.py,sha256=YYqeOVDiYcyi31BUuGKJs77_FX1Zai9sxmhT4ELr15g,5826
-extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
-extract_python-0.6.0.dist-info/METADATA,sha256=NGBhFpPoTcIwvGyt5kjWGaIfy6NuP7fhWCZ2NkbNIP0,1218
-extract_python-0.6.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
-extract_python-0.6.0.dist-info/RECORD,,

{extract_python-0.6.0.dist-info → extract_python-0.7.1.dist-info}/WHEEL RENAMED Viewed

File without changes

extract-python 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

extract-python 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl