PyPI - extract-python - Versions diffs - 0.6.0__tar.gz → 0.7.1__tar.gz - Mend

extract-python 0.6.0__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

{extract_python-0.6.0 → extract_python-0.7.1}/PKG-INFO RENAMED Viewed

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.6.0
+Version: 0.7.1
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python
 Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
 Author-email: Clément Doumouro <cdoumouro@icij.org>
-Requires-Python: <3.14,>=3.11
-Requires-Dist: extract-core~=0.6.0
+Requires-Python: <3.15,>=3.13
+Requires-Dist: extract-core~=0.7.0
 Requires-Dist: icij-common~=0.8.2
 Provides-Extra: benches
 Requires-Dist: html2image~=2.0.7; extra == 'benches'

{extract_python-0.6.0 → extract_python-0.7.1}/benches/compare.py RENAMED Viewed

@@ -3,7 +3,7 @@ from tempfile import TemporaryDirectory
 import markdown2
 import pypdfium2
-from extract_core import BaseModel, OutputFormat, PageIndexes
+from extract_core import BaseModel, OutputFormat, Pages
 from extract_python.utils import chdir
 from html2image import Html2Image
 from PIL import Image, ImageDraw
@@ -93,7 +93,7 @@ def side_by_side_md_page_comp(
     if len(md_files) != 1:
         msg = f"unexpected number of md files ({len(md_files)}) in {compared_path}"
         raise ValueError(msg)
-    md_content = md_files[0].read_text()[page_ix[0] : page_ix[1]]
+    md_content = (md_files[0].read_bytes()[page_ix[0] : page_ix[1]]).decode()
     # change the current dir so that the browser renders images properly
     with chdir(compared_path):
         md_page_im = _render_md(md_content, compared_path, html_size=ref_im.size)
@@ -140,9 +140,9 @@ def _scan_pages(
     root: Path, comparison: ComparisonItem
 ) -> list[dict[str, tuple[int, int]]]:
     all_pages = [
-        PageIndexes.model_validate_json(
+        Pages.model_validate_json(
             (root / compared / "artifacts" / "pages.json").read_text()
-        ).root
+        )
         for compared in comparison.compared
     ]
     all_pages = zip(*all_pages, strict=True)

extract_python-0.7.1/extract_python/constants.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ ARTIFACTS = "artifacts"
2	+ DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'

{extract_python-0.6.0 → extract_python-0.7.1}/extract_python/docling_.py RENAMED Viewed

@@ -23,7 +23,6 @@ from extract_core import (
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -34,7 +33,7 @@ from pydantic import ConfigDict, field_serializer
 from pydantic_core.core_schema import SerializerFunctionWrapHandler
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
+from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
 logger = logging.getLogger(__name__)
@@ -115,39 +114,38 @@ def _to_markdown_doc(
         raise FileExistsError(f"directory {md_dir} already exists")
     # Let's avoid issue of duplicated input file names flattened top level
     md_filename = md_dir_name + OutputFormat.MARKDOWN
-    total_length = 0
-    n_pages = len(res.pages)
     with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
         tmp_dir = Path(td)
-        page_path = Path("page.md")
-        # We do a chdir to bypass a Docling bug which only allows to maintain relative
-        # image ref when saving the markdown to a relative path
-        with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
-            end_indices = []
-            for page_i in range(n_pages):
-                res.document.save_as_markdown(
-                    page_path,
-                    page_no=page_i + 1,
-                    image_mode=ImageRefMode.REFERENCED,
-                    artifacts_dir=Path(ARTIFACTS),
-                    **kwargs,
-                )
-                content = page_path.read_text()
-                if page_i > 0:
-                    content += "\n"
-                if page_i < n_pages - 1:
-                    content += page_sep
-                total_length += len(content)
-                end_indices.append(total_length)
-                f.write(content)
-                f.flush()
-                page_path.unlink()
+        md_path = tmp_dir / md_filename
+        current_page_path = tmp_dir / "page.md"
+        with chdir(tmp_dir):
+            # We do a chdir to bypass a Docling bug which only allows to maintain
+            # relative image ref when saving the markdown to a relative path
+            pages = _docling_pages_it(res, current_page_path, **kwargs)
+            with md_path.open("wb") as f:
+                pages = write_pages(pages, page_sep, f)
+        # Clean up the tmp page file before move everything to the end destination
+        current_page_path.unlink(missing_ok=True)
         shutil.move(tmp_dir, md_dir)
-    pages = PageIndexes.from_page_end_indices(end_indices)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)
+def _docling_pages_it(
+    res: ConversionResult, output_path: Path, **kwargs
+) -> Iterable[str]:
+    n_pages = len(res.pages)
+    for page_i in range(n_pages):
+        res.document.save_as_markdown(
+            output_path,
+            page_no=page_i + 1,
+            image_mode=ImageRefMode.REFERENCED,
+            artifacts_dir=Path(ARTIFACTS),
+            **kwargs,
+        )
+        content = output_path.read_text()
+        yield content
 class SerializableFormatOptions(DoclingFormatOption):
     # Utility class to serialize Python format options into a JSON which can be
     # correctly deserialized into a docling FormatOption

{extract_python-0.6.0 → extract_python-0.7.1}/extract_python/marker_.py RENAMED Viewed

@@ -9,15 +9,14 @@ from extract_core import (
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
     Status,
 )
-from .constants import ARTIFACTS
-from .utils import path_to_artifacts_dirname, report_recoverable_errors
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
+from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
 if TYPE_CHECKING:
     from marker.converters.pdf import PdfConverter
@@ -63,7 +62,9 @@ async def _process_doc(
     content, _, images = text_from_rendered(rendered)
     match output_format:
         case OutputFormat.MARKDOWN:
-            output = _to_markdown_doc(doc, content, images, output_path)
+            output = _to_markdown_doc(
+                doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
+            )
         case _:
             raise NotImplementedError(f"unsupported output format {output_format}")
     input_doc = doc.without_content()
@@ -71,7 +72,12 @@ async def _process_doc(
 def _to_markdown_doc(
-    input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
+    input_doc: InputDoc,
+    content: str,
+    images: dict[str, "Image"],
+    output_path: Path,
+    *,
+    page_sep: str = DEFAULT_MD_PAGE_SEP,
 ) -> MarkdownDoc:
     from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
@@ -85,24 +91,9 @@ def _to_markdown_doc(
         im.save(artifacts_dir / im_name)
     del images
     gc.collect()
-    page_sep = MarkdownRenderer.page_separator
-    content = content.split(page_sep)
-    n_pages = len(content)
-    md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
-        OutputFormat.MARKDOWN.value
-    )
-    total_length = 0
-    end_indices = []
-    with md_path.open("w", encoding="utf-8") as f:
-        for page_i, page_content in enumerate(content):
-            content = page_content
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    pages = PageIndexes.from_page_end_indices(end_indices)
+    pages = content.split(MarkdownRenderer.page_separator)
+    md_path = output_path / md_dir_name / md_dir_name
+    md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)

{extract_python-0.6.0 → extract_python-0.7.1}/extract_python/miner_u.py RENAMED Viewed

@@ -12,7 +12,6 @@ from extract_core import (
     MinerUBackend,
     MinerUPipelineConfig,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -20,7 +19,7 @@ from extract_core import (
 )
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import path_to_artifacts_dirname, reset_env
+from .utils import path_to_artifacts_dirname, reset_env, write_pages
 _MINER_U_CONVERSION_ERRORS = tuple()
 MDMakeFunction = Callable[[list, str, str], str | None]
@@ -148,21 +147,9 @@ def _dump_md_content(
     if md_make_mode is None:
         md_make_mode = MakeMode.MM_MD
-    total_length = 0
-    end_indices = []
-    with md_path.open("w") as f:
-        n_pages = len(pdf_info)
-        for page_i, page in enumerate(pdf_info):
-            content = md_make_fn([page], md_make_mode, str(im_dir))
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    end_indices = PageIndexes.from_page_end_indices(end_indices)
+    pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     output_path = md_path.parent.relative_to(output_path)
-    output = ConversionOutput(path=output_path, pages=end_indices)
+    output = ConversionOutput(path=output_path, pages=pages)
     return output

{extract_python-0.6.0 → extract_python-0.7.1}/extract_python/utils.py RENAMED Viewed

@@ -5,9 +5,9 @@ from copy import copy
 from functools import wraps
 from itertools import tee
 from pathlib import Path, PurePath
-from typing import Protocol, TypeVar
+from typing import BinaryIO, Protocol, TypeVar
-from extract_core import Error, InputDoc, Result, Status
+from extract_core import Error, InputDoc, Pages, Result, Status
 R = TypeVar("R")
 In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
     finally:
         os.environ.clear()
         os.environ.update(old_env)
+def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
+    pages_byte_sizes = []
+    pages = iter(pages)
+    content = None
+    for p in pages:
+        if content:
+            pages_byte_sizes.append(out.write((content + page_sep).encode()))
+        content = p
+    if content:
+        pages_byte_sizes.append(out.write(content.encode()))
+    return Pages.from_pages_bytes_sizes(pages_byte_sizes)

{extract_python-0.6.0 → extract_python-0.7.1}/pyproject.toml RENAMED Viewed

@@ -6,10 +6,10 @@ authors = [
   { name = "Clément Doumouro", email = "cdoumouro@icij.org" },
 ]
 readme = "README.md"
-requires-python = ">=3.11,<3.14"
+requires-python = ">=3.13,<3.15"
 dependencies = [
   "icij-common~=0.8.2",
-  "extract-core~=0.6.0",
+  "extract-core~=0.7.0",
 ]
 [project.optional-dependencies]

extract_python-0.6.0/extract_python/constants.py DELETED Viewed

	@@ -1,2 +0,0 @@
1	- ARTIFACTS = "artifacts"
2	- DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'

{extract_python-0.6.0 → extract_python-0.7.1}/.gitignore RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/.python-version RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/README.md RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/benches/__init__.py RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/benches/compare.ipynb RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/benches/constants.py RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/data/.gitignore RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/extract_python/__init__.py RENAMED Viewed

File without changes

{extract_python-0.6.0 → extract_python-0.7.1}/uv.lock RENAMED Viewed

File without changes

extract-python 0.6.0__tar.gz → 0.7.1__tar.gz

extract-python 0.6.0__tar.gz → 0.7.1__tar.gz