PyPI - extract-python - Versions diffs - 0.5.13__tar.gz → 0.7.0__tar.gz - Mend

extract-python 0.5.13__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

{extract_python-0.5.13 → extract_python-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.5.13
+Version: 0.7.0
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python
 Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
 Author-email: Clément Doumouro <cdoumouro@icij.org>
 Requires-Python: <3.14,>=3.11
-Requires-Dist: extract-core~=0.5.5
+Requires-Dist: extract-core~=0.6.0
 Requires-Dist: icij-common~=0.8.2
 Provides-Extra: benches
 Requires-Dist: html2image~=2.0.7; extra == 'benches'

{extract_python-0.5.13 → extract_python-0.7.0}/benches/compare.py RENAMED Viewed

@@ -3,7 +3,7 @@ from tempfile import TemporaryDirectory
 import markdown2
 import pypdfium2
-from extract_core import BaseModel, OutputFormat, PageIndexes
+from extract_core import BaseModel, OutputFormat, Pages
 from extract_python.utils import chdir
 from html2image import Html2Image
 from PIL import Image, ImageDraw
@@ -93,7 +93,7 @@ def side_by_side_md_page_comp(
     if len(md_files) != 1:
         msg = f"unexpected number of md files ({len(md_files)}) in {compared_path}"
         raise ValueError(msg)
-    md_content = md_files[0].read_text()[page_ix[0] : page_ix[1]]
+    md_content = (md_files[0].read_bytes()[page_ix[0] : page_ix[1]]).decode()
     # change the current dir so that the browser renders images properly
     with chdir(compared_path):
         md_page_im = _render_md(md_content, compared_path, html_size=ref_im.size)
@@ -140,9 +140,9 @@ def _scan_pages(
     root: Path, comparison: ComparisonItem
 ) -> list[dict[str, tuple[int, int]]]:
     all_pages = [
-        PageIndexes.model_validate_json(
+        Pages.model_validate_json(
             (root / compared / "artifacts" / "pages.json").read_text()
-        ).root
+        )
         for compared in comparison.compared
     ]
     all_pages = zip(*all_pages, strict=True)

extract_python-0.7.0/extract_python/constants.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ ARTIFACTS = "artifacts"
2	+ DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'

{extract_python-0.5.13 → extract_python-0.7.0}/extract_python/docling_.py RENAMED Viewed

@@ -1,11 +1,13 @@
 import asyncio
+import json
+import logging
 import shutil
 import tempfile
 from collections.abc import AsyncGenerator, Iterable, Iterator
+from functools import partial
 from pathlib import Path
 from typing import Any, Self
-from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter, FormatOption
@@ -15,42 +17,41 @@ from docling_core.types.doc import ImageRefMode
 from docling_core.types.io import DocumentStream
 from extract_core import (
     BaseModel,
-    Device,
     DoclingFormatOption,
     DoclingPipelineConfig,
     Error,
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
     Status,
 )
 from icij_common.pydantic_utils import merge_configs
-from icij_common.registrable import FromConfig
 from pydantic import ConfigDict, field_serializer
 from pydantic_core.core_schema import SerializerFunctionWrapHandler
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
+from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
+logger = logging.getLogger(__name__)
 DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
 @Pipeline.register(PipelineType.DOCLING)
 class DoclingPipeline(Pipeline):
-    def __init__(
-        self,
-        format_options: dict["InputFormat", DoclingFormatOption] | None = None,
-        *,
-        device: Device = Device.CPU,
-    ):
-        super().__init__(device)
+    def __init__(self, config: DoclingPipelineConfig):
+        super().__init__(config)
         format_options = {
-            k: v.to_docling(self._device) for k, v in format_options.items()
+            k: v.to_docling(self._device)
+            for k, v in self._config.format_options.items()
         }
+        logger.info(
+            "resolved format options to: %s",
+            lambda: partial(json.dumps, format_options, indent=2),
+        )
         allowed_format = [
             f.to_docling() for f in DoclingPipelineConfig.supported_exts()
         ]
@@ -72,15 +73,6 @@ class DoclingPipeline(Pipeline):
             doc = next(docs)
             yield _to_result(res, doc, output_format, output_path=output_path)
-    @classmethod
-    def _from_config(
-        cls,
-        config: DoclingPipelineConfig,
-        *,
-        device: Device = Device.CPU,
-    ) -> FromConfig:
-        return cls(config.format_options, device=device)
 def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
     for d in docs:
@@ -122,39 +114,38 @@ def _to_markdown_doc(
         raise FileExistsError(f"directory {md_dir} already exists")
     # Let's avoid issue of duplicated input file names flattened top level
     md_filename = md_dir_name + OutputFormat.MARKDOWN
-    total_length = 0
-    n_pages = len(res.pages)
     with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
         tmp_dir = Path(td)
-        page_path = Path("page.md")
-        # We do a chdir to bypass a Docling bug which only allows to maintain relative
-        # image ref when saving the markdown to a relative path
-        with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
-            end_indices = []
-            for page_i in range(n_pages):
-                res.document.save_as_markdown(
-                    page_path,
-                    page_no=page_i + 1,
-                    image_mode=ImageRefMode.REFERENCED,
-                    artifacts_dir=Path(ARTIFACTS),
-                    **kwargs,
-                )
-                content = page_path.read_text()
-                if page_i > 0:
-                    content += "\n"
-                if page_i < n_pages - 1:
-                    content += page_sep
-                total_length += len(content)
-                end_indices.append(total_length)
-                f.write(content)
-                f.flush()
-                page_path.unlink()
+        md_path = tmp_dir / md_filename
+        current_page_path = tmp_dir / "page.md"
+        with chdir(tmp_dir):
+            # We do a chdir to bypass a Docling bug which only allows to maintain
+            # relative image ref when saving the markdown to a relative path
+            pages = _docling_pages_it(res, current_page_path, **kwargs)
+            with md_path.open("wb") as f:
+                pages = write_pages(pages, page_sep, f)
+        # Clean up the tmp page file before move everything to the end destination
+        current_page_path.unlink(missing_ok=True)
         shutil.move(tmp_dir, md_dir)
-    pages = PageIndexes.from_page_end_indices(end_indices)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)
+def _docling_pages_it(
+    res: ConversionResult, output_path: Path, **kwargs
+) -> Iterable[str]:
+    n_pages = len(res.pages)
+    for page_i in range(n_pages):
+        res.document.save_as_markdown(
+            output_path,
+            page_no=page_i + 1,
+            image_mode=ImageRefMode.REFERENCED,
+            artifacts_dir=Path(ARTIFACTS),
+            **kwargs,
+        )
+        content = output_path.read_text()
+        yield content
 class SerializableFormatOptions(DoclingFormatOption):
     # Utility class to serialize Python format options into a JSON which can be
     # correctly deserialized into a docling FormatOption

{extract_python-0.5.13 → extract_python-0.7.0}/extract_python/marker_.py RENAMED Viewed

@@ -2,82 +2,32 @@ import asyncio
 import gc
 from collections.abc import AsyncGenerator, Iterable
 from copy import deepcopy
-from functools import cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Self
+from typing import TYPE_CHECKING
 from extract_core import (
-    BasePipelineConfig,
-    Device,
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
     Status,
-    SupportedExt,
 )
-from pydantic import Field
-from .constants import ARTIFACTS
-from .utils import path_to_artifacts_dirname, report_recoverable_errors
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
+from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
 if TYPE_CHECKING:
     from marker.converters.pdf import PdfConverter
     from PIL import Image
-class MarkerPipelineConfig(BasePipelineConfig):
-    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
-    config: dict[str, Any] = Field(default_factory=dict)
-    @classmethod
-    @cache
-    def supported_exts(cls) -> set[SupportedExt]:
-        # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
-        return {
-            SupportedExt.PDF,
-            SupportedExt.XLS,
-            SupportedExt.XLSX,
-            SupportedExt.XLSM,
-            SupportedExt.CSV,
-            SupportedExt.ODS,
-            SupportedExt.DOC,
-            SupportedExt.DOCX,
-            SupportedExt.ODT,
-            SupportedExt.PPT,
-            SupportedExt.PPTX,
-            SupportedExt.ODP,
-            SupportedExt.HTLM,
-            SupportedExt.EPUB,
-            SupportedExt.PNG,
-            SupportedExt.JPG,
-            SupportedExt.JPEG,
-            SupportedExt.WEBP,
-            SupportedExt.GIF,
-            SupportedExt.TIFF,
-        }
 _MARKER_CONVERSION_ERRORS = tuple()
 @Pipeline.register(PipelineType.MARKER)
 class MarkerPipeline(Pipeline):
-    def __init__(
-        self,
-        marker_config: dict[str, Any] | None = None,
-        *,
-        device: Device = Device.CPU,
-    ):
-        super().__init__(device)
-        if marker_config is None:
-            marker_config = dict()
-        self._marker_config = marker_config
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
     ) -> AsyncGenerator[Result, None]:
@@ -85,7 +35,7 @@ class MarkerPipeline(Pipeline):
         from marker.converters.pdf import PdfConverter  # noqa: PLC0415
         from marker.models import create_model_dict  # noqa: PLC0415
-        config = deepcopy(self._marker_config)
+        config = deepcopy(self._config.config)
         config["output_format"] = output_format.to_marker()
         config_parser = ConfigParser(config)
         renderer = config_parser.get_renderer()
@@ -98,15 +48,6 @@ class MarkerPipeline(Pipeline):
         for doc in docs:
             yield await _process_doc(doc, converter, output_format, output_path)
-    @classmethod
-    def _from_config(
-        cls,
-        config: MarkerPipelineConfig,
-        *,
-        device: Device = Device.CPU,
-    ) -> Self:
-        return cls(config.config, device=device)
 @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
 async def _process_doc(
@@ -121,7 +62,9 @@ async def _process_doc(
     content, _, images = text_from_rendered(rendered)
     match output_format:
         case OutputFormat.MARKDOWN:
-            output = _to_markdown_doc(doc, content, images, output_path)
+            output = _to_markdown_doc(
+                doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
+            )
         case _:
             raise NotImplementedError(f"unsupported output format {output_format}")
     input_doc = doc.without_content()
@@ -129,7 +72,12 @@ async def _process_doc(
 def _to_markdown_doc(
-    input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
+    input_doc: InputDoc,
+    content: str,
+    images: dict[str, "Image"],
+    output_path: Path,
+    *,
+    page_sep: str = DEFAULT_MD_PAGE_SEP,
 ) -> MarkdownDoc:
     from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
@@ -143,24 +91,9 @@ def _to_markdown_doc(
         im.save(artifacts_dir / im_name)
     del images
     gc.collect()
-    page_sep = MarkdownRenderer.page_separator
-    content = content.split(page_sep)
-    n_pages = len(content)
-    md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
-        OutputFormat.MARKDOWN.value
-    )
-    total_length = 0
-    end_indices = []
-    with md_path.open("w", encoding="utf-8") as f:
-        for page_i, page_content in enumerate(content):
-            content = page_content
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    pages = PageIndexes.from_page_end_indices(end_indices)
+    pages = content.split(MarkdownRenderer.page_separator)
+    md_path = output_path / md_dir_name / md_dir_name
+    md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)

{extract_python-0.5.13 → extract_python-0.7.0}/extract_python/miner_u.py RENAMED Viewed

@@ -5,17 +5,13 @@ from collections.abc import AsyncGenerator, Callable, Iterable
 from functools import partial
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Self
 from extract_core import (
     ConversionOutput,
-    Device,
     InputDoc,
     MinerUBackend,
-    MinerUConfig,
     MinerUPipelineConfig,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -23,7 +19,7 @@ from extract_core import (
 )
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import path_to_artifacts_dirname, reset_env
+from .utils import path_to_artifacts_dirname, reset_env, write_pages
 _MINER_U_CONVERSION_ERRORS = tuple()
 MDMakeFunction = Callable[[list, str, str], str | None]
@@ -31,13 +27,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
 @Pipeline.register(PipelineType.MINER_U)
 class MinerUPipeline(Pipeline):
-    def __init__(
-        self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
-    ):
-        super().__init__(device)
-        self._config = config
-        self._language = language
-        self._md_make_fn = _parse_md_make_fn(config.backend)
+    def __init__(self, config: MinerUPipelineConfig):
+        super().__init__(config)
+        self._language = self._config.language
+        self._md_make_fn = _parse_md_make_fn(self._config.config.backend)
     async def extract_content(
         self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
@@ -59,7 +52,7 @@ class MinerUPipeline(Pipeline):
                     pdf_file_names=pdfs_names,
                     pdf_bytes_list=pdfs_bytes,
                     p_lang_list=p_lang_list,
-                    **self._config.as_parse_kwargs(),
+                    **self._config.config.as_parse_kwargs(),
                 )
                 res_paths = [
                     _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
@@ -73,15 +66,6 @@ class MinerUPipeline(Pipeline):
                         output_path=output_path,
                     )
-    @classmethod
-    def _from_config(
-        cls,
-        config: MinerUPipelineConfig,
-        *,
-        device: Device = Device.CPU,
-    ) -> Self:
-        return cls(config.config, language=config.language, device=device)
 def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
     output_path = output_dir / pdf_filename
@@ -163,21 +147,9 @@ def _dump_md_content(
     if md_make_mode is None:
         md_make_mode = MakeMode.MM_MD
-    total_length = 0
-    end_indices = []
-    with md_path.open("w") as f:
-        n_pages = len(pdf_info)
-        for page_i, page in enumerate(pdf_info):
-            content = md_make_fn([page], md_make_mode, str(im_dir))
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    end_indices = PageIndexes.from_page_end_indices(end_indices)
+    pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     output_path = md_path.parent.relative_to(output_path)
-    output = ConversionOutput(path=output_path, pages=end_indices)
+    output = ConversionOutput(path=output_path, pages=pages)
     return output

{extract_python-0.5.13 → extract_python-0.7.0}/extract_python/utils.py RENAMED Viewed

@@ -5,9 +5,9 @@ from copy import copy
 from functools import wraps
 from itertools import tee
 from pathlib import Path, PurePath
-from typing import Protocol, TypeVar
+from typing import BinaryIO, Protocol, TypeVar
-from extract_core import Error, InputDoc, Result, Status
+from extract_core import Error, InputDoc, Pages, Result, Status
 R = TypeVar("R")
 In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
     finally:
         os.environ.clear()
         os.environ.update(old_env)
+def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
+    pages_byte_sizes = []
+    pages = iter(pages)
+    content = None
+    for p in pages:
+        if content:
+            pages_byte_sizes.append(out.write((content + page_sep).encode()))
+        content = p
+    if content:
+        pages_byte_sizes.append(out.write(content.encode()))
+    return Pages.from_pages_bytes_sizes(pages_byte_sizes)

{extract_python-0.5.13 → extract_python-0.7.0}/pyproject.toml RENAMED Viewed

@@ -9,7 +9,7 @@ readme = "README.md"
 requires-python = ">=3.11,<3.14"
 dependencies = [
   "icij-common~=0.8.2",
-  "extract-core~=0.5.5",
+  "extract-core~=0.6.0",
 ]
 [project.optional-dependencies]
@@ -51,14 +51,28 @@ override-dependencies = [
   "pillow==11.3.0",
 ]
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
 [tool.uv.sources]
 extract-core = { path = "../extract-core", editable = true }
+torch = [
+  { index = "pytorch-cpu" },
+]
+torchvision = [
+  { index = "pytorch-cpu" },
+]
 [dependency-groups]
 dev = [
   "pytest~=8.3.5",
   "pytest-asyncio~=0.25.3",
   "ruff==0.15.2",
+  "torch==2.12.0",
+  "torchvision==0.27.0",
 ]
 [project.urls]

extract-python 0.5.13__tar.gz → 0.7.0__tar.gz

extract-python 0.5.13__tar.gz → 0.7.0__tar.gz