extract-python 0.4.1__tar.gz → 0.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {extract_python-0.4.1 → extract_python-0.5.4}/PKG-INFO +3 -1
  2. {extract_python-0.4.1 → extract_python-0.5.4}/benches/compare.ipynb +2 -2
  3. {extract_python-0.4.1 → extract_python-0.5.4}/benches/compare.py +7 -5
  4. extract_python-0.5.4/extract_python/__init__.py +23 -0
  5. extract_python-0.5.4/extract_python/constants.py +2 -0
  6. extract_python-0.5.4/extract_python/docling_.py +130 -0
  7. {extract_python-0.4.1 → extract_python-0.5.4}/extract_python/marker_.py +7 -7
  8. {extract_python-0.4.1 → extract_python-0.5.4}/extract_python/miner_u.py +10 -74
  9. {extract_python-0.4.1 → extract_python-0.5.4}/extract_python/utils.py +4 -10
  10. {extract_python-0.4.1 → extract_python-0.5.4}/pyproject.toml +9 -1
  11. {extract_python-0.4.1 → extract_python-0.5.4}/uv.lock +169 -311
  12. extract_python-0.4.1/.dockerignore +0 -6
  13. extract_python-0.4.1/.github/workflows/publish.yml +0 -45
  14. extract_python-0.4.1/.github/workflows/tests.yml +0 -79
  15. extract_python-0.4.1/Dockerfile +0 -76
  16. extract_python-0.4.1/docker-compose.yml +0 -107
  17. extract_python-0.4.1/extract +0 -42
  18. extract_python-0.4.1/extract_python/__init__.py +0 -41
  19. extract_python-0.4.1/extract_python/constants.py +0 -6
  20. extract_python-0.4.1/extract_python/docling_.py +0 -278
  21. extract_python-0.4.1/extract_python/objects.py +0 -322
  22. extract_python-0.4.1/extract_python/pipeline.py +0 -38
  23. extract_python-0.4.1/qa/ruff.toml +0 -58
  24. {extract_python-0.4.1 → extract_python-0.5.4}/.gitignore +0 -0
  25. {extract_python-0.4.1 → extract_python-0.5.4}/.python-version +0 -0
  26. {extract_python-0.4.1 → extract_python-0.5.4}/README.md +0 -0
  27. {extract_python-0.4.1 → extract_python-0.5.4}/benches/__init__.py +0 -0
  28. {extract_python-0.4.1 → extract_python-0.5.4}/benches/constants.py +0 -0
  29. {extract_python-0.4.1 → extract_python-0.5.4}/data/.gitignore +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.4.1
3
+ Version: 0.5.4
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
9
  Requires-Python: <3.14,>=3.11
10
+ Requires-Dist: extract-core~=0.1
10
11
  Requires-Dist: icij-common~=0.8.2
11
12
  Provides-Extra: benches
12
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -21,4 +22,5 @@ Provides-Extra: mineru
21
22
  Requires-Dist: mineru[mlx]~=3.2; (sys_platform == 'darwin') and extra == 'mineru'
22
23
  Requires-Dist: mineru[pipeline,vlm]~=3.2; extra == 'mineru'
23
24
  Requires-Dist: pydantic-extra-types[pycountry]~=2.11; extra == 'mineru'
25
+ Requires-Dist: python-pptx~=1.0; extra == 'mineru'
24
26
  Requires-Dist: six~=1.17; extra == 'mineru'
@@ -13,8 +13,8 @@
13
13
  "from extract_python.benches.compare import (\n",
14
14
  " compare,\n",
15
15
  ")\n",
16
- "from extract_python.pipelines import DoclingPipeline, MarkerPipeline\n",
17
- "from extract_python.objects import InputDoc, OutputFormat"
16
+ "from extract_python.objects import InputDoc, OutputFormat\n",
17
+ "from extract_python.pipelines import DoclingPipeline, MarkerPipeline"
18
18
  ]
19
19
  },
20
20
  {
@@ -3,12 +3,11 @@ from tempfile import TemporaryDirectory
3
3
 
4
4
  import markdown2
5
5
  import pypdfium2
6
- from extract_python.pipelines.utils import chdir
6
+ from extract_core import BaseModel, OutputFormat, PageIndexes
7
+ from extract_python.utils import chdir
7
8
  from html2image import Html2Image
8
9
  from PIL import Image, ImageDraw
9
10
 
10
- from extract_python.objects import BaseModel, OutputFormat, PageIndexes
11
-
12
11
  _WHITE_BACKGROUND_CSS = "body {background: white;}"
13
12
 
14
13
 
@@ -146,7 +145,10 @@ def _scan_pages(
146
145
  ).root
147
146
  for compared in comparison.compared
148
147
  ]
149
- all_pages = zip(*all_pages)
148
+ all_pages = zip(*all_pages, strict=True)
150
149
  compared_names = (p.parent.name for p in comparison.compared)
151
- pages = [dict(zip(compared_names, page_comp_ixs)) for page_comp_ixs in all_pages]
150
+ pages = [
151
+ dict(zip(compared_names, page_comp_ixs, strict=True))
152
+ for page_comp_ixs in all_pages
153
+ ]
152
154
  return pages
@@ -0,0 +1,23 @@
1
+ try:
2
+ from .docling_ import DOCLING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline
3
+ except ImportError:
4
+ DOCLING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline = None, None
5
+
6
+ try:
7
+ from .marker_ import MarkerPipeline
8
+ except ImportError:
9
+ MarkerPipeline = None
10
+
11
+
12
+ try:
13
+ from .miner_u import MinerUPipeline
14
+ except ImportError:
15
+ MinerUPipeline = None
16
+
17
+
18
+ __all__ = [
19
+ "DoclingPipeline",
20
+ "DOCLING_DEFAULT_ARTIFACTS_PATH",
21
+ "MarkerPipeline",
22
+ "MinerUPipeline",
23
+ ]
@@ -0,0 +1,2 @@
1
+ ARTIFACTS = "artifacts"
2
+ DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
@@ -0,0 +1,130 @@
1
+ import shutil
2
+ import tempfile
3
+ from collections.abc import AsyncGenerator, Iterable, Iterator
4
+ from pathlib import Path
5
+
6
+ from docling.datamodel.base_models import InputFormat
7
+ from docling.datamodel.document import ConversionResult
8
+ from docling.document_converter import DocumentConverter
9
+
10
+ # TODO: this is slow to load, improve it
11
+ from docling_core.types.doc import ImageRefMode
12
+ from docling_core.types.io import DocumentStream
13
+ from extract_core import (
14
+ DoclingFormatOption,
15
+ DoclingPipelineConfig,
16
+ Error,
17
+ InputDoc,
18
+ MarkdownDoc,
19
+ OutputFormat,
20
+ PageIndexes,
21
+ Pipeline,
22
+ PipelineType,
23
+ Result,
24
+ Status,
25
+ )
26
+ from icij_common.registrable import FromConfig
27
+
28
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
29
+ from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
30
+
31
+ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
32
+
33
+
34
+ @Pipeline.register(PipelineType.DOCLING)
35
+ class DoclingPipeline(Pipeline):
36
+ def __init__(
37
+ self, format_options: dict["InputFormat", DoclingFormatOption] | None = None
38
+ ):
39
+ format_options = {k: v.to_docling() for k, v in format_options.items()}
40
+ allowed_format = [
41
+ f.to_docling() for f in DoclingPipelineConfig.supported_exts()
42
+ ]
43
+ self._converter = DocumentConverter(
44
+ allowed_formats=allowed_format, format_options=format_options
45
+ )
46
+
47
+ async def extract_content(
48
+ self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
49
+ ) -> AsyncGenerator[Result, None]:
50
+ docs, path_or_streams = map_and_preserve(_to_docling, docs)
51
+ outputs = self._converter.convert_all(path_or_streams, raises_on_error=False)
52
+ for doc, res in zip(docs, outputs, strict=True):
53
+ yield _to_result(res, doc, output_format, output_path=output_path)
54
+
55
+ @classmethod
56
+ def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
57
+ return cls(config.format_options)
58
+
59
+
60
+ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
61
+ for d in docs:
62
+ yield d.to_docling()
63
+
64
+
65
+ def _to_result(
66
+ res: ConversionResult,
67
+ input_document: InputDoc,
68
+ output_format: OutputFormat,
69
+ output_path: Path,
70
+ **kwargs,
71
+ ) -> Result:
72
+ output_path.mkdir(parents=True, exist_ok=True)
73
+ output = None
74
+ status = Status.from_docling(res.status)
75
+ if status.allows_conversion:
76
+ match output_format:
77
+ case OutputFormat.MARKDOWN:
78
+ output = _to_markdown_doc(res, output_path, **kwargs)
79
+ case _:
80
+ raise NotImplementedError(f"unsupported output format {output_format}")
81
+ errors = [Error.from_docling(e) for e in res.errors]
82
+ input_doc = input_document.without_content()
83
+ return Result(input=input_doc, status=status, errors=errors, output=output)
84
+
85
+
86
+ def _to_markdown_doc(
87
+ res: ConversionResult,
88
+ output_path: Path,
89
+ page_sep: str = DEFAULT_MD_PAGE_SEP,
90
+ **kwargs,
91
+ ) -> MarkdownDoc:
92
+ # TODO: should we add a hash to avoid collisions between files with the same
93
+ # name nested at different places in the tree structure?
94
+ md_dir_name = path_to_artifacts_dirname(res.input.file)
95
+ md_dir = output_path / md_dir_name
96
+ if md_dir.exists():
97
+ raise FileExistsError(f"directory {md_dir} already exists")
98
+ # Avoid issues with duplicated input file names flattened at the top level
99
+ md_filename = md_dir_name + OutputFormat.MARKDOWN
100
+ total_length = 0
101
+ n_pages = len(res.pages)
102
+
103
+ with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
104
+ tmp_dir = Path(td)
105
+ page_path = Path("page.md")
106
+ # We chdir to work around a Docling bug: relative image refs are only preserved
107
+ # when the markdown is saved to a relative path
108
+ with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
109
+ end_indices = []
110
+ for page_i in range(n_pages):
111
+ res.document.save_as_markdown(
112
+ page_path,
113
+ page_no=page_i + 1,
114
+ image_mode=ImageRefMode.REFERENCED,
115
+ artifacts_dir=Path(ARTIFACTS),
116
+ **kwargs,
117
+ )
118
+ content = page_path.read_text()
119
+ if page_i > 0:
120
+ content += "\n"
121
+ if page_i < n_pages - 1:
122
+ content += page_sep
123
+ total_length += len(content)
124
+ end_indices.append(total_length)
125
+ f.write(content)
126
+ f.flush()
127
+ page_path.unlink()
128
+ shutil.move(tmp_dir, md_dir)
129
+ pages = PageIndexes.from_page_end_indices(end_indices)
130
+ return MarkdownDoc(path=Path(md_dir_name), pages=pages)
@@ -5,10 +5,8 @@ from functools import cache
5
5
  from pathlib import Path
6
6
  from typing import TYPE_CHECKING, Any, ClassVar, Self
7
7
 
8
- from pydantic import Field
9
-
10
- from .constants import ARTIFACTS
11
- from .objects import (
8
+ from extract_core import BasePipelineConfig, Pipeline, PipelineType
9
+ from extract_core.objects import (
12
10
  InputDoc,
13
11
  MarkdownDoc,
14
12
  OutputFormat,
@@ -17,7 +15,9 @@ from .objects import (
17
15
  Status,
18
16
  SupportedExt,
19
17
  )
20
- from .pipeline import Pipeline, PipelineConfig, PipelineType
18
+ from pydantic import Field
19
+
20
+ from .constants import ARTIFACTS
21
21
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
22
22
 
23
23
  if TYPE_CHECKING:
@@ -25,10 +25,10 @@ if TYPE_CHECKING:
25
25
  from PIL import Image
26
26
 
27
27
 
28
- class MarkerPipelineConfig(PipelineConfig):
28
+ class MarkerPipelineConfig(BasePipelineConfig):
29
29
  pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
30
30
 
31
- config: dict[str, Any] = dict()
31
+ config: dict[str, Any] = Field(default_factory=dict)
32
32
 
33
33
  @classmethod
34
34
  @cache
@@ -1,96 +1,32 @@
1
1
  import json
2
2
  import shutil
3
3
  from collections.abc import AsyncGenerator, Callable, Iterable
4
- from copy import copy
5
- from enum import StrEnum
6
- from functools import cache, partial
4
+ from functools import partial
7
5
  from pathlib import Path
8
6
  from tempfile import TemporaryDirectory
9
- from typing import Any, ClassVar, Self
7
+ from typing import Self
10
8
 
11
- from pydantic import Field
12
- from pydantic_extra_types.language_code import LanguageAlpha2
13
-
14
- from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
15
- from .objects import (
16
- BaseModel,
9
+ from extract_core import (
17
10
  ConversionOutput,
18
11
  InputDoc,
12
+ MinerUBackend,
13
+ MinerUConfig,
14
+ MinerUPipelineConfig,
19
15
  OutputFormat,
20
16
  PageIndexes,
17
+ Pipeline,
18
+ PipelineType,
21
19
  Result,
22
20
  Status,
23
- SupportedExt,
24
21
  )
25
- from .pipeline import Pipeline, PipelineConfig, PipelineType
22
+
23
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
26
24
  from .utils import path_to_artifacts_dirname
27
25
 
28
26
  _MINER_U_CONVERSION_ERRORS = tuple()
29
27
  MDMakeFunction = Callable[[list, str, str], str | None]
30
28
 
31
29
 
32
- class MinerUBackend(StrEnum):
33
- PIPELINE = "pipeline"
34
- VLM = "vlm"
35
-
36
-
37
- class MinerUConfig(BaseModel):
38
- backend: MinerUBackend = MinerUBackend.PIPELINE
39
- enable_formula_extraction: bool = True
40
- enable_table_extraction: bool = True
41
- # TODO: use enum or literal here
42
- parse_method: str = "auto"
43
-
44
- def as_parse_kwargs(self) -> dict[str, Any]:
45
- kwargs = copy(self._get_default_kwargs())
46
- kwargs["backend"] = self.backend
47
- kwargs["parse_method"] = self.parse_method
48
- kwargs["formula_enable"] = self.enable_formula_extraction
49
- kwargs["table_enable"] = self.enable_table_extraction
50
- return kwargs
51
-
52
- @classmethod
53
- @cache
54
- def _get_default_kwargs(cls) -> dict[str, Any]:
55
- from mineru.utils.enum_class import MakeMode # noqa: PLC0415
56
-
57
- return {
58
- "server_url": None,
59
- # We don't dump md directly we process, we dump the middle json in order
60
- # to be able to get page indexes
61
- "parse_method": "auto",
62
- "dump_md": False,
63
- "dump_middle_json": True,
64
- "f_draw_layout_bbox": False,
65
- "f_draw_span_bbox": False,
66
- "f_dump_model_output": False, # might be useful for debug though
67
- "f_dump_orig_pdf": False,
68
- "f_dump_content_list": False, # might be useful for debug though
69
- "start_page_id": 0,
70
- "f_make_md_mode": MakeMode.MM_MD,
71
- "image_analysis": True,
72
- "end_page_id": None,
73
- "client_side_output_generation": False,
74
- }
75
-
76
-
77
- class MinerUPipelineConfig(PipelineConfig): # noqa: F821
78
- pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
79
-
80
- config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
81
- language: LanguageAlpha2 = Field(frozen=True, default="en")
82
-
83
- @classmethod
84
- @cache
85
- def supported_exts(cls) -> set[SupportedExt]:
86
- return {
87
- SupportedExt.PDF,
88
- SupportedExt.DOCX,
89
- SupportedExt.PPTX,
90
- SupportedExt.XLSX,
91
- }
92
-
93
-
94
30
  @Pipeline.register(PipelineType.MINER_U)
95
31
  class MinerUPipeline(Pipeline):
96
32
  def __init__(self, config: MinerUConfig, language: str):
@@ -6,26 +6,20 @@ from itertools import tee
6
6
  from pathlib import Path, PurePath
7
7
  from typing import Protocol, TypeVar
8
8
 
9
- from .objects import Error, InputDoc, Result, Status
9
+ from extract_core import Error, InputDoc, Result, Status
10
10
 
11
11
  R = TypeVar("R")
12
- T = TypeVar("T")
12
+ In = TypeVar("In")
13
13
 
14
14
 
15
15
  def map_and_preserve(
16
- fn: Callable[[Iterable[T]], Iterator[R]], inputs: Iterable[T]
17
- ) -> tuple[Iterable[T], Iterator[R]]:
16
+ fn: Callable[[Iterable[In]], Iterator[R]], inputs: Iterable[In]
17
+ ) -> tuple[Iterable[In], Iterator[R]]:
18
18
  save_inputs, function_inputs = tee(inputs)
19
19
  outputs = iter(fn(function_inputs))
20
20
  return save_inputs, outputs
21
21
 
22
22
 
23
- def all_subclasses(cls: type[T]) -> set[type[T]]:
24
- return set(cls.__subclasses__()).union(
25
- [s for c in cls.__subclasses__() for s in all_subclasses(c)]
26
- )
27
-
28
-
29
23
  def path_to_artifacts_dirname(path: PurePath, sep: str = "_") -> str:
30
24
  dirname = f"{path.name[: -len(path.suffix)]}"
31
25
  ext = path.suffix
@@ -9,6 +9,7 @@ readme = "README.md"
9
9
  requires-python = ">=3.11,<3.14"
10
10
  dependencies = [
11
11
  "icij-common~=0.8.2",
12
+ "extract-core~=0.1",
12
13
  ]
13
14
 
14
15
  [project.optional-dependencies]
@@ -31,6 +32,7 @@ mineru = [
31
32
  "mineru[pipeline,vlm]~=3.2",
32
33
  "mineru[mlx]~=3.2; sys_platform == 'darwin'",
33
34
  "pydantic-extra-types[pycountry]~=2.11",
35
+ "python-pptx~=1.0",
34
36
  "six~=1.17",
35
37
  ]
36
38
 
@@ -45,7 +47,12 @@ required-environments = [
45
47
  "sys_platform == 'darwin' and platform_machine == 'arm64'",
46
48
  "sys_platform == 'linux'",
47
49
  ]
50
+ override-dependencies = [
51
+ "pillow==11.3.0",
52
+ ]
48
53
 
54
+ [tool.uv.sources]
55
+ extract-core = { path = "../extract-core", editable = true }
49
56
 
50
57
  [dependency-groups]
51
58
  dev = [
@@ -87,4 +94,5 @@ exclude = [
87
94
  ]
88
95
  [tool.uv-dynamic-versioning]
89
96
  fallback-version = "0.0.0"
90
- pattern = "default-unprefixed"
97
+ pattern-prefix = "extract-python-"
98
+ pattern = "default-unprefixed"