extract-python 0.5.15__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
1
  ARTIFACTS = "artifacts"
2
- DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
2
+ DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
@@ -8,7 +8,6 @@ from functools import partial
8
8
  from pathlib import Path
9
9
  from typing import Any, Self
10
10
 
11
- from docling.datamodel.base_models import InputFormat
12
11
  from docling.datamodel.document import ConversionResult
13
12
  from docling.datamodel.pipeline_options import PipelineOptions
14
13
  from docling.document_converter import DocumentConverter, FormatOption
@@ -18,26 +17,23 @@ from docling_core.types.doc import ImageRefMode
18
17
  from docling_core.types.io import DocumentStream
19
18
  from extract_core import (
20
19
  BaseModel,
21
- Device,
22
20
  DoclingFormatOption,
23
21
  DoclingPipelineConfig,
24
22
  Error,
25
23
  InputDoc,
26
24
  MarkdownDoc,
27
25
  OutputFormat,
28
- PageIndexes,
29
26
  Pipeline,
30
27
  PipelineType,
31
28
  Result,
32
29
  Status,
33
30
  )
34
31
  from icij_common.pydantic_utils import merge_configs
35
- from icij_common.registrable import FromConfig
36
32
  from pydantic import ConfigDict, field_serializer
37
33
  from pydantic_core.core_schema import SerializerFunctionWrapHandler
38
34
 
39
35
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
40
- from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
36
+ from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
41
37
 
42
38
  logger = logging.getLogger(__name__)
43
39
 
@@ -46,16 +42,12 @@ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "mode
46
42
 
47
43
  @Pipeline.register(PipelineType.DOCLING)
48
44
  class DoclingPipeline(Pipeline):
49
- def __init__(
50
- self,
51
- format_options: dict["InputFormat", DoclingFormatOption] | None = None,
52
- *,
53
- device: Device = Device.CPU,
54
- ):
55
- super().__init__(device)
56
- format_options = dict()
57
- for k, v in format_options.items():
58
- format_options[k] = v.to_docling(self._device)
45
+ def __init__(self, config: DoclingPipelineConfig):
46
+ super().__init__(config)
47
+ format_options = {
48
+ k: v.to_docling(self._device)
49
+ for k, v in self._config.format_options.items()
50
+ }
59
51
  logger.info(
60
52
  "resolved format options to: %s",
61
53
  lambda: partial(json.dumps, format_options, indent=2),
@@ -81,15 +73,6 @@ class DoclingPipeline(Pipeline):
81
73
  doc = next(docs)
82
74
  yield _to_result(res, doc, output_format, output_path=output_path)
83
75
 
84
- @classmethod
85
- def _from_config(
86
- cls,
87
- config: DoclingPipelineConfig,
88
- *,
89
- device: Device = Device.CPU,
90
- ) -> FromConfig:
91
- return cls(config.format_options, device=device)
92
-
93
76
 
94
77
  def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
95
78
  for d in docs:
@@ -131,39 +114,38 @@ def _to_markdown_doc(
131
114
  raise FileExistsError(f"directory {md_dir} already exists")
132
115
  # Let's avoid issue of duplicated input file names flattened top level
133
116
  md_filename = md_dir_name + OutputFormat.MARKDOWN
134
- total_length = 0
135
- n_pages = len(res.pages)
136
-
137
117
  with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
138
118
  tmp_dir = Path(td)
139
- page_path = Path("page.md")
140
- # We do a chdir to bypass a Docling bug which only allows to maintain relative
141
- # image ref when saving the markdown to a relative path
142
- with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
143
- end_indices = []
144
- for page_i in range(n_pages):
145
- res.document.save_as_markdown(
146
- page_path,
147
- page_no=page_i + 1,
148
- image_mode=ImageRefMode.REFERENCED,
149
- artifacts_dir=Path(ARTIFACTS),
150
- **kwargs,
151
- )
152
- content = page_path.read_text()
153
- if page_i > 0:
154
- content += "\n"
155
- if page_i < n_pages - 1:
156
- content += page_sep
157
- total_length += len(content)
158
- end_indices.append(total_length)
159
- f.write(content)
160
- f.flush()
161
- page_path.unlink()
119
+ md_path = tmp_dir / md_filename
120
+ current_page_path = tmp_dir / "page.md"
121
+ with chdir(tmp_dir):
122
+ # We do a chdir to bypass a Docling bug which only allows to maintain
123
+ # relative image ref when saving the markdown to a relative path
124
+ pages = _docling_pages_it(res, current_page_path, **kwargs)
125
+ with md_path.open("wb") as f:
126
+ pages = write_pages(pages, page_sep, f)
127
+ # Clean up the tmp page file before move everything to the end destination
128
+ current_page_path.unlink(missing_ok=True)
162
129
  shutil.move(tmp_dir, md_dir)
163
- pages = PageIndexes.from_page_end_indices(end_indices)
164
130
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
165
131
 
166
132
 
133
+ def _docling_pages_it(
134
+ res: ConversionResult, output_path: Path, **kwargs
135
+ ) -> Iterable[str]:
136
+ n_pages = len(res.pages)
137
+ for page_i in range(n_pages):
138
+ res.document.save_as_markdown(
139
+ output_path,
140
+ page_no=page_i + 1,
141
+ image_mode=ImageRefMode.REFERENCED,
142
+ artifacts_dir=Path(ARTIFACTS),
143
+ **kwargs,
144
+ )
145
+ content = output_path.read_text()
146
+ yield content
147
+
148
+
167
149
  class SerializableFormatOptions(DoclingFormatOption):
168
150
  # Utility class to serialize Python format options into a JSON which can be
169
151
  # correctly deserialized into a docling FormatOption
extract_python/marker_.py CHANGED
@@ -2,82 +2,32 @@ import asyncio
2
2
  import gc
3
3
  from collections.abc import AsyncGenerator, Iterable
4
4
  from copy import deepcopy
5
- from functools import cache
6
5
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, ClassVar, Self
6
+ from typing import TYPE_CHECKING
8
7
 
9
8
  from extract_core import (
10
- BasePipelineConfig,
11
- Device,
12
9
  InputDoc,
13
10
  MarkdownDoc,
14
11
  OutputFormat,
15
- PageIndexes,
16
12
  Pipeline,
17
13
  PipelineType,
18
14
  Result,
19
15
  Status,
20
- SupportedExt,
21
16
  )
22
- from pydantic import Field
23
17
 
24
- from .constants import ARTIFACTS
25
- from .utils import path_to_artifacts_dirname, report_recoverable_errors
18
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
19
+ from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
26
20
 
27
21
  if TYPE_CHECKING:
28
22
  from marker.converters.pdf import PdfConverter
29
23
  from PIL import Image
30
24
 
31
25
 
32
- class MarkerPipelineConfig(BasePipelineConfig):
33
- pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
34
-
35
- config: dict[str, Any] = Field(default_factory=dict)
36
-
37
- @classmethod
38
- @cache
39
- def supported_exts(cls) -> set[SupportedExt]:
40
- # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
41
- return {
42
- SupportedExt.PDF,
43
- SupportedExt.XLS,
44
- SupportedExt.XLSX,
45
- SupportedExt.XLSM,
46
- SupportedExt.CSV,
47
- SupportedExt.ODS,
48
- SupportedExt.DOC,
49
- SupportedExt.DOCX,
50
- SupportedExt.ODT,
51
- SupportedExt.PPT,
52
- SupportedExt.PPTX,
53
- SupportedExt.ODP,
54
- SupportedExt.HTLM,
55
- SupportedExt.EPUB,
56
- SupportedExt.PNG,
57
- SupportedExt.JPG,
58
- SupportedExt.JPEG,
59
- SupportedExt.WEBP,
60
- SupportedExt.GIF,
61
- SupportedExt.TIFF,
62
- }
63
-
64
-
65
26
  _MARKER_CONVERSION_ERRORS = tuple()
66
27
 
67
28
 
68
29
  @Pipeline.register(PipelineType.MARKER)
69
30
  class MarkerPipeline(Pipeline):
70
- def __init__(
71
- self,
72
- marker_config: dict[str, Any] | None = None,
73
- *,
74
- device: Device = Device.CPU,
75
- ):
76
- super().__init__(device)
77
- if marker_config is None:
78
- marker_config = dict()
79
- self._marker_config = marker_config
80
-
81
31
  async def extract_content(
82
32
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
83
33
  ) -> AsyncGenerator[Result, None]:
@@ -85,7 +35,7 @@ class MarkerPipeline(Pipeline):
85
35
  from marker.converters.pdf import PdfConverter # noqa: PLC0415
86
36
  from marker.models import create_model_dict # noqa: PLC0415
87
37
 
88
- config = deepcopy(self._marker_config)
38
+ config = deepcopy(self._config.config)
89
39
  config["output_format"] = output_format.to_marker()
90
40
  config_parser = ConfigParser(config)
91
41
  renderer = config_parser.get_renderer()
@@ -98,15 +48,6 @@ class MarkerPipeline(Pipeline):
98
48
  for doc in docs:
99
49
  yield await _process_doc(doc, converter, output_format, output_path)
100
50
 
101
- @classmethod
102
- def _from_config(
103
- cls,
104
- config: MarkerPipelineConfig,
105
- *,
106
- device: Device = Device.CPU,
107
- ) -> Self:
108
- return cls(config.config, device=device)
109
-
110
51
 
111
52
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
112
53
  async def _process_doc(
@@ -121,7 +62,9 @@ async def _process_doc(
121
62
  content, _, images = text_from_rendered(rendered)
122
63
  match output_format:
123
64
  case OutputFormat.MARKDOWN:
124
- output = _to_markdown_doc(doc, content, images, output_path)
65
+ output = _to_markdown_doc(
66
+ doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
67
+ )
125
68
  case _:
126
69
  raise NotImplementedError(f"unsupported output format {output_format}")
127
70
  input_doc = doc.without_content()
@@ -129,7 +72,12 @@ async def _process_doc(
129
72
 
130
73
 
131
74
  def _to_markdown_doc(
132
- input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
75
+ input_doc: InputDoc,
76
+ content: str,
77
+ images: dict[str, "Image"],
78
+ output_path: Path,
79
+ *,
80
+ page_sep: str = DEFAULT_MD_PAGE_SEP,
133
81
  ) -> MarkdownDoc:
134
82
  from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
135
83
 
@@ -143,24 +91,9 @@ def _to_markdown_doc(
143
91
  im.save(artifacts_dir / im_name)
144
92
  del images
145
93
  gc.collect()
146
- page_sep = MarkdownRenderer.page_separator
147
- content = content.split(page_sep)
148
- n_pages = len(content)
149
- md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
150
- OutputFormat.MARKDOWN.value
151
- )
152
- total_length = 0
153
- end_indices = []
154
- with md_path.open("w", encoding="utf-8") as f:
155
- for page_i, page_content in enumerate(content):
156
- content = page_content
157
- if page_i > 0:
158
- content += "\n"
159
- if page_i < n_pages - 1:
160
- content += page_sep
161
- total_length += len(content)
162
- end_indices.append(total_length)
163
- f.write(content)
164
- f.flush()
165
- pages = PageIndexes.from_page_end_indices(end_indices)
94
+ pages = content.split(MarkdownRenderer.page_separator)
95
+ md_path = output_path / md_dir_name / md_dir_name
96
+ md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
97
+ with md_path.open("wb") as f:
98
+ pages = write_pages(pages, page_sep, f)
166
99
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
extract_python/miner_u.py CHANGED
@@ -5,17 +5,13 @@ from collections.abc import AsyncGenerator, Callable, Iterable
5
5
  from functools import partial
6
6
  from pathlib import Path
7
7
  from tempfile import TemporaryDirectory
8
- from typing import Self
9
8
 
10
9
  from extract_core import (
11
10
  ConversionOutput,
12
- Device,
13
11
  InputDoc,
14
12
  MinerUBackend,
15
- MinerUConfig,
16
13
  MinerUPipelineConfig,
17
14
  OutputFormat,
18
- PageIndexes,
19
15
  Pipeline,
20
16
  PipelineType,
21
17
  Result,
@@ -23,7 +19,7 @@ from extract_core import (
23
19
  )
24
20
 
25
21
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
26
- from .utils import path_to_artifacts_dirname, reset_env
22
+ from .utils import path_to_artifacts_dirname, reset_env, write_pages
27
23
 
28
24
  _MINER_U_CONVERSION_ERRORS = tuple()
29
25
  MDMakeFunction = Callable[[list, str, str], str | None]
@@ -31,13 +27,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
31
27
 
32
28
  @Pipeline.register(PipelineType.MINER_U)
33
29
  class MinerUPipeline(Pipeline):
34
- def __init__(
35
- self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
36
- ):
37
- super().__init__(device)
38
- self._config = config
39
- self._language = language
40
- self._md_make_fn = _parse_md_make_fn(config.backend)
30
+ def __init__(self, config: MinerUPipelineConfig):
31
+ super().__init__(config)
32
+ self._language = self._config.language
33
+ self._md_make_fn = _parse_md_make_fn(self._config.config.backend)
41
34
 
42
35
  async def extract_content(
43
36
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
@@ -59,7 +52,7 @@ class MinerUPipeline(Pipeline):
59
52
  pdf_file_names=pdfs_names,
60
53
  pdf_bytes_list=pdfs_bytes,
61
54
  p_lang_list=p_lang_list,
62
- **self._config.as_parse_kwargs(),
55
+ **self._config.config.as_parse_kwargs(),
63
56
  )
64
57
  res_paths = [
65
58
  _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
@@ -73,15 +66,6 @@ class MinerUPipeline(Pipeline):
73
66
  output_path=output_path,
74
67
  )
75
68
 
76
- @classmethod
77
- def _from_config(
78
- cls,
79
- config: MinerUPipelineConfig,
80
- *,
81
- device: Device = Device.CPU,
82
- ) -> Self:
83
- return cls(config.config, language=config.language, device=device)
84
-
85
69
 
86
70
  def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
87
71
  output_path = output_dir / pdf_filename
@@ -163,21 +147,9 @@ def _dump_md_content(
163
147
 
164
148
  if md_make_mode is None:
165
149
  md_make_mode = MakeMode.MM_MD
166
- total_length = 0
167
- end_indices = []
168
- with md_path.open("w") as f:
169
- n_pages = len(pdf_info)
170
- for page_i, page in enumerate(pdf_info):
171
- content = md_make_fn([page], md_make_mode, str(im_dir))
172
- if page_i > 0:
173
- content += "\n"
174
- if page_i < n_pages - 1:
175
- content += page_sep
176
- total_length += len(content)
177
- end_indices.append(total_length)
178
- f.write(content)
179
- f.flush()
180
- end_indices = PageIndexes.from_page_end_indices(end_indices)
150
+ pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
151
+ with md_path.open("wb") as f:
152
+ pages = write_pages(pages, page_sep, f)
181
153
  output_path = md_path.parent.relative_to(output_path)
182
- output = ConversionOutput(path=output_path, pages=end_indices)
154
+ output = ConversionOutput(path=output_path, pages=pages)
183
155
  return output
extract_python/utils.py CHANGED
@@ -5,9 +5,9 @@ from copy import copy
5
5
  from functools import wraps
6
6
  from itertools import tee
7
7
  from pathlib import Path, PurePath
8
- from typing import Protocol, TypeVar
8
+ from typing import BinaryIO, Protocol, TypeVar
9
9
 
10
- from extract_core import Error, InputDoc, Result, Status
10
+ from extract_core import Error, InputDoc, Pages, Result, Status
11
11
 
12
12
  R = TypeVar("R")
13
13
  In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
73
73
  finally:
74
74
  os.environ.clear()
75
75
  os.environ.update(old_env)
76
+
77
+
78
+ def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
79
+ pages_byte_sizes = []
80
+ pages = iter(pages)
81
+ content = None
82
+ for p in pages:
83
+ if content:
84
+ pages_byte_sizes.append(out.write((content + page_sep).encode()))
85
+ content = p
86
+ if content:
87
+ pages_byte_sizes.append(out.write(content.encode()))
88
+ return Pages.from_pages_bytes_sizes(pages_byte_sizes)
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.5.15
3
+ Version: 0.7.0
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
9
  Requires-Python: <3.14,>=3.11
10
- Requires-Dist: extract-core~=0.5.5
10
+ Requires-Dist: extract-core~=0.6.0
11
11
  Requires-Dist: icij-common~=0.8.2
12
12
  Provides-Extra: benches
13
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -0,0 +1,9 @@
1
+ extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
2
+ extract_python/constants.py,sha256=659V40LcTWJhX3IbuJLSSvI5AsGJh9ciMrGCfzJn2zA,98
3
+ extract_python/docling_.py,sha256=j1rVhKG7m1ef43VDsS6XGP0INPRY1Rcovzf1mjZ57tU,7352
4
+ extract_python/marker_.py,sha256=R_SXhqk5GmEWqJrYgg3tRdXKHms7n0FueNr-aOCDvLc,3358
5
+ extract_python/miner_u.py,sha256=MtXmnG-dFIGa3dXVrixfUU32yc88US0dhu7E3x6wQIM,5415
6
+ extract_python/utils.py,sha256=9IWW9_VVdUPHOHhdDgkXx16R1X1FPz8-nTBNYsLCFfA,2443
7
+ extract_python-0.7.0.dist-info/METADATA,sha256=my-lfG6yqNEat77SC6mAfFerRRmTtksQMKYwHsg8aVE,1218
8
+ extract_python-0.7.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ extract_python-0.7.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
2
- extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
3
- extract_python/docling_.py,sha256=1ujMmtD63RaSdR1gvWbQAm396JODj44uBWtz9M4cFyI,7864
4
- extract_python/marker_.py,sha256=oxN1unJ9x8YW5jds1STCc2wvQ30KzQNy3dXbCIuTuQc,5311
5
- extract_python/miner_u.py,sha256=Ien3H7vZXLCACVjSMP2NAiog7yvvPq7oGgLGcfLZfpA,6159
6
- extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
7
- extract_python-0.5.15.dist-info/METADATA,sha256=S3upxGMF81cp6kMaqteJJ5gMBmQ2dQe4Xcil8DGq8s0,1219
8
- extract_python-0.5.15.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
- extract_python-0.5.15.dist-info/RECORD,,