extract-python 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
1
  ARTIFACTS = "artifacts"
2
- DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
2
+ DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
@@ -23,7 +23,6 @@ from extract_core import (
23
23
  InputDoc,
24
24
  MarkdownDoc,
25
25
  OutputFormat,
26
- PageIndexes,
27
26
  Pipeline,
28
27
  PipelineType,
29
28
  Result,
@@ -34,7 +33,7 @@ from pydantic import ConfigDict, field_serializer
34
33
  from pydantic_core.core_schema import SerializerFunctionWrapHandler
35
34
 
36
35
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
37
- from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
36
+ from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
38
37
 
39
38
  logger = logging.getLogger(__name__)
40
39
 
@@ -115,39 +114,38 @@ def _to_markdown_doc(
115
114
  raise FileExistsError(f"directory {md_dir} already exists")
116
115
  # Let's avoid issue of duplicated input file names flattened top level
117
116
  md_filename = md_dir_name + OutputFormat.MARKDOWN
118
- total_length = 0
119
- n_pages = len(res.pages)
120
-
121
117
  with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
122
118
  tmp_dir = Path(td)
123
- page_path = Path("page.md")
124
- # We do a chdir to bypass a Docling bug which only allows to maintain relative
125
- # image ref when saving the markdown to a relative path
126
- with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
127
- end_indices = []
128
- for page_i in range(n_pages):
129
- res.document.save_as_markdown(
130
- page_path,
131
- page_no=page_i + 1,
132
- image_mode=ImageRefMode.REFERENCED,
133
- artifacts_dir=Path(ARTIFACTS),
134
- **kwargs,
135
- )
136
- content = page_path.read_text()
137
- if page_i > 0:
138
- content += "\n"
139
- if page_i < n_pages - 1:
140
- content += page_sep
141
- total_length += len(content)
142
- end_indices.append(total_length)
143
- f.write(content)
144
- f.flush()
145
- page_path.unlink()
119
+ md_path = tmp_dir / md_filename
120
+ current_page_path = tmp_dir / "page.md"
121
+ with chdir(tmp_dir):
122
+ # We do a chdir to bypass a Docling bug which only allows to maintain
123
+ # relative image ref when saving the markdown to a relative path
124
+ pages = _docling_pages_it(res, current_page_path, **kwargs)
125
+ with md_path.open("wb") as f:
126
+ pages = write_pages(pages, page_sep, f)
127
+ # Clean up the tmp page file before move everything to the end destination
128
+ current_page_path.unlink(missing_ok=True)
146
129
  shutil.move(tmp_dir, md_dir)
147
- pages = PageIndexes.from_page_end_indices(end_indices)
148
130
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
149
131
 
150
132
 
133
+ def _docling_pages_it(
134
+ res: ConversionResult, output_path: Path, **kwargs
135
+ ) -> Iterable[str]:
136
+ n_pages = len(res.pages)
137
+ for page_i in range(n_pages):
138
+ res.document.save_as_markdown(
139
+ output_path,
140
+ page_no=page_i + 1,
141
+ image_mode=ImageRefMode.REFERENCED,
142
+ artifacts_dir=Path(ARTIFACTS),
143
+ **kwargs,
144
+ )
145
+ content = output_path.read_text()
146
+ yield content
147
+
148
+
151
149
  class SerializableFormatOptions(DoclingFormatOption):
152
150
  # Utility class to serialize Python format options into a JSON which can be
153
151
  # correctly deserialized into a docling FormatOption
extract_python/marker_.py CHANGED
@@ -9,15 +9,14 @@ from extract_core import (
9
9
  InputDoc,
10
10
  MarkdownDoc,
11
11
  OutputFormat,
12
- PageIndexes,
13
12
  Pipeline,
14
13
  PipelineType,
15
14
  Result,
16
15
  Status,
17
16
  )
18
17
 
19
- from .constants import ARTIFACTS
20
- from .utils import path_to_artifacts_dirname, report_recoverable_errors
18
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
19
+ from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
21
20
 
22
21
  if TYPE_CHECKING:
23
22
  from marker.converters.pdf import PdfConverter
@@ -63,7 +62,9 @@ async def _process_doc(
63
62
  content, _, images = text_from_rendered(rendered)
64
63
  match output_format:
65
64
  case OutputFormat.MARKDOWN:
66
- output = _to_markdown_doc(doc, content, images, output_path)
65
+ output = _to_markdown_doc(
66
+ doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
67
+ )
67
68
  case _:
68
69
  raise NotImplementedError(f"unsupported output format {output_format}")
69
70
  input_doc = doc.without_content()
@@ -71,7 +72,12 @@ async def _process_doc(
71
72
 
72
73
 
73
74
  def _to_markdown_doc(
74
- input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
75
+ input_doc: InputDoc,
76
+ content: str,
77
+ images: dict[str, "Image"],
78
+ output_path: Path,
79
+ *,
80
+ page_sep: str = DEFAULT_MD_PAGE_SEP,
75
81
  ) -> MarkdownDoc:
76
82
  from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
77
83
 
@@ -85,24 +91,9 @@ def _to_markdown_doc(
85
91
  im.save(artifacts_dir / im_name)
86
92
  del images
87
93
  gc.collect()
88
- page_sep = MarkdownRenderer.page_separator
89
- content = content.split(page_sep)
90
- n_pages = len(content)
91
- md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
92
- OutputFormat.MARKDOWN.value
93
- )
94
- total_length = 0
95
- end_indices = []
96
- with md_path.open("w", encoding="utf-8") as f:
97
- for page_i, page_content in enumerate(content):
98
- content = page_content
99
- if page_i > 0:
100
- content += "\n"
101
- if page_i < n_pages - 1:
102
- content += page_sep
103
- total_length += len(content)
104
- end_indices.append(total_length)
105
- f.write(content)
106
- f.flush()
107
- pages = PageIndexes.from_page_end_indices(end_indices)
94
+ pages = content.split(MarkdownRenderer.page_separator)
95
+ md_path = output_path / md_dir_name / md_dir_name
96
+ md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
97
+ with md_path.open("wb") as f:
98
+ pages = write_pages(pages, page_sep, f)
108
99
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
extract_python/miner_u.py CHANGED
@@ -12,7 +12,6 @@ from extract_core import (
12
12
  MinerUBackend,
13
13
  MinerUPipelineConfig,
14
14
  OutputFormat,
15
- PageIndexes,
16
15
  Pipeline,
17
16
  PipelineType,
18
17
  Result,
@@ -20,7 +19,7 @@ from extract_core import (
20
19
  )
21
20
 
22
21
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
23
- from .utils import path_to_artifacts_dirname, reset_env
22
+ from .utils import path_to_artifacts_dirname, reset_env, write_pages
24
23
 
25
24
  _MINER_U_CONVERSION_ERRORS = tuple()
26
25
  MDMakeFunction = Callable[[list, str, str], str | None]
@@ -148,21 +147,9 @@ def _dump_md_content(
148
147
 
149
148
  if md_make_mode is None:
150
149
  md_make_mode = MakeMode.MM_MD
151
- total_length = 0
152
- end_indices = []
153
- with md_path.open("w") as f:
154
- n_pages = len(pdf_info)
155
- for page_i, page in enumerate(pdf_info):
156
- content = md_make_fn([page], md_make_mode, str(im_dir))
157
- if page_i > 0:
158
- content += "\n"
159
- if page_i < n_pages - 1:
160
- content += page_sep
161
- total_length += len(content)
162
- end_indices.append(total_length)
163
- f.write(content)
164
- f.flush()
165
- end_indices = PageIndexes.from_page_end_indices(end_indices)
150
+ pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
151
+ with md_path.open("wb") as f:
152
+ pages = write_pages(pages, page_sep, f)
166
153
  output_path = md_path.parent.relative_to(output_path)
167
- output = ConversionOutput(path=output_path, pages=end_indices)
154
+ output = ConversionOutput(path=output_path, pages=pages)
168
155
  return output
extract_python/utils.py CHANGED
@@ -5,9 +5,9 @@ from copy import copy
5
5
  from functools import wraps
6
6
  from itertools import tee
7
7
  from pathlib import Path, PurePath
8
- from typing import Protocol, TypeVar
8
+ from typing import BinaryIO, Protocol, TypeVar
9
9
 
10
- from extract_core import Error, InputDoc, Result, Status
10
+ from extract_core import Error, InputDoc, Pages, Result, Status
11
11
 
12
12
  R = TypeVar("R")
13
13
  In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
73
73
  finally:
74
74
  os.environ.clear()
75
75
  os.environ.update(old_env)
76
+
77
+
78
+ def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
79
+ pages_byte_sizes = []
80
+ pages = iter(pages)
81
+ content = None
82
+ for p in pages:
83
+ if content:
84
+ pages_byte_sizes.append(out.write((content + page_sep).encode()))
85
+ content = p
86
+ if content:
87
+ pages_byte_sizes.append(out.write(content.encode()))
88
+ return Pages.from_pages_bytes_sizes(pages_byte_sizes)
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.6.0
3
+ Version: 0.7.1
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
- Requires-Python: <3.14,>=3.11
10
- Requires-Dist: extract-core~=0.6.0
9
+ Requires-Python: <3.15,>=3.13
10
+ Requires-Dist: extract-core~=0.7.0
11
11
  Requires-Dist: icij-common~=0.8.2
12
12
  Provides-Extra: benches
13
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -0,0 +1,9 @@
1
+ extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
2
+ extract_python/constants.py,sha256=659V40LcTWJhX3IbuJLSSvI5AsGJh9ciMrGCfzJn2zA,98
3
+ extract_python/docling_.py,sha256=j1rVhKG7m1ef43VDsS6XGP0INPRY1Rcovzf1mjZ57tU,7352
4
+ extract_python/marker_.py,sha256=R_SXhqk5GmEWqJrYgg3tRdXKHms7n0FueNr-aOCDvLc,3358
5
+ extract_python/miner_u.py,sha256=MtXmnG-dFIGa3dXVrixfUU32yc88US0dhu7E3x6wQIM,5415
6
+ extract_python/utils.py,sha256=9IWW9_VVdUPHOHhdDgkXx16R1X1FPz8-nTBNYsLCFfA,2443
7
+ extract_python-0.7.1.dist-info/METADATA,sha256=zBt-q5GlvTXtkITwZgKRgqVWfkRhJxXPcLwOpucAEiY,1218
8
+ extract_python-0.7.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ extract_python-0.7.1.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
2
- extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
3
- extract_python/docling_.py,sha256=dRNnOEuVnGKmhtZpWXZ9PhoeCJwP-eAm3JBZrDQzeQc,7425
4
- extract_python/marker_.py,sha256=ZXaZ11TkILnz5ChWDQP7yunBTRWZl2TgsigTTpA86v0,3697
5
- extract_python/miner_u.py,sha256=YYqeOVDiYcyi31BUuGKJs77_FX1Zai9sxmhT4ELr15g,5826
6
- extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
7
- extract_python-0.6.0.dist-info/METADATA,sha256=NGBhFpPoTcIwvGyt5kjWGaIfy6NuP7fhWCZ2NkbNIP0,1218
8
- extract_python-0.6.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
- extract_python-0.6.0.dist-info/RECORD,,