extract-python 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/constants.py +1 -1
- extract_python/docling_.py +27 -29
- extract_python/marker_.py +16 -25
- extract_python/miner_u.py +5 -18
- extract_python/utils.py +15 -2
- {extract_python-0.6.0.dist-info → extract_python-0.7.1.dist-info}/METADATA +3 -3
- extract_python-0.7.1.dist-info/RECORD +9 -0
- extract_python-0.6.0.dist-info/RECORD +0 -9
- {extract_python-0.6.0.dist-info → extract_python-0.7.1.dist-info}/WHEEL +0 -0
extract_python/constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
ARTIFACTS = "artifacts"
|
|
2
|
-
DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
|
|
2
|
+
DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
|
extract_python/docling_.py
CHANGED
|
@@ -23,7 +23,6 @@ from extract_core import (
|
|
|
23
23
|
InputDoc,
|
|
24
24
|
MarkdownDoc,
|
|
25
25
|
OutputFormat,
|
|
26
|
-
PageIndexes,
|
|
27
26
|
Pipeline,
|
|
28
27
|
PipelineType,
|
|
29
28
|
Result,
|
|
@@ -34,7 +33,7 @@ from pydantic import ConfigDict, field_serializer
|
|
|
34
33
|
from pydantic_core.core_schema import SerializerFunctionWrapHandler
|
|
35
34
|
|
|
36
35
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
37
|
-
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
|
|
36
|
+
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
|
|
38
37
|
|
|
39
38
|
logger = logging.getLogger(__name__)
|
|
40
39
|
|
|
@@ -115,39 +114,38 @@ def _to_markdown_doc(
|
|
|
115
114
|
raise FileExistsError(f"directory {md_dir} already exists")
|
|
116
115
|
# Let's avoid issue of duplicated input file names flattened top level
|
|
117
116
|
md_filename = md_dir_name + OutputFormat.MARKDOWN
|
|
118
|
-
total_length = 0
|
|
119
|
-
n_pages = len(res.pages)
|
|
120
|
-
|
|
121
117
|
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
|
|
122
118
|
tmp_dir = Path(td)
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
artifacts_dir=Path(ARTIFACTS),
|
|
134
|
-
**kwargs,
|
|
135
|
-
)
|
|
136
|
-
content = page_path.read_text()
|
|
137
|
-
if page_i > 0:
|
|
138
|
-
content += "\n"
|
|
139
|
-
if page_i < n_pages - 1:
|
|
140
|
-
content += page_sep
|
|
141
|
-
total_length += len(content)
|
|
142
|
-
end_indices.append(total_length)
|
|
143
|
-
f.write(content)
|
|
144
|
-
f.flush()
|
|
145
|
-
page_path.unlink()
|
|
119
|
+
md_path = tmp_dir / md_filename
|
|
120
|
+
current_page_path = tmp_dir / "page.md"
|
|
121
|
+
with chdir(tmp_dir):
|
|
122
|
+
# We do a chdir to bypass a Docling bug which only allows to maintain
|
|
123
|
+
# relative image ref when saving the markdown to a relative path
|
|
124
|
+
pages = _docling_pages_it(res, current_page_path, **kwargs)
|
|
125
|
+
with md_path.open("wb") as f:
|
|
126
|
+
pages = write_pages(pages, page_sep, f)
|
|
127
|
+
# Clean up the tmp page file before move everything to the end destination
|
|
128
|
+
current_page_path.unlink(missing_ok=True)
|
|
146
129
|
shutil.move(tmp_dir, md_dir)
|
|
147
|
-
pages = PageIndexes.from_page_end_indices(end_indices)
|
|
148
130
|
return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
|
149
131
|
|
|
150
132
|
|
|
133
|
+
def _docling_pages_it(
|
|
134
|
+
res: ConversionResult, output_path: Path, **kwargs
|
|
135
|
+
) -> Iterable[str]:
|
|
136
|
+
n_pages = len(res.pages)
|
|
137
|
+
for page_i in range(n_pages):
|
|
138
|
+
res.document.save_as_markdown(
|
|
139
|
+
output_path,
|
|
140
|
+
page_no=page_i + 1,
|
|
141
|
+
image_mode=ImageRefMode.REFERENCED,
|
|
142
|
+
artifacts_dir=Path(ARTIFACTS),
|
|
143
|
+
**kwargs,
|
|
144
|
+
)
|
|
145
|
+
content = output_path.read_text()
|
|
146
|
+
yield content
|
|
147
|
+
|
|
148
|
+
|
|
151
149
|
class SerializableFormatOptions(DoclingFormatOption):
|
|
152
150
|
# Utility class to serialize Python format options into a JSON which can be
|
|
153
151
|
# correctly deserialized into a docling FormatOption
|
extract_python/marker_.py
CHANGED
|
@@ -9,15 +9,14 @@ from extract_core import (
|
|
|
9
9
|
InputDoc,
|
|
10
10
|
MarkdownDoc,
|
|
11
11
|
OutputFormat,
|
|
12
|
-
PageIndexes,
|
|
13
12
|
Pipeline,
|
|
14
13
|
PipelineType,
|
|
15
14
|
Result,
|
|
16
15
|
Status,
|
|
17
16
|
)
|
|
18
17
|
|
|
19
|
-
from .constants import ARTIFACTS
|
|
20
|
-
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
18
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
19
|
+
from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
|
|
21
20
|
|
|
22
21
|
if TYPE_CHECKING:
|
|
23
22
|
from marker.converters.pdf import PdfConverter
|
|
@@ -63,7 +62,9 @@ async def _process_doc(
|
|
|
63
62
|
content, _, images = text_from_rendered(rendered)
|
|
64
63
|
match output_format:
|
|
65
64
|
case OutputFormat.MARKDOWN:
|
|
66
|
-
output = _to_markdown_doc(
|
|
65
|
+
output = _to_markdown_doc(
|
|
66
|
+
doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
|
|
67
|
+
)
|
|
67
68
|
case _:
|
|
68
69
|
raise NotImplementedError(f"unsupported output format {output_format}")
|
|
69
70
|
input_doc = doc.without_content()
|
|
@@ -71,7 +72,12 @@ async def _process_doc(
|
|
|
71
72
|
|
|
72
73
|
|
|
73
74
|
def _to_markdown_doc(
|
|
74
|
-
input_doc: InputDoc,
|
|
75
|
+
input_doc: InputDoc,
|
|
76
|
+
content: str,
|
|
77
|
+
images: dict[str, "Image"],
|
|
78
|
+
output_path: Path,
|
|
79
|
+
*,
|
|
80
|
+
page_sep: str = DEFAULT_MD_PAGE_SEP,
|
|
75
81
|
) -> MarkdownDoc:
|
|
76
82
|
from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
|
|
77
83
|
|
|
@@ -85,24 +91,9 @@ def _to_markdown_doc(
|
|
|
85
91
|
im.save(artifacts_dir / im_name)
|
|
86
92
|
del images
|
|
87
93
|
gc.collect()
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
md_path
|
|
92
|
-
|
|
93
|
-
)
|
|
94
|
-
total_length = 0
|
|
95
|
-
end_indices = []
|
|
96
|
-
with md_path.open("w", encoding="utf-8") as f:
|
|
97
|
-
for page_i, page_content in enumerate(content):
|
|
98
|
-
content = page_content
|
|
99
|
-
if page_i > 0:
|
|
100
|
-
content += "\n"
|
|
101
|
-
if page_i < n_pages - 1:
|
|
102
|
-
content += page_sep
|
|
103
|
-
total_length += len(content)
|
|
104
|
-
end_indices.append(total_length)
|
|
105
|
-
f.write(content)
|
|
106
|
-
f.flush()
|
|
107
|
-
pages = PageIndexes.from_page_end_indices(end_indices)
|
|
94
|
+
pages = content.split(MarkdownRenderer.page_separator)
|
|
95
|
+
md_path = output_path / md_dir_name / md_dir_name
|
|
96
|
+
md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
|
|
97
|
+
with md_path.open("wb") as f:
|
|
98
|
+
pages = write_pages(pages, page_sep, f)
|
|
108
99
|
return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
extract_python/miner_u.py
CHANGED
|
@@ -12,7 +12,6 @@ from extract_core import (
|
|
|
12
12
|
MinerUBackend,
|
|
13
13
|
MinerUPipelineConfig,
|
|
14
14
|
OutputFormat,
|
|
15
|
-
PageIndexes,
|
|
16
15
|
Pipeline,
|
|
17
16
|
PipelineType,
|
|
18
17
|
Result,
|
|
@@ -20,7 +19,7 @@ from extract_core import (
|
|
|
20
19
|
)
|
|
21
20
|
|
|
22
21
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
23
|
-
from .utils import path_to_artifacts_dirname, reset_env
|
|
22
|
+
from .utils import path_to_artifacts_dirname, reset_env, write_pages
|
|
24
23
|
|
|
25
24
|
_MINER_U_CONVERSION_ERRORS = tuple()
|
|
26
25
|
MDMakeFunction = Callable[[list, str, str], str | None]
|
|
@@ -148,21 +147,9 @@ def _dump_md_content(
|
|
|
148
147
|
|
|
149
148
|
if md_make_mode is None:
|
|
150
149
|
md_make_mode = MakeMode.MM_MD
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
n_pages = len(pdf_info)
|
|
155
|
-
for page_i, page in enumerate(pdf_info):
|
|
156
|
-
content = md_make_fn([page], md_make_mode, str(im_dir))
|
|
157
|
-
if page_i > 0:
|
|
158
|
-
content += "\n"
|
|
159
|
-
if page_i < n_pages - 1:
|
|
160
|
-
content += page_sep
|
|
161
|
-
total_length += len(content)
|
|
162
|
-
end_indices.append(total_length)
|
|
163
|
-
f.write(content)
|
|
164
|
-
f.flush()
|
|
165
|
-
end_indices = PageIndexes.from_page_end_indices(end_indices)
|
|
150
|
+
pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
|
|
151
|
+
with md_path.open("wb") as f:
|
|
152
|
+
pages = write_pages(pages, page_sep, f)
|
|
166
153
|
output_path = md_path.parent.relative_to(output_path)
|
|
167
|
-
output = ConversionOutput(path=output_path, pages=end_indices)
|
|
154
|
+
output = ConversionOutput(path=output_path, pages=pages)
|
|
168
155
|
return output
|
extract_python/utils.py
CHANGED
|
@@ -5,9 +5,9 @@ from copy import copy
|
|
|
5
5
|
from functools import wraps
|
|
6
6
|
from itertools import tee
|
|
7
7
|
from pathlib import Path, PurePath
|
|
8
|
-
from typing import Protocol, TypeVar
|
|
8
|
+
from typing import BinaryIO, Protocol, TypeVar
|
|
9
9
|
|
|
10
|
-
from extract_core import Error, InputDoc, Result, Status
|
|
10
|
+
from extract_core import Error, InputDoc, Pages, Result, Status
|
|
11
11
|
|
|
12
12
|
R = TypeVar("R")
|
|
13
13
|
In = TypeVar("In")
|
|
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
|
|
|
73
73
|
finally:
|
|
74
74
|
os.environ.clear()
|
|
75
75
|
os.environ.update(old_env)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
|
|
79
|
+
pages_byte_sizes = []
|
|
80
|
+
pages = iter(pages)
|
|
81
|
+
content = None
|
|
82
|
+
for p in pages:
|
|
83
|
+
if content:
|
|
84
|
+
pages_byte_sizes.append(out.write((content + page_sep).encode()))
|
|
85
|
+
content = p
|
|
86
|
+
if content:
|
|
87
|
+
pages_byte_sizes.append(out.write(content.encode()))
|
|
88
|
+
return Pages.from_pages_bytes_sizes(pages_byte_sizes)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-python
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.1
|
|
4
4
|
Summary: Structured content extraction
|
|
5
5
|
Project-URL: Homepage, https://github.com/ICIJ/extract-python
|
|
6
6
|
Project-URL: Repository, https://github.com/ICIJ/extract-python
|
|
7
7
|
Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
|
|
8
8
|
Author-email: Clément Doumouro <cdoumouro@icij.org>
|
|
9
|
-
Requires-Python: <3.
|
|
10
|
-
Requires-Dist: extract-core~=0.
|
|
9
|
+
Requires-Python: <3.15,>=3.13
|
|
10
|
+
Requires-Dist: extract-core~=0.7.0
|
|
11
11
|
Requires-Dist: icij-common~=0.8.2
|
|
12
12
|
Provides-Extra: benches
|
|
13
13
|
Requires-Dist: html2image~=2.0.7; extra == 'benches'
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
|
|
2
|
+
extract_python/constants.py,sha256=659V40LcTWJhX3IbuJLSSvI5AsGJh9ciMrGCfzJn2zA,98
|
|
3
|
+
extract_python/docling_.py,sha256=j1rVhKG7m1ef43VDsS6XGP0INPRY1Rcovzf1mjZ57tU,7352
|
|
4
|
+
extract_python/marker_.py,sha256=R_SXhqk5GmEWqJrYgg3tRdXKHms7n0FueNr-aOCDvLc,3358
|
|
5
|
+
extract_python/miner_u.py,sha256=MtXmnG-dFIGa3dXVrixfUU32yc88US0dhu7E3x6wQIM,5415
|
|
6
|
+
extract_python/utils.py,sha256=9IWW9_VVdUPHOHhdDgkXx16R1X1FPz8-nTBNYsLCFfA,2443
|
|
7
|
+
extract_python-0.7.1.dist-info/METADATA,sha256=zBt-q5GlvTXtkITwZgKRgqVWfkRhJxXPcLwOpucAEiY,1218
|
|
8
|
+
extract_python-0.7.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
extract_python-0.7.1.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
|
|
2
|
-
extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
|
|
3
|
-
extract_python/docling_.py,sha256=dRNnOEuVnGKmhtZpWXZ9PhoeCJwP-eAm3JBZrDQzeQc,7425
|
|
4
|
-
extract_python/marker_.py,sha256=ZXaZ11TkILnz5ChWDQP7yunBTRWZl2TgsigTTpA86v0,3697
|
|
5
|
-
extract_python/miner_u.py,sha256=YYqeOVDiYcyi31BUuGKJs77_FX1Zai9sxmhT4ELr15g,5826
|
|
6
|
-
extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
|
|
7
|
-
extract_python-0.6.0.dist-info/METADATA,sha256=NGBhFpPoTcIwvGyt5kjWGaIfy6NuP7fhWCZ2NkbNIP0,1218
|
|
8
|
-
extract_python-0.6.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
-
extract_python-0.6.0.dist-info/RECORD,,
|
|
File without changes
|