extract-python 0.5.5__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/docling_.py +80 -6
- extract_python/marker_.py +20 -7
- extract_python/miner_u.py +41 -29
- extract_python/utils.py +11 -0
- {extract_python-0.5.5.dist-info → extract_python-0.5.8.dist-info}/METADATA +2 -2
- extract_python-0.5.8.dist-info/RECORD +9 -0
- extract_python-0.5.5.dist-info/RECORD +0 -9
- {extract_python-0.5.5.dist-info → extract_python-0.5.8.dist-info}/WHEEL +0 -0
extract_python/docling_.py
CHANGED
|
@@ -1,16 +1,20 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import shutil
|
|
2
3
|
import tempfile
|
|
3
4
|
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
5
|
from pathlib import Path
|
|
6
|
+
from typing import Any, Self
|
|
5
7
|
|
|
6
8
|
from docling.datamodel.base_models import InputFormat
|
|
7
9
|
from docling.datamodel.document import ConversionResult
|
|
8
|
-
from docling.
|
|
10
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
|
11
|
+
from docling.document_converter import DocumentConverter, FormatOption
|
|
9
12
|
|
|
10
13
|
# TODO: this import is slow to load; improve it
|
|
11
14
|
from docling_core.types.doc import ImageRefMode
|
|
12
15
|
from docling_core.types.io import DocumentStream
|
|
13
16
|
from extract_core import (
|
|
17
|
+
BaseModel,
|
|
14
18
|
DoclingFormatOption,
|
|
15
19
|
DoclingPipelineConfig,
|
|
16
20
|
Error,
|
|
@@ -23,7 +27,11 @@ from extract_core import (
|
|
|
23
27
|
Result,
|
|
24
28
|
Status,
|
|
25
29
|
)
|
|
30
|
+
from extract_core.objects import Device
|
|
31
|
+
from icij_common.pydantic_utils import merge_configs
|
|
26
32
|
from icij_common.registrable import FromConfig
|
|
33
|
+
from pydantic import ConfigDict, field_serializer
|
|
34
|
+
from pydantic_core.core_schema import SerializerFunctionWrapHandler
|
|
27
35
|
|
|
28
36
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
29
37
|
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
|
|
@@ -34,9 +42,15 @@ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "mode
|
|
|
34
42
|
@Pipeline.register(PipelineType.DOCLING)
|
|
35
43
|
class DoclingPipeline(Pipeline):
|
|
36
44
|
def __init__(
|
|
37
|
-
self,
|
|
45
|
+
self,
|
|
46
|
+
format_options: dict["InputFormat", DoclingFormatOption] | None = None,
|
|
47
|
+
*,
|
|
48
|
+
device: Device = Device.CPU,
|
|
38
49
|
):
|
|
39
|
-
|
|
50
|
+
super().__init__(device)
|
|
51
|
+
format_options = {
|
|
52
|
+
k: v.to_docling(self._device) for k, v in format_options.items()
|
|
53
|
+
}
|
|
40
54
|
allowed_format = [
|
|
41
55
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
42
56
|
]
|
|
@@ -49,12 +63,23 @@ class DoclingPipeline(Pipeline):
|
|
|
49
63
|
) -> AsyncGenerator[Result, None]:
|
|
50
64
|
docs, path_or_streams = map_and_preserve(_to_docling, docs)
|
|
51
65
|
outputs = self._converter.convert_all(path_or_streams, raises_on_error=False)
|
|
52
|
-
|
|
66
|
+
|
|
67
|
+
sentinel = object()
|
|
68
|
+
while True:
|
|
69
|
+
res = await asyncio.to_thread(next, outputs, sentinel)
|
|
70
|
+
if res is sentinel:
|
|
71
|
+
return
|
|
72
|
+
doc = next(docs)
|
|
53
73
|
yield _to_result(res, doc, output_format, output_path=output_path)
|
|
54
74
|
|
|
55
75
|
@classmethod
|
|
56
|
-
def _from_config(
|
|
57
|
-
|
|
76
|
+
def _from_config(
|
|
77
|
+
cls,
|
|
78
|
+
config: DoclingPipelineConfig,
|
|
79
|
+
*,
|
|
80
|
+
device: Device = Device.CPU,
|
|
81
|
+
) -> FromConfig:
|
|
82
|
+
return cls(config.format_options, device=device)
|
|
58
83
|
|
|
59
84
|
|
|
60
85
|
def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
|
@@ -128,3 +153,52 @@ def _to_markdown_doc(
|
|
|
128
153
|
shutil.move(tmp_dir, md_dir)
|
|
129
154
|
pages = PageIndexes.from_page_end_indices(end_indices)
|
|
130
155
|
return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class SerializableFormatOptions(DoclingFormatOption):
    # Utility class to serialize Python format options into a JSON which can be
    # correctly deserialized into a docling FormatOption
    # via DoclingFormatOption.to_docling
    model_config = merge_configs(
        BaseModel.model_config, ConfigDict(polymorphic_serialization=True)
    )

    pipeline_options: PipelineOptions | None = None

    @classmethod
    def from_docling(cls, format_opts: FormatOption) -> Self:
        """Build a serializable copy of a docling ``FormatOption``.

        The pipeline and backend classes are stored by class name
        (``__name__``); the pipeline and backend options are passed through
        unchanged.
        """
        return cls(
            pipeline_cls=format_opts.pipeline_cls.__name__,
            pipeline_options=format_opts.pipeline_options,
            backend=format_opts.backend.__name__,
            backend_options=format_opts.backend_options,
        )

    @field_serializer("pipeline_options", mode="wrap")
    def _serialize_pipeline_opts(
        self, v: PipelineOptions | None, handler: SerializerFunctionWrapHandler
    ) -> Any:
        """Serialize ``pipeline_options`` and tag nested options with ``kind``.

        The default serialization is produced by ``handler``. For each nested
        option group found on ``v`` with a non-``None`` value, the group's
        ``kind`` attribute is written into the matching entry of the
        serialized output (the entry is created if the handler did not emit
        it).

        NOTE(review): presumably ``kind`` is needed to pick the right option
        class on deserialization and is not emitted by the default
        serializer -- confirm against docling.
        """
        serialized = handler(v)
        if v is None:
            return serialized
        # One loop instead of four copy-pasted stanzas: the same name is used
        # to read the attribute and to index the serialized output, so the
        # two can no longer drift apart (the previous layout stanza wrote to
        # a "layout_opts" key that was never created and raised KeyError).
        nested_option_names = (
            "picture_description_options",
            "ocr_options",
            "layout_options",
            "table_structure_options",
        )
        for name in nested_option_names:
            nested_opts = getattr(v, name, None)
            if nested_opts is None:
                continue
            serialized.setdefault(name, {})["kind"] = nested_opts.kind
        return serialized
|
extract_python/marker_.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import gc
|
|
2
3
|
from collections.abc import AsyncGenerator, Iterable
|
|
3
4
|
from copy import deepcopy
|
|
@@ -7,6 +8,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
|
|
|
7
8
|
|
|
8
9
|
from extract_core import BasePipelineConfig, Pipeline, PipelineType
|
|
9
10
|
from extract_core.objects import (
|
|
11
|
+
Device,
|
|
10
12
|
InputDoc,
|
|
11
13
|
MarkdownDoc,
|
|
12
14
|
OutputFormat,
|
|
@@ -63,7 +65,13 @@ _MARKER_CONVERSION_ERRORS = tuple()
|
|
|
63
65
|
|
|
64
66
|
@Pipeline.register(PipelineType.MARKER)
|
|
65
67
|
class MarkerPipeline(Pipeline):
|
|
66
|
-
def __init__(
|
|
68
|
+
def __init__(
|
|
69
|
+
self,
|
|
70
|
+
marker_config: dict[str, Any] | None = None,
|
|
71
|
+
*,
|
|
72
|
+
device: Device = Device.CPU,
|
|
73
|
+
):
|
|
74
|
+
super().__init__(device)
|
|
67
75
|
if marker_config is None:
|
|
68
76
|
marker_config = dict()
|
|
69
77
|
self._marker_config = marker_config
|
|
@@ -81,20 +89,25 @@ class MarkerPipeline(Pipeline):
|
|
|
81
89
|
renderer = config_parser.get_renderer()
|
|
82
90
|
converter = PdfConverter(
|
|
83
91
|
config=config_parser.generate_config_dict(),
|
|
84
|
-
artifact_dict=create_model_dict(),
|
|
92
|
+
artifact_dict=create_model_dict(device=self._device),
|
|
85
93
|
processor_list=config_parser.get_processors(),
|
|
86
94
|
renderer=renderer,
|
|
87
95
|
)
|
|
88
96
|
for doc in docs:
|
|
89
|
-
yield _process_doc(doc, converter, output_format, output_path)
|
|
97
|
+
yield await _process_doc(doc, converter, output_format, output_path)
|
|
90
98
|
|
|
91
99
|
@classmethod
|
|
92
|
-
def _from_config(
|
|
93
|
-
|
|
100
|
+
def _from_config(
|
|
101
|
+
cls,
|
|
102
|
+
config: MarkerPipelineConfig,
|
|
103
|
+
*,
|
|
104
|
+
device: Device = Device.CPU,
|
|
105
|
+
) -> Self:
|
|
106
|
+
return cls(config.config, device=device)
|
|
94
107
|
|
|
95
108
|
|
|
96
109
|
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
|
|
97
|
-
def _process_doc(
|
|
110
|
+
async def _process_doc(
|
|
98
111
|
doc: InputDoc,
|
|
99
112
|
converter: "PdfConverter",
|
|
100
113
|
output_format: OutputFormat,
|
|
@@ -102,7 +115,7 @@ def _process_doc(
|
|
|
102
115
|
) -> Result:
|
|
103
116
|
from marker.output import text_from_rendered # noqa: PLC0415
|
|
104
117
|
|
|
105
|
-
rendered = converter
|
|
118
|
+
rendered = await asyncio.to_thread(converter, str(doc.path))
|
|
106
119
|
content, _, images = text_from_rendered(rendered)
|
|
107
120
|
match output_format:
|
|
108
121
|
case OutputFormat.MARKDOWN:
|
extract_python/miner_u.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
2
3
|
import shutil
|
|
3
4
|
from collections.abc import AsyncGenerator, Callable, Iterable
|
|
4
5
|
from functools import partial
|
|
@@ -19,9 +20,10 @@ from extract_core import (
|
|
|
19
20
|
Result,
|
|
20
21
|
Status,
|
|
21
22
|
)
|
|
23
|
+
from objects import Device
|
|
22
24
|
|
|
23
25
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
24
|
-
from .utils import path_to_artifacts_dirname
|
|
26
|
+
from .utils import path_to_artifacts_dirname, reset_env
|
|
25
27
|
|
|
26
28
|
_MINER_U_CONVERSION_ERRORS = tuple()
|
|
27
29
|
MDMakeFunction = Callable[[list, str, str], str | None]
|
|
@@ -29,7 +31,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
|
|
|
29
31
|
|
|
30
32
|
@Pipeline.register(PipelineType.MINER_U)
|
|
31
33
|
class MinerUPipeline(Pipeline):
|
|
32
|
-
def __init__(
|
|
34
|
+
def __init__(
|
|
35
|
+
self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
|
|
36
|
+
):
|
|
37
|
+
super().__init__(device)
|
|
33
38
|
self._config = config
|
|
34
39
|
self._language = language
|
|
35
40
|
self._md_make_fn = _parse_md_make_fn(config.backend)
|
|
@@ -39,36 +44,43 @@ class MinerUPipeline(Pipeline):
|
|
|
39
44
|
) -> AsyncGenerator[Result, None]:
|
|
40
45
|
from mineru.cli.common import aio_do_parse # noqa: PLC0415
|
|
41
46
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
_revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
|
|
59
|
-
]
|
|
60
|
-
for doc, res_path in zip(docs, res_paths, strict=True):
|
|
61
|
-
yield _process_doc(
|
|
62
|
-
doc,
|
|
63
|
-
md_make_fn=self._md_make_fn,
|
|
64
|
-
res_path=res_path,
|
|
65
|
-
output_format=output_format,
|
|
66
|
-
output_path=output_path,
|
|
47
|
+
with reset_env():
|
|
48
|
+
os.environ["MINERU_DEVICE_MODE"] = self._device
|
|
49
|
+
docs = list(docs)
|
|
50
|
+
# TODO: exclude files which are not pdf and return an error
|
|
51
|
+
pdfs_bytes = [d.path.read_bytes() for d in docs]
|
|
52
|
+
pdfs_names = [d.path.name for d in docs]
|
|
53
|
+
p_lang_list = [self._language for _ in pdfs_names]
|
|
54
|
+
# TODO: we should only process valid PDFs
|
|
55
|
+
with TemporaryDirectory(prefix="mineru-") as workdir:
|
|
56
|
+
workdir = Path(workdir) # noqa: PLW2901
|
|
57
|
+
await aio_do_parse(
|
|
58
|
+
output_dir=workdir,
|
|
59
|
+
pdf_file_names=pdfs_names,
|
|
60
|
+
pdf_bytes_list=pdfs_bytes,
|
|
61
|
+
p_lang_list=p_lang_list,
|
|
62
|
+
**self._config.as_parse_kwargs(),
|
|
67
63
|
)
|
|
64
|
+
res_paths = [
|
|
65
|
+
_revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
|
|
66
|
+
]
|
|
67
|
+
for doc, res_path in zip(docs, res_paths, strict=True):
|
|
68
|
+
yield _process_doc(
|
|
69
|
+
doc,
|
|
70
|
+
md_make_fn=self._md_make_fn,
|
|
71
|
+
res_path=res_path,
|
|
72
|
+
output_format=output_format,
|
|
73
|
+
output_path=output_path,
|
|
74
|
+
)
|
|
68
75
|
|
|
69
76
|
@classmethod
|
|
70
|
-
def _from_config(
|
|
71
|
-
|
|
77
|
+
def _from_config(
|
|
78
|
+
cls,
|
|
79
|
+
config: MinerUPipelineConfig,
|
|
80
|
+
*,
|
|
81
|
+
device: Device = Device.CPU,
|
|
82
|
+
) -> Self:
|
|
83
|
+
return cls(config.config, language=config.language, device=device)
|
|
72
84
|
|
|
73
85
|
|
|
74
86
|
def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
|
extract_python/utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from collections.abc import Callable, Generator, Iterable, Iterator
|
|
3
3
|
from contextlib import contextmanager
|
|
4
|
+
from copy import copy
|
|
4
5
|
from functools import wraps
|
|
5
6
|
from itertools import tee
|
|
6
7
|
from pathlib import Path, PurePath
|
|
@@ -62,3 +63,13 @@ def chdir(path: Path) -> Generator[None, None, None]:
|
|
|
62
63
|
yield
|
|
63
64
|
finally:
|
|
64
65
|
os.chdir(cwd)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@contextmanager
def reset_env() -> Generator[None, None, None]:
    """Restore ``os.environ`` to its entry-time content when the block exits.

    A snapshot of the environment is taken on entry. On exit, whether the
    body completed or raised, the environment is cleared and repopulated
    from that snapshot, so variables added or changed inside the block are
    discarded.
    """
    current_env = dict(os.environ)
    snapshot = copy(current_env)
    try:
        yield
    finally:
        # Clear first so that variables created inside the block disappear,
        # then put back the values captured on entry.
        os.environ.clear()
        os.environ.update(snapshot)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-python
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.8
|
|
4
4
|
Summary: Structured content extraction
|
|
5
5
|
Project-URL: Homepage, https://github.com/ICIJ/extract-python
|
|
6
6
|
Project-URL: Repository, https://github.com/ICIJ/extract-python
|
|
7
7
|
Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
|
|
8
8
|
Author-email: Clément Doumouro <cdoumouro@icij.org>
|
|
9
9
|
Requires-Python: <3.14,>=3.11
|
|
10
|
-
Requires-Dist: extract-core~=0.
|
|
10
|
+
Requires-Dist: extract-core~=0.5.5
|
|
11
11
|
Requires-Dist: icij-common~=0.8.2
|
|
12
12
|
Provides-Extra: benches
|
|
13
13
|
Requires-Dist: html2image~=2.0.7; extra == 'benches'
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
|
|
2
|
+
extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
|
|
3
|
+
extract_python/docling_.py,sha256=5q3JGJa8Jg8T8WL8MvufFkyILbD3G5mqtkV7HQed9zI,7632
|
|
4
|
+
extract_python/marker_.py,sha256=CIqRvHbdm6Itb7RiOVZWBDx5IvXzBGHjmHsWQFXIMyw,5331
|
|
5
|
+
extract_python/miner_u.py,sha256=5_KhSq5weRy4AmFrt0OZEDhVsnxzHN9TtM3L68mk-8I,6174
|
|
6
|
+
extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
|
|
7
|
+
extract_python-0.5.8.dist-info/METADATA,sha256=GbJOVU_ytNS46Z0M49YR_d7OVxQW6tP1SYSwl_LUr9E,1218
|
|
8
|
+
extract_python-0.5.8.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
extract_python-0.5.8.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
|
|
2
|
-
extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
|
|
3
|
-
extract_python/docling_.py,sha256=C4WP1AJrvS2n-KytlGc_1CShjdTGM077I6b9tvw4NhY,4727
|
|
4
|
-
extract_python/marker_.py,sha256=mLJA1m9G4JQtBs1wz8rmshdbaH81DhIwkRzDKZPJH8A,5058
|
|
5
|
-
extract_python/miner_u.py,sha256=jjHqHx7-2w0LSxYNcjvgWoLDTXsv_y1eeyteSfXqjk4,5771
|
|
6
|
-
extract_python/utils.py,sha256=NiYf65iCF7QO4loh7u4t38Ww3eVJUdBpWStL4eX_DqE,1781
|
|
7
|
-
extract_python-0.5.5.dist-info/METADATA,sha256=iENRXysGcLOtZ3mJNPpGwHiixhkYJYaYQ-sj8j26q2o,1216
|
|
8
|
-
extract_python-0.5.5.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
-
extract_python-0.5.5.dist-info/RECORD,,
|
|
File without changes
|