extract-python 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/docling_.py +16 -4
- extract_python/marker_.py +16 -4
- extract_python/miner_u.py +41 -29
- extract_python/utils.py +11 -0
- {extract_python-0.5.7.dist-info → extract_python-0.5.9.dist-info}/METADATA +1 -1
- extract_python-0.5.9.dist-info/RECORD +9 -0
- extract_python-0.5.7.dist-info/RECORD +0 -9
- {extract_python-0.5.7.dist-info → extract_python-0.5.9.dist-info}/WHEEL +0 -0
extract_python/docling_.py
CHANGED
|
@@ -27,6 +27,7 @@ from extract_core import (
|
|
|
27
27
|
Result,
|
|
28
28
|
Status,
|
|
29
29
|
)
|
|
30
|
+
from extract_core.objects import Device
|
|
30
31
|
from icij_common.pydantic_utils import merge_configs
|
|
31
32
|
from icij_common.registrable import FromConfig
|
|
32
33
|
from pydantic import ConfigDict, field_serializer
|
|
@@ -41,9 +42,15 @@ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "mode
|
|
|
41
42
|
@Pipeline.register(PipelineType.DOCLING)
|
|
42
43
|
class DoclingPipeline(Pipeline):
|
|
43
44
|
def __init__(
|
|
44
|
-
self,
|
|
45
|
+
self,
|
|
46
|
+
format_options: dict["InputFormat", DoclingFormatOption] | None = None,
|
|
47
|
+
*,
|
|
48
|
+
device: Device = Device.CPU,
|
|
45
49
|
):
|
|
46
|
-
|
|
50
|
+
super().__init__(device)
|
|
51
|
+
format_options = {
|
|
52
|
+
k: v.to_docling(self._device) for k, v in format_options.items()
|
|
53
|
+
}
|
|
47
54
|
allowed_format = [
|
|
48
55
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
49
56
|
]
|
|
@@ -66,8 +73,13 @@ class DoclingPipeline(Pipeline):
|
|
|
66
73
|
yield _to_result(res, doc, output_format, output_path=output_path)
|
|
67
74
|
|
|
68
75
|
@classmethod
|
|
69
|
-
def _from_config(
|
|
70
|
-
|
|
76
|
+
def _from_config(
|
|
77
|
+
cls,
|
|
78
|
+
config: DoclingPipelineConfig,
|
|
79
|
+
*,
|
|
80
|
+
device: Device = Device.CPU,
|
|
81
|
+
) -> FromConfig:
|
|
82
|
+
return cls(config.format_options, device=device)
|
|
71
83
|
|
|
72
84
|
|
|
73
85
|
def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
extract_python/marker_.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
|
|
|
8
8
|
|
|
9
9
|
from extract_core import BasePipelineConfig, Pipeline, PipelineType
|
|
10
10
|
from extract_core.objects import (
|
|
11
|
+
Device,
|
|
11
12
|
InputDoc,
|
|
12
13
|
MarkdownDoc,
|
|
13
14
|
OutputFormat,
|
|
@@ -64,7 +65,13 @@ _MARKER_CONVERSION_ERRORS = tuple()
|
|
|
64
65
|
|
|
65
66
|
@Pipeline.register(PipelineType.MARKER)
|
|
66
67
|
class MarkerPipeline(Pipeline):
|
|
67
|
-
def __init__(
|
|
68
|
+
def __init__(
|
|
69
|
+
self,
|
|
70
|
+
marker_config: dict[str, Any] | None = None,
|
|
71
|
+
*,
|
|
72
|
+
device: Device = Device.CPU,
|
|
73
|
+
):
|
|
74
|
+
super().__init__(device)
|
|
68
75
|
if marker_config is None:
|
|
69
76
|
marker_config = dict()
|
|
70
77
|
self._marker_config = marker_config
|
|
@@ -82,7 +89,7 @@ class MarkerPipeline(Pipeline):
|
|
|
82
89
|
renderer = config_parser.get_renderer()
|
|
83
90
|
converter = PdfConverter(
|
|
84
91
|
config=config_parser.generate_config_dict(),
|
|
85
|
-
artifact_dict=create_model_dict(),
|
|
92
|
+
artifact_dict=create_model_dict(device=self._device),
|
|
86
93
|
processor_list=config_parser.get_processors(),
|
|
87
94
|
renderer=renderer,
|
|
88
95
|
)
|
|
@@ -90,8 +97,13 @@ class MarkerPipeline(Pipeline):
|
|
|
90
97
|
yield await _process_doc(doc, converter, output_format, output_path)
|
|
91
98
|
|
|
92
99
|
@classmethod
|
|
93
|
-
def _from_config(
|
|
94
|
-
|
|
100
|
+
def _from_config(
|
|
101
|
+
cls,
|
|
102
|
+
config: MarkerPipelineConfig,
|
|
103
|
+
*,
|
|
104
|
+
device: Device = Device.CPU,
|
|
105
|
+
) -> Self:
|
|
106
|
+
return cls(config.config, device=device)
|
|
95
107
|
|
|
96
108
|
|
|
97
109
|
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
|
extract_python/miner_u.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
2
3
|
import shutil
|
|
3
4
|
from collections.abc import AsyncGenerator, Callable, Iterable
|
|
4
5
|
from functools import partial
|
|
@@ -19,9 +20,10 @@ from extract_core import (
|
|
|
19
20
|
Result,
|
|
20
21
|
Status,
|
|
21
22
|
)
|
|
23
|
+
from extract_core.objects import Device
|
|
22
24
|
|
|
23
25
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
24
|
-
from .utils import path_to_artifacts_dirname
|
|
26
|
+
from .utils import path_to_artifacts_dirname, reset_env
|
|
25
27
|
|
|
26
28
|
_MINER_U_CONVERSION_ERRORS = tuple()
|
|
27
29
|
MDMakeFunction = Callable[[list, str, str], str | None]
|
|
@@ -29,7 +31,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
|
|
|
29
31
|
|
|
30
32
|
@Pipeline.register(PipelineType.MINER_U)
|
|
31
33
|
class MinerUPipeline(Pipeline):
|
|
32
|
-
def __init__(
|
|
34
|
+
def __init__(
|
|
35
|
+
self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
|
|
36
|
+
):
|
|
37
|
+
super().__init__(device)
|
|
33
38
|
self._config = config
|
|
34
39
|
self._language = language
|
|
35
40
|
self._md_make_fn = _parse_md_make_fn(config.backend)
|
|
@@ -39,36 +44,43 @@ class MinerUPipeline(Pipeline):
|
|
|
39
44
|
) -> AsyncGenerator[Result, None]:
|
|
40
45
|
from mineru.cli.common import aio_do_parse # noqa: PLC0415
|
|
41
46
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
_revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
|
|
59
|
-
]
|
|
60
|
-
for doc, res_path in zip(docs, res_paths, strict=True):
|
|
61
|
-
yield _process_doc(
|
|
62
|
-
doc,
|
|
63
|
-
md_make_fn=self._md_make_fn,
|
|
64
|
-
res_path=res_path,
|
|
65
|
-
output_format=output_format,
|
|
66
|
-
output_path=output_path,
|
|
47
|
+
with reset_env():
|
|
48
|
+
os.environ["MINERU_DEVICE_MODE"] = self._device
|
|
49
|
+
docs = list(docs)
|
|
50
|
+
# TODO: exclude files which are not pdf and return an error
|
|
51
|
+
pdfs_bytes = [d.path.read_bytes() for d in docs]
|
|
52
|
+
pdfs_names = [d.path.name for d in docs]
|
|
53
|
+
p_lang_list = [self._language for _ in pdfs_names]
|
|
54
|
+
# TODO: we should only process valid PDFs
|
|
55
|
+
with TemporaryDirectory(prefix="mineru-") as workdir:
|
|
56
|
+
workdir = Path(workdir) # noqa: PLW2901
|
|
57
|
+
await aio_do_parse(
|
|
58
|
+
output_dir=workdir,
|
|
59
|
+
pdf_file_names=pdfs_names,
|
|
60
|
+
pdf_bytes_list=pdfs_bytes,
|
|
61
|
+
p_lang_list=p_lang_list,
|
|
62
|
+
**self._config.as_parse_kwargs(),
|
|
67
63
|
)
|
|
64
|
+
res_paths = [
|
|
65
|
+
_revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
|
|
66
|
+
]
|
|
67
|
+
for doc, res_path in zip(docs, res_paths, strict=True):
|
|
68
|
+
yield _process_doc(
|
|
69
|
+
doc,
|
|
70
|
+
md_make_fn=self._md_make_fn,
|
|
71
|
+
res_path=res_path,
|
|
72
|
+
output_format=output_format,
|
|
73
|
+
output_path=output_path,
|
|
74
|
+
)
|
|
68
75
|
|
|
69
76
|
@classmethod
|
|
70
|
-
def _from_config(
|
|
71
|
-
|
|
77
|
+
def _from_config(
|
|
78
|
+
cls,
|
|
79
|
+
config: MinerUPipelineConfig,
|
|
80
|
+
*,
|
|
81
|
+
device: Device = Device.CPU,
|
|
82
|
+
) -> Self:
|
|
83
|
+
return cls(config.config, language=config.language, device=device)
|
|
72
84
|
|
|
73
85
|
|
|
74
86
|
def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
|
extract_python/utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from collections.abc import Callable, Generator, Iterable, Iterator
|
|
3
3
|
from contextlib import contextmanager
|
|
4
|
+
from copy import copy
|
|
4
5
|
from functools import wraps
|
|
5
6
|
from itertools import tee
|
|
6
7
|
from pathlib import Path, PurePath
|
|
@@ -62,3 +63,13 @@ def chdir(path: Path) -> Generator[None, None, None]:
|
|
|
62
63
|
yield
|
|
63
64
|
finally:
|
|
64
65
|
os.chdir(cwd)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@contextmanager
|
|
69
|
+
def reset_env() -> Generator[None, None, None]:
|
|
70
|
+
old_env = copy(dict(os.environ))
|
|
71
|
+
try:
|
|
72
|
+
yield
|
|
73
|
+
finally:
|
|
74
|
+
os.environ.clear()
|
|
75
|
+
os.environ.update(old_env)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
|
|
2
|
+
extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
|
|
3
|
+
extract_python/docling_.py,sha256=5q3JGJa8Jg8T8WL8MvufFkyILbD3G5mqtkV7HQed9zI,7632
|
|
4
|
+
extract_python/marker_.py,sha256=CIqRvHbdm6Itb7RiOVZWBDx5IvXzBGHjmHsWQFXIMyw,5331
|
|
5
|
+
extract_python/miner_u.py,sha256=2s2Pxj0wuW-kx1G9gRtHg_gkoPc-WQ31w8zmf_hr7rk,6187
|
|
6
|
+
extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
|
|
7
|
+
extract_python-0.5.9.dist-info/METADATA,sha256=EIPV5roPYVRFSH1nhW3o3TZ7jkUNgB4t8miEhQykL0Q,1218
|
|
8
|
+
extract_python-0.5.9.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
extract_python-0.5.9.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
|
|
2
|
-
extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
|
|
3
|
-
extract_python/docling_.py,sha256=TXXSayirZtA82VM9-2WrGqxTKodX6KyHUDZHURuvd7k,7382
|
|
4
|
-
extract_python/marker_.py,sha256=i_zWNISTI--E7Y-71DYQ5TlsTN7I9T5vT-3szPqMkWE,5110
|
|
5
|
-
extract_python/miner_u.py,sha256=jjHqHx7-2w0LSxYNcjvgWoLDTXsv_y1eeyteSfXqjk4,5771
|
|
6
|
-
extract_python/utils.py,sha256=NiYf65iCF7QO4loh7u4t38Ww3eVJUdBpWStL4eX_DqE,1781
|
|
7
|
-
extract_python-0.5.7.dist-info/METADATA,sha256=4jp__6WVmbXwrGGpf7ExGdJWO1-bqcaN5r411xq--OY,1218
|
|
8
|
-
extract_python-0.5.7.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
-
extract_python-0.5.7.dist-info/RECORD,,
|
|
File without changes
|