extract-python 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,7 @@ from extract_core import (
27
27
  Result,
28
28
  Status,
29
29
  )
30
+ from extract_core.objects import Device
30
31
  from icij_common.pydantic_utils import merge_configs
31
32
  from icij_common.registrable import FromConfig
32
33
  from pydantic import ConfigDict, field_serializer
@@ -41,9 +42,15 @@ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "mode
41
42
  @Pipeline.register(PipelineType.DOCLING)
42
43
  class DoclingPipeline(Pipeline):
43
44
  def __init__(
44
- self, format_options: dict["InputFormat", DoclingFormatOption] | None = None
45
+ self,
46
+ format_options: dict["InputFormat", DoclingFormatOption] | None = None,
47
+ *,
48
+ device: Device = Device.CPU,
45
49
  ):
46
- format_options = {k: v.to_docling() for k, v in format_options.items()}
50
+ super().__init__(device)
51
+ format_options = {
52
+ k: v.to_docling(self._device) for k, v in format_options.items()
53
+ }
47
54
  allowed_format = [
48
55
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
49
56
  ]
@@ -66,8 +73,13 @@ class DoclingPipeline(Pipeline):
66
73
  yield _to_result(res, doc, output_format, output_path=output_path)
67
74
 
68
75
  @classmethod
69
- def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
70
- return cls(config.format_options)
76
+ def _from_config(
77
+ cls,
78
+ config: DoclingPipelineConfig,
79
+ *,
80
+ device: Device = Device.CPU,
81
+ ) -> FromConfig:
82
+ return cls(config.format_options, device=device)
71
83
 
72
84
 
73
85
  def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
extract_python/marker_.py CHANGED
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
8
8
 
9
9
  from extract_core import BasePipelineConfig, Pipeline, PipelineType
10
10
  from extract_core.objects import (
11
+ Device,
11
12
  InputDoc,
12
13
  MarkdownDoc,
13
14
  OutputFormat,
@@ -64,7 +65,13 @@ _MARKER_CONVERSION_ERRORS = tuple()
64
65
 
65
66
  @Pipeline.register(PipelineType.MARKER)
66
67
  class MarkerPipeline(Pipeline):
67
- def __init__(self, marker_config: dict[str, Any] | None = None):
68
+ def __init__(
69
+ self,
70
+ marker_config: dict[str, Any] | None = None,
71
+ *,
72
+ device: Device = Device.CPU,
73
+ ):
74
+ super().__init__(device)
68
75
  if marker_config is None:
69
76
  marker_config = dict()
70
77
  self._marker_config = marker_config
@@ -82,7 +89,7 @@ class MarkerPipeline(Pipeline):
82
89
  renderer = config_parser.get_renderer()
83
90
  converter = PdfConverter(
84
91
  config=config_parser.generate_config_dict(),
85
- artifact_dict=create_model_dict(),
92
+ artifact_dict=create_model_dict(device=self._device),
86
93
  processor_list=config_parser.get_processors(),
87
94
  renderer=renderer,
88
95
  )
@@ -90,8 +97,13 @@ class MarkerPipeline(Pipeline):
90
97
  yield await _process_doc(doc, converter, output_format, output_path)
91
98
 
92
99
  @classmethod
93
- def _from_config(cls, config: MarkerPipelineConfig) -> Self:
94
- return cls(config.config)
100
+ def _from_config(
101
+ cls,
102
+ config: MarkerPipelineConfig,
103
+ *,
104
+ device: Device = Device.CPU,
105
+ ) -> Self:
106
+ return cls(config.config, device=device)
95
107
 
96
108
 
97
109
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
extract_python/miner_u.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import json
2
+ import os
2
3
  import shutil
3
4
  from collections.abc import AsyncGenerator, Callable, Iterable
4
5
  from functools import partial
@@ -19,9 +20,10 @@ from extract_core import (
19
20
  Result,
20
21
  Status,
21
22
  )
23
+ from objects import Device
22
24
 
23
25
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
24
- from .utils import path_to_artifacts_dirname
26
+ from .utils import path_to_artifacts_dirname, reset_env
25
27
 
26
28
  _MINER_U_CONVERSION_ERRORS = tuple()
27
29
  MDMakeFunction = Callable[[list, str, str], str | None]
@@ -29,7 +31,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
29
31
 
30
32
  @Pipeline.register(PipelineType.MINER_U)
31
33
  class MinerUPipeline(Pipeline):
32
- def __init__(self, config: MinerUConfig, language: str):
34
+ def __init__(
35
+ self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
36
+ ):
37
+ super().__init__(device)
33
38
  self._config = config
34
39
  self._language = language
35
40
  self._md_make_fn = _parse_md_make_fn(config.backend)
@@ -39,36 +44,43 @@ class MinerUPipeline(Pipeline):
39
44
  ) -> AsyncGenerator[Result, None]:
40
45
  from mineru.cli.common import aio_do_parse # noqa: PLC0415
41
46
 
42
- docs = list(docs)
43
- # TODO: exclude files which are not pdf and return an error
44
- pdfs_bytes = [d.path.read_bytes() for d in docs]
45
- pdfs_names = [d.path.name for d in docs]
46
- p_lang_list = [self._language for _ in pdfs_names]
47
- # TODO: we should only process valid PDFs
48
- with TemporaryDirectory(prefix="mineru-") as workdir:
49
- workdir = Path(workdir) # noqa: PLW2901
50
- await aio_do_parse(
51
- output_dir=workdir,
52
- pdf_file_names=pdfs_names,
53
- pdf_bytes_list=pdfs_bytes,
54
- p_lang_list=p_lang_list,
55
- **self._config.as_parse_kwargs(),
56
- )
57
- res_paths = [
58
- _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
59
- ]
60
- for doc, res_path in zip(docs, res_paths, strict=True):
61
- yield _process_doc(
62
- doc,
63
- md_make_fn=self._md_make_fn,
64
- res_path=res_path,
65
- output_format=output_format,
66
- output_path=output_path,
47
+ with reset_env():
48
+ os.environ["MINERU_DEVICE_MODE"] = self._device
49
+ docs = list(docs)
50
+ # TODO: exclude files which are not pdf and return an error
51
+ pdfs_bytes = [d.path.read_bytes() for d in docs]
52
+ pdfs_names = [d.path.name for d in docs]
53
+ p_lang_list = [self._language for _ in pdfs_names]
54
+ # TODO: we should only process valid PDFs
55
+ with TemporaryDirectory(prefix="mineru-") as workdir:
56
+ workdir = Path(workdir) # noqa: PLW2901
57
+ await aio_do_parse(
58
+ output_dir=workdir,
59
+ pdf_file_names=pdfs_names,
60
+ pdf_bytes_list=pdfs_bytes,
61
+ p_lang_list=p_lang_list,
62
+ **self._config.as_parse_kwargs(),
67
63
  )
64
+ res_paths = [
65
+ _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
66
+ ]
67
+ for doc, res_path in zip(docs, res_paths, strict=True):
68
+ yield _process_doc(
69
+ doc,
70
+ md_make_fn=self._md_make_fn,
71
+ res_path=res_path,
72
+ output_format=output_format,
73
+ output_path=output_path,
74
+ )
68
75
 
69
76
  @classmethod
70
- def _from_config(cls, config: MinerUPipelineConfig) -> Self:
71
- return cls(config.config, language=config.language)
77
+ def _from_config(
78
+ cls,
79
+ config: MinerUPipelineConfig,
80
+ *,
81
+ device: Device = Device.CPU,
82
+ ) -> Self:
83
+ return cls(config.config, language=config.language, device=device)
72
84
 
73
85
 
74
86
  def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
extract_python/utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import os
2
2
  from collections.abc import Callable, Generator, Iterable, Iterator
3
3
  from contextlib import contextmanager
4
+ from copy import copy
4
5
  from functools import wraps
5
6
  from itertools import tee
6
7
  from pathlib import Path, PurePath
@@ -62,3 +63,13 @@ def chdir(path: Path) -> Generator[None, None, None]:
62
63
  yield
63
64
  finally:
64
65
  os.chdir(cwd)
66
+
67
+
68
+ @contextmanager
69
+ def reset_env() -> Generator[None, None, None]:
70
+ old_env = copy(dict(os.environ))
71
+ try:
72
+ yield
73
+ finally:
74
+ os.environ.clear()
75
+ os.environ.update(old_env)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.5.7
3
+ Version: 0.5.8
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,9 @@
1
+ extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
2
+ extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
3
+ extract_python/docling_.py,sha256=5q3JGJa8Jg8T8WL8MvufFkyILbD3G5mqtkV7HQed9zI,7632
4
+ extract_python/marker_.py,sha256=CIqRvHbdm6Itb7RiOVZWBDx5IvXzBGHjmHsWQFXIMyw,5331
5
+ extract_python/miner_u.py,sha256=5_KhSq5weRy4AmFrt0OZEDhVsnxzHN9TtM3L68mk-8I,6174
6
+ extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
7
+ extract_python-0.5.8.dist-info/METADATA,sha256=GbJOVU_ytNS46Z0M49YR_d7OVxQW6tP1SYSwl_LUr9E,1218
8
+ extract_python-0.5.8.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ extract_python-0.5.8.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
2
- extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
3
- extract_python/docling_.py,sha256=TXXSayirZtA82VM9-2WrGqxTKodX6KyHUDZHURuvd7k,7382
4
- extract_python/marker_.py,sha256=i_zWNISTI--E7Y-71DYQ5TlsTN7I9T5vT-3szPqMkWE,5110
5
- extract_python/miner_u.py,sha256=jjHqHx7-2w0LSxYNcjvgWoLDTXsv_y1eeyteSfXqjk4,5771
6
- extract_python/utils.py,sha256=NiYf65iCF7QO4loh7u4t38Ww3eVJUdBpWStL4eX_DqE,1781
7
- extract_python-0.5.7.dist-info/METADATA,sha256=4jp__6WVmbXwrGGpf7ExGdJWO1-bqcaN5r411xq--OY,1218
8
- extract_python-0.5.7.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
- extract_python-0.5.7.dist-info/RECORD,,