extract-python 0.5.5__tar.gz → 0.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.5.5
3
+ Version: 0.5.8
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
9
  Requires-Python: <3.14,>=3.11
10
- Requires-Dist: extract-core~=0.1
10
+ Requires-Dist: extract-core~=0.5.5
11
11
  Requires-Dist: icij-common~=0.8.2
12
12
  Provides-Extra: benches
13
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -1,16 +1,20 @@
1
+ import asyncio
1
2
  import shutil
2
3
  import tempfile
3
4
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
5
  from pathlib import Path
6
+ from typing import Any, Self
5
7
 
6
8
  from docling.datamodel.base_models import InputFormat
7
9
  from docling.datamodel.document import ConversionResult
8
- from docling.document_converter import DocumentConverter
10
+ from docling.datamodel.pipeline_options import PipelineOptions
11
+ from docling.document_converter import DocumentConverter, FormatOption
9
12
 
10
13
  # TODO: this import is slow to load, improve it
11
14
  from docling_core.types.doc import ImageRefMode
12
15
  from docling_core.types.io import DocumentStream
13
16
  from extract_core import (
17
+ BaseModel,
14
18
  DoclingFormatOption,
15
19
  DoclingPipelineConfig,
16
20
  Error,
@@ -23,7 +27,11 @@ from extract_core import (
23
27
  Result,
24
28
  Status,
25
29
  )
30
+ from extract_core.objects import Device
31
+ from icij_common.pydantic_utils import merge_configs
26
32
  from icij_common.registrable import FromConfig
33
+ from pydantic import ConfigDict, field_serializer
34
+ from pydantic_core.core_schema import SerializerFunctionWrapHandler
27
35
 
28
36
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
29
37
  from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
@@ -34,9 +42,15 @@ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "mode
34
42
  @Pipeline.register(PipelineType.DOCLING)
35
43
  class DoclingPipeline(Pipeline):
36
44
  def __init__(
37
- self, format_options: dict["InputFormat", DoclingFormatOption] | None = None
45
+ self,
46
+ format_options: dict["InputFormat", DoclingFormatOption] | None = None,
47
+ *,
48
+ device: Device = Device.CPU,
38
49
  ):
39
- format_options = {k: v.to_docling() for k, v in format_options.items()}
50
+ super().__init__(device)
51
+ format_options = {
52
+ k: v.to_docling(self._device) for k, v in format_options.items()
53
+ }
40
54
  allowed_format = [
41
55
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
42
56
  ]
@@ -49,12 +63,23 @@ class DoclingPipeline(Pipeline):
49
63
  ) -> AsyncGenerator[Result, None]:
50
64
  docs, path_or_streams = map_and_preserve(_to_docling, docs)
51
65
  outputs = self._converter.convert_all(path_or_streams, raises_on_error=False)
52
- for doc, res in zip(docs, outputs, strict=True):
66
+
67
+ sentinel = object()
68
+ while True:
69
+ res = await asyncio.to_thread(next, outputs, sentinel)
70
+ if res is sentinel:
71
+ return
72
+ doc = next(docs)
53
73
  yield _to_result(res, doc, output_format, output_path=output_path)
54
74
 
55
75
  @classmethod
56
- def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
57
- return cls(config.format_options)
76
+ def _from_config(
77
+ cls,
78
+ config: DoclingPipelineConfig,
79
+ *,
80
+ device: Device = Device.CPU,
81
+ ) -> FromConfig:
82
+ return cls(config.format_options, device=device)
58
83
 
59
84
 
60
85
  def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
@@ -128,3 +153,52 @@ def _to_markdown_doc(
128
153
  shutil.move(tmp_dir, md_dir)
129
154
  pages = PageIndexes.from_page_end_indices(end_indices)
130
155
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
156
+
157
+
158
class SerializableFormatOptions(DoclingFormatOption):
    # Utility class to serialize Python format options into a JSON which can be
    # correctly deserialized into a docling FormatOption
    # via DoclingFormatOption.to_docling
    model_config = merge_configs(
        BaseModel.model_config, ConfigDict(polymorphic_serialization=True)
    )

    # Typed with the docling base class; the wrap serializer below adds the
    # "kind" discriminator of each nested options object to the dump.
    pipeline_options: PipelineOptions | None = None

    @classmethod
    def from_docling(cls, format_opts: FormatOption) -> Self:
        """Build a serializable instance from a docling ``FormatOption``.

        The pipeline and backend classes are recorded by their ``__name__``;
        the pipeline and backend options objects are passed through as-is.
        """
        return cls(
            pipeline_cls=format_opts.pipeline_cls.__name__,
            pipeline_options=format_opts.pipeline_options,
            backend=format_opts.backend.__name__,
            backend_options=format_opts.backend_options,
        )

    @field_serializer("pipeline_options", mode="wrap")
    def _serialize_pipeline_opts(
        self, v: PipelineOptions | None, handler: SerializerFunctionWrapHandler
    ) -> Any:
        """Serialize ``pipeline_options`` and inject nested ``kind`` values.

        The default serialization is produced by ``handler``. For every nested
        options attribute that is present on ``v`` (and not ``None``), its
        ``kind`` attribute is written under the matching key of the serialized
        mapping, creating that sub-mapping when the handler did not emit it.
        """
        serialized = handler(v)
        if v is None:
            return serialized
        # One loop instead of four copy-pasted stanzas: the attribute name read
        # from ``v`` and the key written into ``serialized`` are always the
        # same string, which is what the previous "layout_opts" typo violated
        # (it raised KeyError whenever layout options were set).
        nested_option_fields = (
            "picture_description_options",
            "ocr_options",
            "layout_options",
            "table_structure_options",
        )
        for field_name in nested_option_fields:
            nested_opts = getattr(v, field_name, None)
            if nested_opts is None:
                continue
            serialized.setdefault(field_name, {})["kind"] = nested_opts.kind
        return serialized
@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  import gc
2
3
  from collections.abc import AsyncGenerator, Iterable
3
4
  from copy import deepcopy
@@ -7,6 +8,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
8
 
8
9
  from extract_core import BasePipelineConfig, Pipeline, PipelineType
9
10
  from extract_core.objects import (
11
+ Device,
10
12
  InputDoc,
11
13
  MarkdownDoc,
12
14
  OutputFormat,
@@ -63,7 +65,13 @@ _MARKER_CONVERSION_ERRORS = tuple()
63
65
 
64
66
  @Pipeline.register(PipelineType.MARKER)
65
67
  class MarkerPipeline(Pipeline):
66
- def __init__(self, marker_config: dict[str, Any] | None = None):
68
+ def __init__(
69
+ self,
70
+ marker_config: dict[str, Any] | None = None,
71
+ *,
72
+ device: Device = Device.CPU,
73
+ ):
74
+ super().__init__(device)
67
75
  if marker_config is None:
68
76
  marker_config = dict()
69
77
  self._marker_config = marker_config
@@ -81,20 +89,25 @@ class MarkerPipeline(Pipeline):
81
89
  renderer = config_parser.get_renderer()
82
90
  converter = PdfConverter(
83
91
  config=config_parser.generate_config_dict(),
84
- artifact_dict=create_model_dict(),
92
+ artifact_dict=create_model_dict(device=self._device),
85
93
  processor_list=config_parser.get_processors(),
86
94
  renderer=renderer,
87
95
  )
88
96
  for doc in docs:
89
- yield _process_doc(doc, converter, output_format, output_path)
97
+ yield await _process_doc(doc, converter, output_format, output_path)
90
98
 
91
99
  @classmethod
92
- def _from_config(cls, config: MarkerPipelineConfig) -> Self:
93
- return cls(config.config)
100
+ def _from_config(
101
+ cls,
102
+ config: MarkerPipelineConfig,
103
+ *,
104
+ device: Device = Device.CPU,
105
+ ) -> Self:
106
+ return cls(config.config, device=device)
94
107
 
95
108
 
96
109
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
97
- def _process_doc(
110
+ async def _process_doc(
98
111
  doc: InputDoc,
99
112
  converter: "PdfConverter",
100
113
  output_format: OutputFormat,
@@ -102,7 +115,7 @@ def _process_doc(
102
115
  ) -> Result:
103
116
  from marker.output import text_from_rendered # noqa: PLC0415
104
117
 
105
- rendered = converter(str(doc.path))
118
+ rendered = await asyncio.to_thread(converter, str(doc.path))
106
119
  content, _, images = text_from_rendered(rendered)
107
120
  match output_format:
108
121
  case OutputFormat.MARKDOWN:
@@ -1,4 +1,5 @@
1
1
  import json
2
+ import os
2
3
  import shutil
3
4
  from collections.abc import AsyncGenerator, Callable, Iterable
4
5
  from functools import partial
@@ -19,9 +20,10 @@ from extract_core import (
19
20
  Result,
20
21
  Status,
21
22
  )
23
+ from objects import Device
22
24
 
23
25
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
24
- from .utils import path_to_artifacts_dirname
26
+ from .utils import path_to_artifacts_dirname, reset_env
25
27
 
26
28
  _MINER_U_CONVERSION_ERRORS = tuple()
27
29
  MDMakeFunction = Callable[[list, str, str], str | None]
@@ -29,7 +31,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
29
31
 
30
32
  @Pipeline.register(PipelineType.MINER_U)
31
33
  class MinerUPipeline(Pipeline):
32
- def __init__(self, config: MinerUConfig, language: str):
34
+ def __init__(
35
+ self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
36
+ ):
37
+ super().__init__(device)
33
38
  self._config = config
34
39
  self._language = language
35
40
  self._md_make_fn = _parse_md_make_fn(config.backend)
@@ -39,36 +44,43 @@ class MinerUPipeline(Pipeline):
39
44
  ) -> AsyncGenerator[Result, None]:
40
45
  from mineru.cli.common import aio_do_parse # noqa: PLC0415
41
46
 
42
- docs = list(docs)
43
- # TODO: exclude files which are not pdf and return an error
44
- pdfs_bytes = [d.path.read_bytes() for d in docs]
45
- pdfs_names = [d.path.name for d in docs]
46
- p_lang_list = [self._language for _ in pdfs_names]
47
- # TODO: we should only process valid PDFs
48
- with TemporaryDirectory(prefix="mineru-") as workdir:
49
- workdir = Path(workdir) # noqa: PLW2901
50
- await aio_do_parse(
51
- output_dir=workdir,
52
- pdf_file_names=pdfs_names,
53
- pdf_bytes_list=pdfs_bytes,
54
- p_lang_list=p_lang_list,
55
- **self._config.as_parse_kwargs(),
56
- )
57
- res_paths = [
58
- _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
59
- ]
60
- for doc, res_path in zip(docs, res_paths, strict=True):
61
- yield _process_doc(
62
- doc,
63
- md_make_fn=self._md_make_fn,
64
- res_path=res_path,
65
- output_format=output_format,
66
- output_path=output_path,
47
+ with reset_env():
48
+ os.environ["MINERU_DEVICE_MODE"] = self._device
49
+ docs = list(docs)
50
+ # TODO: exclude files which are not pdf and return an error
51
+ pdfs_bytes = [d.path.read_bytes() for d in docs]
52
+ pdfs_names = [d.path.name for d in docs]
53
+ p_lang_list = [self._language for _ in pdfs_names]
54
+ # TODO: we should only process valid PDFs
55
+ with TemporaryDirectory(prefix="mineru-") as workdir:
56
+ workdir = Path(workdir) # noqa: PLW2901
57
+ await aio_do_parse(
58
+ output_dir=workdir,
59
+ pdf_file_names=pdfs_names,
60
+ pdf_bytes_list=pdfs_bytes,
61
+ p_lang_list=p_lang_list,
62
+ **self._config.as_parse_kwargs(),
67
63
  )
64
+ res_paths = [
65
+ _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
66
+ ]
67
+ for doc, res_path in zip(docs, res_paths, strict=True):
68
+ yield _process_doc(
69
+ doc,
70
+ md_make_fn=self._md_make_fn,
71
+ res_path=res_path,
72
+ output_format=output_format,
73
+ output_path=output_path,
74
+ )
68
75
 
69
76
  @classmethod
70
- def _from_config(cls, config: MinerUPipelineConfig) -> Self:
71
- return cls(config.config, language=config.language)
77
+ def _from_config(
78
+ cls,
79
+ config: MinerUPipelineConfig,
80
+ *,
81
+ device: Device = Device.CPU,
82
+ ) -> Self:
83
+ return cls(config.config, language=config.language, device=device)
72
84
 
73
85
 
74
86
  def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
@@ -1,6 +1,7 @@
1
1
  import os
2
2
  from collections.abc import Callable, Generator, Iterable, Iterator
3
3
  from contextlib import contextmanager
4
+ from copy import copy
4
5
  from functools import wraps
5
6
  from itertools import tee
6
7
  from pathlib import Path, PurePath
@@ -62,3 +63,13 @@ def chdir(path: Path) -> Generator[None, None, None]:
62
63
  yield
63
64
  finally:
64
65
  os.chdir(cwd)
66
+
67
+
68
@contextmanager
def reset_env() -> Generator[None, None, None]:
    """Restore ``os.environ`` to its entry state when the block exits.

    A snapshot of the environment is taken on entry. On exit (normal or via
    an exception) variables added inside the block are removed, and variables
    that were modified or deleted get their original value back.
    """
    # dict() already yields an independent shallow copy of the mapping.
    snapshot = dict(os.environ)
    try:
        yield
    finally:
        # Restore key by key instead of clear() + update(): the end state is
        # the same, but the environment is never left empty in between and
        # untouched variables are not rewritten.
        for added_key in os.environ.keys() - snapshot.keys():
            del os.environ[added_key]
        for key, value in snapshot.items():
            if os.environ.get(key) != value:
                os.environ[key] = value
@@ -9,7 +9,7 @@ readme = "README.md"
9
9
  requires-python = ">=3.11,<3.14"
10
10
  dependencies = [
11
11
  "icij-common~=0.8.2",
12
- "extract-core~=0.1",
12
+ "extract-core~=0.5.5",
13
13
  ]
14
14
 
15
15
  [project.optional-dependencies]
@@ -871,12 +871,14 @@ requires-dist = [
871
871
  [package.metadata.requires-dev]
872
872
  dev = [
873
873
  { name = "docling", specifier = "~=2.96" },
874
- { name = "huggingface-hub", specifier = "~=0.36.2" },
874
+ { name = "docling-slim", extras = ["models-vlm-inline"], specifier = "~=2.96" },
875
875
  { name = "mpmath", specifier = "~=1.3.0" },
876
876
  { name = "pytest", specifier = "~=8.3.5" },
877
877
  { name = "pytest-asyncio", specifier = "~=0.25.3" },
878
878
  { name = "ruff", specifier = "==0.15.2" },
879
879
  { name = "sympy", specifier = "~=1.14.0" },
880
+ { name = "torch", specifier = "==2.12.0", index = "https://download.pytorch.org/whl/cpu" },
881
+ { name = "torchvision", specifier = "==0.27.0", index = "https://download.pytorch.org/whl/cpu" },
880
882
  ]
881
883
 
882
884
  [[package]]
File without changes