extract-python 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,18 +3,8 @@ import tempfile
3
3
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Annotated, ClassVar, TypeVar
7
-
8
- from docling.datamodel.base_models import FormatToExtensions, InputFormat
9
- from docling.datamodel.document import ConversionResult
10
- from docling.datamodel.pipeline_options import (
11
- EasyOcrOptions,
12
- PdfPipelineOptions,
13
- PipelineOptions,
14
- )
15
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
16
- from docling_core.types.doc import ImageRefMode
17
- from docling_core.types.io import DocumentStream
6
+ from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
7
+
18
8
  from icij_common.registrable import FromConfig
19
9
  from pydantic import AfterValidator, Field
20
10
 
@@ -34,28 +24,45 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
34
24
 
35
25
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
36
26
 
27
+ if TYPE_CHECKING:
28
+ from docling.datamodel.base_models import InputFormat
29
+ from docling.datamodel.pipeline_options import PipelineOptions
30
+ from docling.document_converter import ConversionResult, FormatOption
31
+ from docling_core.types.io import DocumentStream
32
+
33
+
34
def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
    """Reject PDF pipeline options that have picture-image generation turned off.

    Options that are not ``PdfPipelineOptions`` instances are accepted unchanged.

    Raises:
        ValueError: if ``opts`` is a ``PdfPipelineOptions`` whose
            ``generate_picture_images`` attribute is falsy.
    """
    # Function-scope import: docling is only loaded once validation actually runs.
    from docling.datamodel.pipeline_options import PdfPipelineOptions

    if not isinstance(opts, PdfPipelineOptions):
        return
    if opts.generate_picture_images:
        return
    raise ValueError("generate_picture_images should be set to true")
42
40
 
43
41
 
44
42
  def _validate_options(
45
- data: dict[InputFormat, FormatOption],
46
- ) -> dict[InputFormat, FormatOption]:
43
+ data: dict["InputFormat", "FormatOption"],
44
+ ) -> dict["InputFormat", "FormatOption"]:
47
45
  for opts in data.values():
48
46
  _validate_pipeline_opts(opts.pipeline_options)
49
47
  return data
50
48
 
51
49
 
52
- _DEFAULT_FORMAT_OPTS = {
53
- InputFormat.PDF: PdfFormatOption(
54
- pipeline_options=PdfPipelineOptions(
55
- ocr_options=EasyOcrOptions(), generate_picture_images=True
56
- )
57
- ),
58
- }
50
@cache
def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
    """Build the default docling format options.

    The default maps ``InputFormat.PDF`` to a ``PdfFormatOption`` whose pipeline
    uses ``EasyOcrOptions`` for OCR and has ``generate_picture_images`` enabled.

    Because of ``@cache`` the dict is built once and every caller receives the
    same dict object; copy it before mutating.
    """
    # All docling names are imported here rather than at module level.
    # ``InputFormat`` in particular is only imported under ``TYPE_CHECKING`` at
    # module scope, so it must be imported locally to exist at runtime
    # (otherwise ``InputFormat.PDF`` below raises NameError).
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        EasyOcrOptions,
        PdfPipelineOptions,
    )
    from docling.document_converter import PdfFormatOption

    pdf_pipeline_options = PdfPipelineOptions(
        ocr_options=EasyOcrOptions(), generate_picture_images=True
    )
    return {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }
65
+
59
66
 
60
67
  T = TypeVar("T")
61
68
 
@@ -74,7 +81,7 @@ class DoclingPipelineConfig(PipelineConfig):
74
81
 
75
82
  format_options: Annotated[
76
83
  dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
77
- ] = _DEFAULT_FORMAT_OPTS
84
+ ] = Field(default_factory=_default_format_opts)
78
85
 
79
86
  _unsupported_input_formats: ClassVar[set[InputFormat]] = {
80
87
  InputFormat.AUDIO,
@@ -85,6 +92,8 @@ class DoclingPipelineConfig(PipelineConfig):
85
92
  @classmethod
86
93
  @cache
87
94
  def supported_exts(cls) -> set[SupportedExt]:
95
+ from docling.datamodel.base_models import FormatToExtensions, InputFormat
96
+
88
97
  supported = set()
89
98
  for f in InputFormat:
90
99
  if f in cls._unsupported_input_formats:
@@ -97,6 +106,8 @@ class DoclingPipelineConfig(PipelineConfig):
97
106
  @Pipeline.register(PipelineType.DOCLING)
98
107
  class DoclingPipeline(Pipeline):
99
108
  def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
109
+ from docling.document_converter import DocumentConverter
110
+
100
111
  allowed_format = [
101
112
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
102
113
  ]
@@ -117,13 +128,13 @@ class DoclingPipeline(Pipeline):
117
128
  return cls(config.format_options)
118
129
 
119
130
 
120
- def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
131
def _to_docling(docs: Iterable[InputDoc]) -> "Iterator[Path | DocumentStream]":
    """Lazily yield the ``to_docling()`` conversion of each document, in order.

    The return annotation is quoted as a whole: ``DocumentStream`` is only
    imported under ``TYPE_CHECKING``, and ``Path | "DocumentStream"`` would be
    evaluated when the function is defined, where ``type | str`` raises
    ``TypeError`` (unless annotation evaluation is postponed).
    """
    for doc in docs:
        yield doc.to_docling()
123
134
 
124
135
 
125
136
  def _to_result(
126
- res: ConversionResult,
137
+ res: "ConversionResult",
127
138
  input_document: InputDoc,
128
139
  output_format: OutputFormat,
129
140
  output_path: Path,
@@ -144,11 +155,13 @@ def _to_result(
144
155
 
145
156
 
146
157
  def _to_markdown_doc(
147
- res: ConversionResult,
158
+ res: "ConversionResult",
148
159
  output_path: Path,
149
160
  page_sep: str = DEFAULT_MD_PAGE_SEP,
150
161
  **kwargs,
151
162
  ) -> MarkdownDoc:
163
+ from docling_core.types.doc import ImageRefMode
164
+
152
165
  # TODO: Should we add a hash to avoid collision between files with same names
153
166
  # nested in the tree structured
154
167
  md_dir_name = path_to_artifacts_dirname(res.input.file)
extract_python/marker_.py CHANGED
@@ -3,14 +3,8 @@ from collections.abc import AsyncGenerator, Iterable
3
3
  from copy import deepcopy
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Any, ClassVar, Self
7
-
8
- from marker.config.parser import ConfigParser
9
- from marker.converters.pdf import PdfConverter
10
- from marker.models import create_model_dict
11
- from marker.output import text_from_rendered
12
- from marker.renderers.markdown import MarkdownRenderer
13
- from PIL.Image import Image
6
+ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
+
14
8
  from pydantic import Field
15
9
 
16
10
  from .constants import ARTIFACTS, CPU_GROUP
@@ -26,6 +20,10 @@ from .objects import (
26
20
  from .pipeline import Pipeline, PipelineConfig, PipelineType
27
21
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
28
22
 
23
if TYPE_CHECKING:
    # Names needed only for quoted annotations; never imported at runtime.
    from marker.converters.pdf import PdfConverter

    # Import the Image *class*, not the ``PIL.Image`` module, so that the
    # ``dict[str, "Image"]`` annotation in this file refers to image objects.
    from PIL.Image import Image
26
+
29
27
 
30
28
  @PipelineConfig.register()
31
29
  class MarkerPipelineConfig(PipelineConfig):
@@ -75,6 +73,10 @@ class MarkerPipeline(Pipeline):
75
73
  async def extract_content(
76
74
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
77
75
  ) -> AsyncGenerator[Result, None]:
76
+ from marker.config.parser import ConfigParser
77
+ from marker.converters.pdf import PdfConverter
78
+ from marker.models import create_model_dict
79
+
78
80
  config = deepcopy(self._marker_config)
79
81
  config["output_format"] = output_format.to_marker()
80
82
  config_parser = ConfigParser(config)
@@ -96,10 +98,12 @@ class MarkerPipeline(Pipeline):
96
98
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
97
99
  def _process_doc(
98
100
  doc: InputDoc,
99
- converter: PdfConverter,
101
+ converter: "PdfConverter",
100
102
  output_format: OutputFormat,
101
103
  output_path: Path,
102
104
  ) -> Result:
105
+ from marker.output import text_from_rendered
106
+
103
107
  rendered = converter(str(doc.path))
104
108
  content, _, images = text_from_rendered(rendered)
105
109
  match output_format:
@@ -112,8 +116,10 @@ def _process_doc(
112
116
 
113
117
 
114
118
  def _to_markdown_doc(
115
- input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
119
+ input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
116
120
  ) -> MarkdownDoc:
121
+ from marker.renderers.markdown import MarkdownRenderer
122
+
117
123
  # TODO: Should we add a hash to avoid collision between files with same names
118
124
  # nested in the tree structured
119
125
  md_dir_name = path_to_artifacts_dirname(input_doc.path)
extract_python/miner_u.py CHANGED
@@ -8,12 +8,6 @@ from pathlib import Path
8
8
  from tempfile import TemporaryDirectory
9
9
  from typing import Any, ClassVar, Self
10
10
 
11
- from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
12
- union_make as pipeline_union_make,
13
- )
14
- from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
15
- from mineru.cli.common import aio_do_parse
16
- from mineru.utils.enum_class import MakeMode
17
11
  from pydantic import Field
18
12
  from pydantic_extra_types.language_code import LanguageAlpha2
19
13
 
@@ -47,33 +41,39 @@ class MinerUConfig(BaseModel):
47
41
  # TODO: use enum or literal here
48
42
  parse_method: str = "auto"
49
43
 
50
- default_kwargs: ClassVar[dict] = {
51
- "server_url": None,
52
- # We don't dump md directly we process, we dump the middle json in order to be
53
- # able to get page indexes
54
- "parse_method": "auto",
55
- "dump_md": False,
56
- "dump_middle_json": True,
57
- "f_draw_layout_bbox": False,
58
- "f_draw_span_bbox": False,
59
- "f_dump_model_output": False, # might be useful for debug though
60
- "f_dump_orig_pdf": False,
61
- "f_dump_content_list": False, # might be useful for debug though
62
- "start_page_id": 0,
63
- "f_make_md_mode": MakeMode.MM_MD,
64
- "image_analysis": True,
65
- "end_page_id": None,
66
- "client_side_output_generation": False,
67
- }
68
-
69
44
def as_parse_kwargs(self) -> dict[str, Any]:
    """Return the default parse kwargs overridden with this config's settings.

    The result is a new dict on every call: the defaults returned by
    ``_get_default_kwargs()`` are unpacked into it and are never mutated.
    """
    return {
        **self._get_default_kwargs(),
        "backend": self.backend,
        "parse_method": self.parse_method,
        "formula_enable": self.enable_formula_extraction,
        "table_enable": self.enable_table_extraction,
    }
76
51
 
52
@classmethod
@cache
def _get_default_kwargs(cls) -> dict[str, Any]:
    """Return the baseline keyword arguments for a mineru parse call.

    The result is cached, so repeated calls return the same dict object;
    callers should copy it before changing any entry.
    """
    # Function-scope import: mineru is only loaded when the defaults are
    # first requested.
    from mineru.utils.enum_class import MakeMode

    defaults: dict[str, Any] = {
        "server_url": None,
        # Markdown is not dumped directly: the middle json is dumped instead so
        # that page indexes can be recovered.
        "parse_method": "auto",
        "dump_md": False,
        "dump_middle_json": True,
        "f_draw_layout_bbox": False,
        "f_draw_span_bbox": False,
        # The two dumps below are off, though they might be useful for debug.
        "f_dump_model_output": False,
        "f_dump_orig_pdf": False,
        "f_dump_content_list": False,
        "start_page_id": 0,
        "f_make_md_mode": MakeMode.MM_MD,
        "image_analysis": True,
        "end_page_id": None,
        "client_side_output_generation": False,
    }
    return defaults
76
+
77
77
 
78
78
  @PipelineConfig.register() # noqa: F821
79
79
  class MinerUPipelineConfig(PipelineConfig): # noqa: F821
@@ -104,6 +104,8 @@ class MinerUPipeline(Pipeline):
104
104
  async def extract_content(
105
105
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
106
106
  ) -> AsyncGenerator[Result, None]:
107
+ from mineru.cli.common import aio_do_parse
108
+
107
109
  docs = list(docs)
108
110
  # TODO: exclude files which are not pdf and return an error
109
111
  pdfs_bytes = [d.path.read_bytes() for d in docs]
@@ -149,11 +151,18 @@ def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
149
151
 
150
152
 
151
153
def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
    """Return the mineru ``union_make`` function that matches ``backend``.

    Raises:
        ValueError: if ``backend`` is neither ``MinerUBackend.PIPELINE`` nor
            ``MinerUBackend.VLM``.
    """
    # Each mineru backend module is imported only when that backend is selected.
    if backend == MinerUBackend.PIPELINE:
        from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
            union_make as make_fn,
        )
    elif backend == MinerUBackend.VLM:
        from mineru.backend.vlm.vlm_middle_json_mkcontent import (
            union_make as make_fn,
        )
    else:
        raise ValueError(f"Unsupported backend: {backend}")
    return make_fn
159
168
 
@@ -201,8 +210,12 @@ def _dump_md_content(
201
210
  output_path: Path,
202
211
  md_path: Path,
203
212
  im_dir: Path,
204
- md_make_mode: str = MakeMode.MM_MD,
213
+ md_make_mode: str | None = None,
205
214
  ) -> ConversionOutput:
215
+ from mineru.utils.enum_class import MakeMode
216
+
217
+ if md_make_mode is None:
218
+ md_make_mode = MakeMode.MM_MD
206
219
  total_length = 0
207
220
  end_indices = []
208
221
  with md_path.open("w") as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=lWWQ2PT5qOFUcJkeKw8ibF4JxzxQBgf93_CfvNcykDg,7041
4
+ extract_python/marker_.py,sha256=ocRFxWX__A-M31z7Qr67OMcWRvgGO_C3tyZpiKc-bXw,5027
5
+ extract_python/miner_u.py,sha256=hwRFTvtWGN_mRuv0p6H7nKS89dTErQxI1yOrvh6238M,8010
6
+ extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
+ extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.3.1.dist-info/METADATA,sha256=qtfZpwEIKgWzkfbxGYMVP-pNFMFAbLrZo1-hmDXcgvE,1132
10
+ extract_python-0.3.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.3.1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=00g7RV33iftjvMLoKaEs2lUZP3LslSCKMpGeSys1Suc,6616
4
- extract_python/marker_.py,sha256=3Q8H-TeM2_GenB6OOqIuytqgI1VE93Ek99_kW0cJHEw,4905
5
- extract_python/miner_u.py,sha256=WdaftyINZdnALqSuu1qKaZJKKyIHVRn-wBke-Na78O0,7747
6
- extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
- extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.3.0.dist-info/METADATA,sha256=MwFN5PsmkUEv8sbhuS6joh7r17W6rrikpJb4Yr-rdKk,1132
10
- extract_python-0.3.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.3.0.dist-info/RECORD,,