extract-python 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {extract_python-0.3.0 → extract_python-0.3.2}/PKG-INFO +1 -1
  2. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/docling_.py +48 -35
  3. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/marker_.py +16 -10
  4. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/miner_u.py +43 -29
  5. {extract_python-0.3.0 → extract_python-0.3.2}/.dockerignore +0 -0
  6. {extract_python-0.3.0 → extract_python-0.3.2}/.github/workflows/publish.yml +0 -0
  7. {extract_python-0.3.0 → extract_python-0.3.2}/.github/workflows/tests.yml +0 -0
  8. {extract_python-0.3.0 → extract_python-0.3.2}/.gitignore +0 -0
  9. {extract_python-0.3.0 → extract_python-0.3.2}/.python-version +0 -0
  10. {extract_python-0.3.0 → extract_python-0.3.2}/Dockerfile +0 -0
  11. {extract_python-0.3.0 → extract_python-0.3.2}/README.md +0 -0
  12. {extract_python-0.3.0 → extract_python-0.3.2}/benches/__init__.py +0 -0
  13. {extract_python-0.3.0 → extract_python-0.3.2}/benches/compare.ipynb +0 -0
  14. {extract_python-0.3.0 → extract_python-0.3.2}/benches/compare.py +0 -0
  15. {extract_python-0.3.0 → extract_python-0.3.2}/benches/constants.py +0 -0
  16. {extract_python-0.3.0 → extract_python-0.3.2}/data/.gitignore +0 -0
  17. {extract_python-0.3.0 → extract_python-0.3.2}/docker-compose.yml +0 -0
  18. {extract_python-0.3.0 → extract_python-0.3.2}/extract +0 -0
  19. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/__init__.py +0 -0
  20. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/constants.py +0 -0
  21. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/objects.py +0 -0
  22. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/pipeline.py +0 -0
  23. {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/utils.py +0 -0
  24. {extract_python-0.3.0 → extract_python-0.3.2}/pyproject.toml +0 -0
  25. {extract_python-0.3.0 → extract_python-0.3.2}/qa/ruff.toml +0 -0
  26. {extract_python-0.3.0 → extract_python-0.3.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -3,18 +3,8 @@ import tempfile
3
3
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Annotated, ClassVar, TypeVar
7
-
8
- from docling.datamodel.base_models import FormatToExtensions, InputFormat
9
- from docling.datamodel.document import ConversionResult
10
- from docling.datamodel.pipeline_options import (
11
- EasyOcrOptions,
12
- PdfPipelineOptions,
13
- PipelineOptions,
14
- )
15
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
16
- from docling_core.types.doc import ImageRefMode
17
- from docling_core.types.io import DocumentStream
6
+ from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
7
+
18
8
  from icij_common.registrable import FromConfig
19
9
  from pydantic import AfterValidator, Field
20
10
 
@@ -34,28 +24,45 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
34
24
 
35
25
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
36
26
 
27
+ if TYPE_CHECKING:
28
+ from docling.datamodel.base_models import InputFormat
29
+ from docling.datamodel.pipeline_options import PipelineOptions
30
+ from docling.document_converter import ConversionResult, FormatOption
31
+ from docling_core.types.io import DocumentStream
32
+
33
+
34
+ def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
35
+ from docling.datamodel.pipeline_options import PdfPipelineOptions # noqa: PLC0415
37
36
 
38
- def _validate_pipeline_opts(opts: PipelineOptions) -> None:
39
37
  if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
40
38
  msg = "generate_picture_images should be set to true"
41
39
  raise ValueError(msg)
42
40
 
43
41
 
44
42
  def _validate_options(
45
- data: dict[InputFormat, FormatOption],
46
- ) -> dict[InputFormat, FormatOption]:
43
+ data: dict["InputFormat", "FormatOption"],
44
+ ) -> dict["InputFormat", "FormatOption"]:
47
45
  for opts in data.values():
48
46
  _validate_pipeline_opts(opts.pipeline_options)
49
47
  return data
50
48
 
51
49
 
52
- _DEFAULT_FORMAT_OPTS = {
53
- InputFormat.PDF: PdfFormatOption(
54
- pipeline_options=PdfPipelineOptions(
55
- ocr_options=EasyOcrOptions(), generate_picture_images=True
56
- )
57
- ),
58
- }
50
+ @cache
51
+ def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
52
+ from docling.datamodel.pipeline_options import ( # noqa: PLC0415
53
+ EasyOcrOptions,
54
+ PdfPipelineOptions,
55
+ )
56
+ from docling.document_converter import PdfFormatOption # noqa: PLC0415
57
+
58
+ return {
59
+ InputFormat.PDF: PdfFormatOption(
60
+ pipeline_options=PdfPipelineOptions(
61
+ ocr_options=EasyOcrOptions(), generate_picture_images=True
62
+ )
63
+ ),
64
+ }
65
+
59
66
 
60
67
  T = TypeVar("T")
61
68
 
@@ -73,21 +80,21 @@ class DoclingPipelineConfig(PipelineConfig):
73
80
  task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
74
81
 
75
82
  format_options: Annotated[
76
- dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
77
- ] = _DEFAULT_FORMAT_OPTS
78
-
79
- _unsupported_input_formats: ClassVar[set[InputFormat]] = {
80
- InputFormat.AUDIO,
81
- InputFormat.METS_GBS,
82
- InputFormat.VTT,
83
- }
83
+ dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
84
+ ] = Field(default_factory=_default_format_opts)
84
85
 
85
86
  @classmethod
86
87
  @cache
87
88
  def supported_exts(cls) -> set[SupportedExt]:
89
+ from docling.datamodel.base_models import ( # noqa: PLC0415
90
+ FormatToExtensions,
91
+ InputFormat,
92
+ )
93
+
94
+ unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
88
95
  supported = set()
89
96
  for f in InputFormat:
90
- if f in cls._unsupported_input_formats:
97
+ if f in unsupported:
91
98
  continue
92
99
  for ext in FormatToExtensions[f]:
93
100
  supported.add(SupportedExt(f".{ext.lower()}"))
@@ -96,7 +103,11 @@ class DoclingPipelineConfig(PipelineConfig):
96
103
 
97
104
  @Pipeline.register(PipelineType.DOCLING)
98
105
  class DoclingPipeline(Pipeline):
99
- def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
106
+ def __init__(
107
+ self, format_options: dict["InputFormat", "FormatOption"] | None = None
108
+ ):
109
+ from docling.document_converter import DocumentConverter # noqa: PLC0415
110
+
100
111
  allowed_format = [
101
112
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
102
113
  ]
@@ -117,13 +128,13 @@ class DoclingPipeline(Pipeline):
117
128
  return cls(config.format_options)
118
129
 
119
130
 
120
- def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
131
+ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
121
132
  for d in docs:
122
133
  yield d.to_docling()
123
134
 
124
135
 
125
136
  def _to_result(
126
- res: ConversionResult,
137
+ res: "ConversionResult",
127
138
  input_document: InputDoc,
128
139
  output_format: OutputFormat,
129
140
  output_path: Path,
@@ -144,11 +155,13 @@ def _to_result(
144
155
 
145
156
 
146
157
  def _to_markdown_doc(
147
- res: ConversionResult,
158
+ res: "ConversionResult",
148
159
  output_path: Path,
149
160
  page_sep: str = DEFAULT_MD_PAGE_SEP,
150
161
  **kwargs,
151
162
  ) -> MarkdownDoc:
163
+ from docling_core.types.doc import ImageRefMode # noqa: PLC0415
164
+
152
165
  # TODO: Should we add a hash to avoid collision between files with same names
153
166
  # nested in the tree structured
154
167
  md_dir_name = path_to_artifacts_dirname(res.input.file)
@@ -3,14 +3,8 @@ from collections.abc import AsyncGenerator, Iterable
3
3
  from copy import deepcopy
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Any, ClassVar, Self
7
-
8
- from marker.config.parser import ConfigParser
9
- from marker.converters.pdf import PdfConverter
10
- from marker.models import create_model_dict
11
- from marker.output import text_from_rendered
12
- from marker.renderers.markdown import MarkdownRenderer
13
- from PIL.Image import Image
6
+ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
+
14
8
  from pydantic import Field
15
9
 
16
10
  from .constants import ARTIFACTS, CPU_GROUP
@@ -26,6 +20,10 @@ from .objects import (
26
20
  from .pipeline import Pipeline, PipelineConfig, PipelineType
27
21
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
28
22
 
23
+ if TYPE_CHECKING:
24
+ from marker.converters.pdf import PdfConverter
25
+ from PIL import Image
26
+
29
27
 
30
28
  @PipelineConfig.register()
31
29
  class MarkerPipelineConfig(PipelineConfig):
@@ -75,6 +73,10 @@ class MarkerPipeline(Pipeline):
75
73
  async def extract_content(
76
74
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
77
75
  ) -> AsyncGenerator[Result, None]:
76
+ from marker.config.parser import ConfigParser # noqa: PLC0415
77
+ from marker.converters.pdf import PdfConverter # noqa: PLC0415
78
+ from marker.models import create_model_dict # noqa: PLC0415
79
+
78
80
  config = deepcopy(self._marker_config)
79
81
  config["output_format"] = output_format.to_marker()
80
82
  config_parser = ConfigParser(config)
@@ -96,10 +98,12 @@ class MarkerPipeline(Pipeline):
96
98
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
97
99
  def _process_doc(
98
100
  doc: InputDoc,
99
- converter: PdfConverter,
101
+ converter: "PdfConverter",
100
102
  output_format: OutputFormat,
101
103
  output_path: Path,
102
104
  ) -> Result:
105
+ from marker.output import text_from_rendered # noqa: PLC0415
106
+
103
107
  rendered = converter(str(doc.path))
104
108
  content, _, images = text_from_rendered(rendered)
105
109
  match output_format:
@@ -112,8 +116,10 @@ def _process_doc(
112
116
 
113
117
 
114
118
  def _to_markdown_doc(
115
- input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
119
+ input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
116
120
  ) -> MarkdownDoc:
121
+ from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
122
+
117
123
  # TODO: Should we add a hash to avoid collision between files with same names
118
124
  # nested in the tree structured
119
125
  md_dir_name = path_to_artifacts_dirname(input_doc.path)
@@ -8,12 +8,6 @@ from pathlib import Path
8
8
  from tempfile import TemporaryDirectory
9
9
  from typing import Any, ClassVar, Self
10
10
 
11
- from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
12
- union_make as pipeline_union_make,
13
- )
14
- from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
15
- from mineru.cli.common import aio_do_parse
16
- from mineru.utils.enum_class import MakeMode
17
11
  from pydantic import Field
18
12
  from pydantic_extra_types.language_code import LanguageAlpha2
19
13
 
@@ -47,33 +41,38 @@ class MinerUConfig(BaseModel):
47
41
  # TODO: use enum or literal here
48
42
  parse_method: str = "auto"
49
43
 
50
- default_kwargs: ClassVar[dict] = {
51
- "server_url": None,
52
- # We don't dump md directly we process, we dump the middle json in order to be
53
- # able to get page indexes
54
- "parse_method": "auto",
55
- "dump_md": False,
56
- "dump_middle_json": True,
57
- "f_draw_layout_bbox": False,
58
- "f_draw_span_bbox": False,
59
- "f_dump_model_output": False, # might be useful for debug though
60
- "f_dump_orig_pdf": False,
61
- "f_dump_content_list": False, # might be useful for debug though
62
- "start_page_id": 0,
63
- "f_make_md_mode": MakeMode.MM_MD,
64
- "image_analysis": True,
65
- "end_page_id": None,
66
- "client_side_output_generation": False,
67
- }
68
-
69
44
  def as_parse_kwargs(self) -> dict[str, Any]:
70
- kwargs = copy(self.default_kwargs)
45
+ kwargs = copy(self._get_default_kwargs())
71
46
  kwargs["backend"] = self.backend
72
47
  kwargs["parse_method"] = self.parse_method
73
48
  kwargs["formula_enable"] = self.enable_formula_extraction
74
49
  kwargs["table_enable"] = self.enable_table_extraction
75
50
  return kwargs
76
51
 
52
+ @classmethod
53
+ @cache
54
+ def _get_default_kwargs(cls) -> dict[str, Any]:
55
+ from mineru.utils.enum_class import MakeMode # noqa: PLC0415
56
+
57
+ return {
58
+ "server_url": None,
59
+ # We don't dump md directly we process, we dump the middle json in order
60
+ # to be able to get page indexes
61
+ "parse_method": "auto",
62
+ "dump_md": False,
63
+ "dump_middle_json": True,
64
+ "f_draw_layout_bbox": False,
65
+ "f_draw_span_bbox": False,
66
+ "f_dump_model_output": False, # might be useful for debug though
67
+ "f_dump_orig_pdf": False,
68
+ "f_dump_content_list": False, # might be useful for debug though
69
+ "start_page_id": 0,
70
+ "f_make_md_mode": MakeMode.MM_MD,
71
+ "image_analysis": True,
72
+ "end_page_id": None,
73
+ "client_side_output_generation": False,
74
+ }
75
+
77
76
 
78
77
  @PipelineConfig.register() # noqa: F821
79
78
  class MinerUPipelineConfig(PipelineConfig): # noqa: F821
@@ -104,6 +103,8 @@ class MinerUPipeline(Pipeline):
104
103
  async def extract_content(
105
104
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
106
105
  ) -> AsyncGenerator[Result, None]:
106
+ from mineru.cli.common import aio_do_parse # noqa: PLC0415
107
+
107
108
  docs = list(docs)
108
109
  # TODO: exclude files which are not pdf and return an error
109
110
  pdfs_bytes = [d.path.read_bytes() for d in docs]
@@ -149,11 +150,20 @@ def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
149
150
 
150
151
 
151
152
  def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
153
+
152
154
  match backend:
153
155
  case MinerUBackend.PIPELINE:
154
- return pipeline_union_make
156
+ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import ( # noqa: PLC0415
157
+ union_make,
158
+ )
159
+
160
+ return union_make
155
161
  case MinerUBackend.VLM:
156
- return vlm_union_make
162
+ from mineru.backend.vlm.vlm_middle_json_mkcontent import ( # noqa: PLC0415
163
+ union_make,
164
+ )
165
+
166
+ return union_make
157
167
  case _:
158
168
  raise ValueError(f"Unsupported backend: {backend}")
159
169
 
@@ -201,8 +211,12 @@ def _dump_md_content(
201
211
  output_path: Path,
202
212
  md_path: Path,
203
213
  im_dir: Path,
204
- md_make_mode: str = MakeMode.MM_MD,
214
+ md_make_mode: str | None = None,
205
215
  ) -> ConversionOutput:
216
+ from mineru.utils.enum_class import MakeMode # noqa: PLC0415
217
+
218
+ if md_make_mode is None:
219
+ md_make_mode = MakeMode.MM_MD
206
220
  total_length = 0
207
221
  end_indices = []
208
222
  with md_path.open("w") as f:
File without changes
File without changes
File without changes