extract-python 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_python-0.3.0 → extract_python-0.3.2}/PKG-INFO +1 -1
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/docling_.py +48 -35
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/marker_.py +16 -10
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/miner_u.py +43 -29
- {extract_python-0.3.0 → extract_python-0.3.2}/.dockerignore +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/.github/workflows/publish.yml +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/.github/workflows/tests.yml +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/.gitignore +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/.python-version +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/Dockerfile +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/README.md +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/benches/__init__.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/benches/compare.ipynb +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/benches/compare.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/benches/constants.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/data/.gitignore +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/docker-compose.yml +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/extract +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/__init__.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/constants.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/objects.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/pipeline.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/extract_python/utils.py +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/pyproject.toml +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/qa/ruff.toml +0 -0
- {extract_python-0.3.0 → extract_python-0.3.2}/uv.lock +0 -0
|
@@ -3,18 +3,8 @@ import tempfile
|
|
|
3
3
|
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
4
|
from functools import cache
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Annotated, ClassVar, TypeVar
|
|
7
|
-
|
|
8
|
-
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
9
|
-
from docling.datamodel.document import ConversionResult
|
|
10
|
-
from docling.datamodel.pipeline_options import (
|
|
11
|
-
EasyOcrOptions,
|
|
12
|
-
PdfPipelineOptions,
|
|
13
|
-
PipelineOptions,
|
|
14
|
-
)
|
|
15
|
-
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
|
16
|
-
from docling_core.types.doc import ImageRefMode
|
|
17
|
-
from docling_core.types.io import DocumentStream
|
|
6
|
+
from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
|
|
7
|
+
|
|
18
8
|
from icij_common.registrable import FromConfig
|
|
19
9
|
from pydantic import AfterValidator, Field
|
|
20
10
|
|
|
@@ -34,28 +24,45 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
|
|
|
34
24
|
|
|
35
25
|
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
36
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from docling.datamodel.base_models import InputFormat
|
|
29
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
|
30
|
+
from docling.document_converter import ConversionResult, FormatOption
|
|
31
|
+
from docling_core.types.io import DocumentStream
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
|
|
35
|
+
from docling.datamodel.pipeline_options import PdfPipelineOptions # noqa: PLC0415
|
|
37
36
|
|
|
38
|
-
def _validate_pipeline_opts(opts: PipelineOptions) -> None:
|
|
39
37
|
if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
|
|
40
38
|
msg = "generate_picture_images should be set to true"
|
|
41
39
|
raise ValueError(msg)
|
|
42
40
|
|
|
43
41
|
|
|
44
42
|
def _validate_options(
|
|
45
|
-
data: dict[InputFormat, FormatOption],
|
|
46
|
-
) -> dict[InputFormat, FormatOption]:
|
|
43
|
+
data: dict["InputFormat", "FormatOption"],
|
|
44
|
+
) -> dict["InputFormat", "FormatOption"]:
|
|
47
45
|
for opts in data.values():
|
|
48
46
|
_validate_pipeline_opts(opts.pipeline_options)
|
|
49
47
|
return data
|
|
50
48
|
|
|
51
49
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
)
|
|
58
|
-
|
|
50
|
+
@cache
|
|
51
|
+
def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
|
|
52
|
+
from docling.datamodel.pipeline_options import ( # noqa: PLC0415
|
|
53
|
+
EasyOcrOptions,
|
|
54
|
+
PdfPipelineOptions,
|
|
55
|
+
)
|
|
56
|
+
from docling.document_converter import PdfFormatOption # noqa: PLC0415
|
|
57
|
+
|
|
58
|
+
return {
|
|
59
|
+
InputFormat.PDF: PdfFormatOption(
|
|
60
|
+
pipeline_options=PdfPipelineOptions(
|
|
61
|
+
ocr_options=EasyOcrOptions(), generate_picture_images=True
|
|
62
|
+
)
|
|
63
|
+
),
|
|
64
|
+
}
|
|
65
|
+
|
|
59
66
|
|
|
60
67
|
T = TypeVar("T")
|
|
61
68
|
|
|
@@ -73,21 +80,21 @@ class DoclingPipelineConfig(PipelineConfig):
|
|
|
73
80
|
task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
|
|
74
81
|
|
|
75
82
|
format_options: Annotated[
|
|
76
|
-
dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
|
|
77
|
-
] =
|
|
78
|
-
|
|
79
|
-
_unsupported_input_formats: ClassVar[set[InputFormat]] = {
|
|
80
|
-
InputFormat.AUDIO,
|
|
81
|
-
InputFormat.METS_GBS,
|
|
82
|
-
InputFormat.VTT,
|
|
83
|
-
}
|
|
83
|
+
dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
|
|
84
|
+
] = Field(default_factory=_default_format_opts)
|
|
84
85
|
|
|
85
86
|
@classmethod
|
|
86
87
|
@cache
|
|
87
88
|
def supported_exts(cls) -> set[SupportedExt]:
|
|
89
|
+
from docling.datamodel.base_models import ( # noqa: PLC0415
|
|
90
|
+
FormatToExtensions,
|
|
91
|
+
InputFormat,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
|
|
88
95
|
supported = set()
|
|
89
96
|
for f in InputFormat:
|
|
90
|
-
if f in
|
|
97
|
+
if f in unsupported:
|
|
91
98
|
continue
|
|
92
99
|
for ext in FormatToExtensions[f]:
|
|
93
100
|
supported.add(SupportedExt(f".{ext.lower()}"))
|
|
@@ -96,7 +103,11 @@ class DoclingPipelineConfig(PipelineConfig):
|
|
|
96
103
|
|
|
97
104
|
@Pipeline.register(PipelineType.DOCLING)
|
|
98
105
|
class DoclingPipeline(Pipeline):
|
|
99
|
-
def __init__(
|
|
106
|
+
def __init__(
|
|
107
|
+
self, format_options: dict["InputFormat", "FormatOption"] | None = None
|
|
108
|
+
):
|
|
109
|
+
from docling.document_converter import DocumentConverter # noqa: PLC0415
|
|
110
|
+
|
|
100
111
|
allowed_format = [
|
|
101
112
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
102
113
|
]
|
|
@@ -117,13 +128,13 @@ class DoclingPipeline(Pipeline):
|
|
|
117
128
|
return cls(config.format_options)
|
|
118
129
|
|
|
119
130
|
|
|
120
|
-
def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
|
|
131
|
+
def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
|
121
132
|
for d in docs:
|
|
122
133
|
yield d.to_docling()
|
|
123
134
|
|
|
124
135
|
|
|
125
136
|
def _to_result(
|
|
126
|
-
res: ConversionResult,
|
|
137
|
+
res: "ConversionResult",
|
|
127
138
|
input_document: InputDoc,
|
|
128
139
|
output_format: OutputFormat,
|
|
129
140
|
output_path: Path,
|
|
@@ -144,11 +155,13 @@ def _to_result(
|
|
|
144
155
|
|
|
145
156
|
|
|
146
157
|
def _to_markdown_doc(
|
|
147
|
-
res: ConversionResult,
|
|
158
|
+
res: "ConversionResult",
|
|
148
159
|
output_path: Path,
|
|
149
160
|
page_sep: str = DEFAULT_MD_PAGE_SEP,
|
|
150
161
|
**kwargs,
|
|
151
162
|
) -> MarkdownDoc:
|
|
163
|
+
from docling_core.types.doc import ImageRefMode # noqa: PLC0415
|
|
164
|
+
|
|
152
165
|
# TODO: Should we add a hash to avoid collision between files with same names
|
|
153
166
|
# nested in the tree structured
|
|
154
167
|
md_dir_name = path_to_artifacts_dirname(res.input.file)
|
|
@@ -3,14 +3,8 @@ from collections.abc import AsyncGenerator, Iterable
|
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from functools import cache
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, ClassVar, Self
|
|
7
|
-
|
|
8
|
-
from marker.config.parser import ConfigParser
|
|
9
|
-
from marker.converters.pdf import PdfConverter
|
|
10
|
-
from marker.models import create_model_dict
|
|
11
|
-
from marker.output import text_from_rendered
|
|
12
|
-
from marker.renderers.markdown import MarkdownRenderer
|
|
13
|
-
from PIL.Image import Image
|
|
6
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Self
|
|
7
|
+
|
|
14
8
|
from pydantic import Field
|
|
15
9
|
|
|
16
10
|
from .constants import ARTIFACTS, CPU_GROUP
|
|
@@ -26,6 +20,10 @@ from .objects import (
|
|
|
26
20
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
27
21
|
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
28
22
|
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from marker.converters.pdf import PdfConverter
|
|
25
|
+
from PIL import Image
|
|
26
|
+
|
|
29
27
|
|
|
30
28
|
@PipelineConfig.register()
|
|
31
29
|
class MarkerPipelineConfig(PipelineConfig):
|
|
@@ -75,6 +73,10 @@ class MarkerPipeline(Pipeline):
|
|
|
75
73
|
async def extract_content(
|
|
76
74
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
77
75
|
) -> AsyncGenerator[Result, None]:
|
|
76
|
+
from marker.config.parser import ConfigParser # noqa: PLC0415
|
|
77
|
+
from marker.converters.pdf import PdfConverter # noqa: PLC0415
|
|
78
|
+
from marker.models import create_model_dict # noqa: PLC0415
|
|
79
|
+
|
|
78
80
|
config = deepcopy(self._marker_config)
|
|
79
81
|
config["output_format"] = output_format.to_marker()
|
|
80
82
|
config_parser = ConfigParser(config)
|
|
@@ -96,10 +98,12 @@ class MarkerPipeline(Pipeline):
|
|
|
96
98
|
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
|
|
97
99
|
def _process_doc(
|
|
98
100
|
doc: InputDoc,
|
|
99
|
-
converter: PdfConverter,
|
|
101
|
+
converter: "PdfConverter",
|
|
100
102
|
output_format: OutputFormat,
|
|
101
103
|
output_path: Path,
|
|
102
104
|
) -> Result:
|
|
105
|
+
from marker.output import text_from_rendered # noqa: PLC0415
|
|
106
|
+
|
|
103
107
|
rendered = converter(str(doc.path))
|
|
104
108
|
content, _, images = text_from_rendered(rendered)
|
|
105
109
|
match output_format:
|
|
@@ -112,8 +116,10 @@ def _process_doc(
|
|
|
112
116
|
|
|
113
117
|
|
|
114
118
|
def _to_markdown_doc(
|
|
115
|
-
input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
|
|
119
|
+
input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
|
|
116
120
|
) -> MarkdownDoc:
|
|
121
|
+
from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
|
|
122
|
+
|
|
117
123
|
# TODO: Should we add a hash to avoid collision between files with same names
|
|
118
124
|
# nested in the tree structured
|
|
119
125
|
md_dir_name = path_to_artifacts_dirname(input_doc.path)
|
|
@@ -8,12 +8,6 @@ from pathlib import Path
|
|
|
8
8
|
from tempfile import TemporaryDirectory
|
|
9
9
|
from typing import Any, ClassVar, Self
|
|
10
10
|
|
|
11
|
-
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
|
|
12
|
-
union_make as pipeline_union_make,
|
|
13
|
-
)
|
|
14
|
-
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
15
|
-
from mineru.cli.common import aio_do_parse
|
|
16
|
-
from mineru.utils.enum_class import MakeMode
|
|
17
11
|
from pydantic import Field
|
|
18
12
|
from pydantic_extra_types.language_code import LanguageAlpha2
|
|
19
13
|
|
|
@@ -47,33 +41,38 @@ class MinerUConfig(BaseModel):
|
|
|
47
41
|
# TODO: use enum or literal here
|
|
48
42
|
parse_method: str = "auto"
|
|
49
43
|
|
|
50
|
-
default_kwargs: ClassVar[dict] = {
|
|
51
|
-
"server_url": None,
|
|
52
|
-
# We don't dump md directly we process, we dump the middle json in order to be
|
|
53
|
-
# able to get page indexes
|
|
54
|
-
"parse_method": "auto",
|
|
55
|
-
"dump_md": False,
|
|
56
|
-
"dump_middle_json": True,
|
|
57
|
-
"f_draw_layout_bbox": False,
|
|
58
|
-
"f_draw_span_bbox": False,
|
|
59
|
-
"f_dump_model_output": False, # might be useful for debug though
|
|
60
|
-
"f_dump_orig_pdf": False,
|
|
61
|
-
"f_dump_content_list": False, # might be useful for debug though
|
|
62
|
-
"start_page_id": 0,
|
|
63
|
-
"f_make_md_mode": MakeMode.MM_MD,
|
|
64
|
-
"image_analysis": True,
|
|
65
|
-
"end_page_id": None,
|
|
66
|
-
"client_side_output_generation": False,
|
|
67
|
-
}
|
|
68
|
-
|
|
69
44
|
def as_parse_kwargs(self) -> dict[str, Any]:
|
|
70
|
-
kwargs = copy(self.
|
|
45
|
+
kwargs = copy(self._get_default_kwargs())
|
|
71
46
|
kwargs["backend"] = self.backend
|
|
72
47
|
kwargs["parse_method"] = self.parse_method
|
|
73
48
|
kwargs["formula_enable"] = self.enable_formula_extraction
|
|
74
49
|
kwargs["table_enable"] = self.enable_table_extraction
|
|
75
50
|
return kwargs
|
|
76
51
|
|
|
52
|
+
@classmethod
|
|
53
|
+
@cache
|
|
54
|
+
def _get_default_kwargs(cls) -> dict[str, Any]:
|
|
55
|
+
from mineru.utils.enum_class import MakeMode # noqa: PLC0415
|
|
56
|
+
|
|
57
|
+
return {
|
|
58
|
+
"server_url": None,
|
|
59
|
+
# We don't dump md directly we process, we dump the middle json in order
|
|
60
|
+
# to be able to get page indexes
|
|
61
|
+
"parse_method": "auto",
|
|
62
|
+
"dump_md": False,
|
|
63
|
+
"dump_middle_json": True,
|
|
64
|
+
"f_draw_layout_bbox": False,
|
|
65
|
+
"f_draw_span_bbox": False,
|
|
66
|
+
"f_dump_model_output": False, # might be useful for debug though
|
|
67
|
+
"f_dump_orig_pdf": False,
|
|
68
|
+
"f_dump_content_list": False, # might be useful for debug though
|
|
69
|
+
"start_page_id": 0,
|
|
70
|
+
"f_make_md_mode": MakeMode.MM_MD,
|
|
71
|
+
"image_analysis": True,
|
|
72
|
+
"end_page_id": None,
|
|
73
|
+
"client_side_output_generation": False,
|
|
74
|
+
}
|
|
75
|
+
|
|
77
76
|
|
|
78
77
|
@PipelineConfig.register() # noqa: F821
|
|
79
78
|
class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
@@ -104,6 +103,8 @@ class MinerUPipeline(Pipeline):
|
|
|
104
103
|
async def extract_content(
|
|
105
104
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
106
105
|
) -> AsyncGenerator[Result, None]:
|
|
106
|
+
from mineru.cli.common import aio_do_parse # noqa: PLC0415
|
|
107
|
+
|
|
107
108
|
docs = list(docs)
|
|
108
109
|
# TODO: exclude files which are not pdf and return an error
|
|
109
110
|
pdfs_bytes = [d.path.read_bytes() for d in docs]
|
|
@@ -149,11 +150,20 @@ def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
|
|
|
149
150
|
|
|
150
151
|
|
|
151
152
|
def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
|
|
153
|
+
|
|
152
154
|
match backend:
|
|
153
155
|
case MinerUBackend.PIPELINE:
|
|
154
|
-
|
|
156
|
+
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import ( # noqa: PLC0415
|
|
157
|
+
union_make,
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
return union_make
|
|
155
161
|
case MinerUBackend.VLM:
|
|
156
|
-
|
|
162
|
+
from mineru.backend.vlm.vlm_middle_json_mkcontent import ( # noqa: PLC0415
|
|
163
|
+
union_make,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
return union_make
|
|
157
167
|
case _:
|
|
158
168
|
raise ValueError(f"Unsupported backend: {backend}")
|
|
159
169
|
|
|
@@ -201,8 +211,12 @@ def _dump_md_content(
|
|
|
201
211
|
output_path: Path,
|
|
202
212
|
md_path: Path,
|
|
203
213
|
im_dir: Path,
|
|
204
|
-
md_make_mode: str =
|
|
214
|
+
md_make_mode: str | None = None,
|
|
205
215
|
) -> ConversionOutput:
|
|
216
|
+
from mineru.utils.enum_class import MakeMode # noqa: PLC0415
|
|
217
|
+
|
|
218
|
+
if md_make_mode is None:
|
|
219
|
+
md_make_mode = MakeMode.MM_MD
|
|
206
220
|
total_length = 0
|
|
207
221
|
end_indices = []
|
|
208
222
|
with md_path.open("w") as f:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|