extract-python 0.4.2__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/__init__.py +7 -25
- extract_python/constants.py +0 -4
- extract_python/docling_.py +13 -161
- extract_python/marker_.py +7 -7
- extract_python/miner_u.py +10 -74
- extract_python/utils.py +4 -10
- {extract_python-0.4.2.dist-info → extract_python-0.5.5.dist-info}/METADATA +3 -1
- extract_python-0.5.5.dist-info/RECORD +9 -0
- extract_python/objects.py +0 -323
- extract_python/pipeline.py +0 -38
- extract_python-0.4.2.dist-info/RECORD +0 -11
- {extract_python-0.4.2.dist-info → extract_python-0.5.5.dist-info}/WHEEL +0 -0
extract_python/__init__.py
CHANGED
|
@@ -1,41 +1,23 @@
|
|
|
1
|
-
from .objects import InputDoc, OutputFormat, Status
|
|
2
|
-
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
3
|
-
|
|
4
1
|
try:
|
|
5
|
-
from .docling_ import
|
|
6
|
-
DOCLING_DEFAULT_ARTIFACTS_PATH,
|
|
7
|
-
DoclingPipeline,
|
|
8
|
-
DoclingPipelineConfig,
|
|
9
|
-
)
|
|
2
|
+
from .docling_ import DOCLING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline
|
|
10
3
|
except ImportError:
|
|
11
|
-
DOCKING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline
|
|
12
|
-
None,
|
|
13
|
-
None,
|
|
14
|
-
None,
|
|
15
|
-
)
|
|
4
|
+
DOCKING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline = None, None
|
|
16
5
|
|
|
17
6
|
try:
|
|
18
|
-
from .marker_ import MarkerPipeline
|
|
7
|
+
from .marker_ import MarkerPipeline
|
|
19
8
|
except ImportError:
|
|
20
|
-
MarkerPipeline
|
|
9
|
+
MarkerPipeline = None
|
|
21
10
|
|
|
22
11
|
|
|
23
12
|
try:
|
|
24
|
-
from .miner_u import MinerUPipeline
|
|
13
|
+
from .miner_u import MinerUPipeline
|
|
25
14
|
except ImportError:
|
|
26
|
-
MinerUPipeline
|
|
15
|
+
MinerUPipeline = None
|
|
27
16
|
|
|
28
17
|
|
|
29
18
|
__all__ = [
|
|
30
19
|
"DoclingPipeline",
|
|
31
|
-
"DoclingPipelineConfig",
|
|
32
|
-
"InputDoc",
|
|
33
20
|
"DOCLING_DEFAULT_ARTIFACTS_PATH",
|
|
34
21
|
"MarkerPipeline",
|
|
35
|
-
"
|
|
36
|
-
"OutputFormat",
|
|
37
|
-
"Pipeline",
|
|
38
|
-
"PipelineType",
|
|
39
|
-
"PipelineConfig",
|
|
40
|
-
"Status",
|
|
22
|
+
"MinerUPipeline",
|
|
41
23
|
]
|
extract_python/constants.py
CHANGED
extract_python/docling_.py
CHANGED
|
@@ -1,190 +1,42 @@
|
|
|
1
|
-
import importlib
|
|
2
1
|
import shutil
|
|
3
2
|
import tempfile
|
|
4
3
|
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
5
|
-
from functools import cache
|
|
6
4
|
from pathlib import Path
|
|
7
|
-
from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
|
|
8
5
|
|
|
9
|
-
from docling.
|
|
10
|
-
from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
|
|
11
|
-
|
|
12
|
-
# Data model import are quick it's ok to leave it there
|
|
13
|
-
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
6
|
+
from docling.datamodel.base_models import InputFormat
|
|
14
7
|
from docling.datamodel.document import ConversionResult
|
|
15
|
-
from docling.
|
|
16
|
-
EasyOcrOptions,
|
|
17
|
-
PdfPipelineOptions,
|
|
18
|
-
PipelineOptions,
|
|
19
|
-
ThreadedPdfPipelineOptions,
|
|
20
|
-
)
|
|
21
|
-
from docling.document_converter import DocumentConverter, FormatOption
|
|
22
|
-
from docling.pipeline.base_pipeline import BasePipeline
|
|
8
|
+
from docling.document_converter import DocumentConverter
|
|
23
9
|
|
|
24
10
|
# TODO: this is long to load improve it
|
|
25
11
|
from docling_core.types.doc import ImageRefMode
|
|
26
12
|
from docling_core.types.io import DocumentStream
|
|
27
|
-
from
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
AfterValidator,
|
|
31
|
-
BeforeValidator,
|
|
32
|
-
Field,
|
|
33
|
-
PlainSerializer,
|
|
34
|
-
WrapSerializer,
|
|
35
|
-
model_validator,
|
|
36
|
-
)
|
|
37
|
-
from pydantic_core.core_schema import SerializerFunctionWrapHandler
|
|
38
|
-
|
|
39
|
-
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
40
|
-
from .objects import (
|
|
13
|
+
from extract_core import (
|
|
14
|
+
DoclingFormatOption,
|
|
15
|
+
DoclingPipelineConfig,
|
|
41
16
|
Error,
|
|
42
17
|
InputDoc,
|
|
43
18
|
MarkdownDoc,
|
|
44
19
|
OutputFormat,
|
|
45
20
|
PageIndexes,
|
|
21
|
+
Pipeline,
|
|
22
|
+
PipelineType,
|
|
46
23
|
Result,
|
|
47
24
|
Status,
|
|
48
|
-
SupportedExt,
|
|
49
25
|
)
|
|
50
|
-
from .
|
|
51
|
-
from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
|
|
52
|
-
|
|
53
|
-
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _validate_pipeline_opts(v: "PipelineOptions") -> None:
|
|
57
|
-
if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
|
|
58
|
-
msg = "generate_picture_images should be set to true"
|
|
59
|
-
raise ValueError(msg)
|
|
60
|
-
return v
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
T = TypeVar("T")
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def _find_subcls(cls: type[T], name: str) -> type[T]:
|
|
67
|
-
# Check if the class available
|
|
68
|
-
for c in all_subclasses(cls):
|
|
69
|
-
if c.__name__ == name:
|
|
70
|
-
return c
|
|
71
|
-
# Then apply ad-hoc search
|
|
72
|
-
if "pipeline" in cls.__name__.lower():
|
|
73
|
-
module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
|
|
74
|
-
try:
|
|
75
|
-
module = importlib.import_module(module_name)
|
|
76
|
-
return getattr(module, name)
|
|
77
|
-
except (ModuleNotFoundError, AttributeError):
|
|
78
|
-
pass
|
|
79
|
-
raise ValueError(f"unknown {cls.__name__} subclass {name}")
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def _find_init_arg_type(cls: type[Any], arg: str) -> type:
|
|
83
|
-
hints = get_type_hints(cls.__init__)
|
|
84
|
-
return hints[arg]
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def _resolve_pipeline_cls(v: Any) -> Any:
|
|
88
|
-
if isinstance(v, str):
|
|
89
|
-
return _find_subcls(BasePipeline, v)
|
|
90
|
-
return v
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def _ser_class_as_str(v: Any) -> Any:
|
|
94
|
-
if isinstance(v, type):
|
|
95
|
-
return v.__name__
|
|
96
|
-
return v
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def _ser_with_backend_option_kind(
|
|
100
|
-
v: Any, handler: SerializerFunctionWrapHandler
|
|
101
|
-
) -> Any:
|
|
102
|
-
serialized = handler(v)
|
|
103
|
-
if isinstance(v, BaseBackendOptions):
|
|
104
|
-
kind = getattr(v, "kind", None)
|
|
105
|
-
if kind is not None:
|
|
106
|
-
serialized["kind"] = kind
|
|
107
|
-
return serialized
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def _resolve_backend(v: Any) -> Any:
|
|
111
|
-
if isinstance(v, str):
|
|
112
|
-
return _find_subcls(AbstractDocumentBackend, v)
|
|
113
|
-
return v
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
class DoclingFormatOption(FormatOption):
|
|
117
|
-
pipeline_cls: Annotated[
|
|
118
|
-
str | type[BasePipeline],
|
|
119
|
-
BeforeValidator(_resolve_pipeline_cls),
|
|
120
|
-
PlainSerializer(_ser_class_as_str),
|
|
121
|
-
]
|
|
122
|
-
pipeline_options: Annotated[
|
|
123
|
-
dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
|
|
124
|
-
] = None
|
|
125
|
-
backend: Annotated[
|
|
126
|
-
str | type[AbstractDocumentBackend],
|
|
127
|
-
BeforeValidator(_resolve_backend),
|
|
128
|
-
PlainSerializer(_ser_class_as_str),
|
|
129
|
-
]
|
|
130
|
-
backend_options: Annotated[
|
|
131
|
-
BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
|
|
132
|
-
] = None
|
|
133
|
-
|
|
134
|
-
@model_validator(mode="after")
|
|
135
|
-
def _resolve_pipeline_options(self) -> Self:
|
|
136
|
-
if isinstance(self.pipeline_options, dict):
|
|
137
|
-
option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
|
|
138
|
-
self.pipeline_options = option_cls.model_validate(self.pipeline_options)
|
|
139
|
-
return self
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
@cache
|
|
143
|
-
def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
|
|
144
|
-
from docling.backend.docling_parse_backend import ( # noqa: PLC0415
|
|
145
|
-
DoclingParseDocumentBackend,
|
|
146
|
-
)
|
|
147
|
-
from docling.pipeline.standard_pdf_pipeline import ( # noqa: PLC0415
|
|
148
|
-
StandardPdfPipeline,
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
return {
|
|
152
|
-
InputFormat.PDF: DoclingFormatOption(
|
|
153
|
-
pipeline_cls=StandardPdfPipeline,
|
|
154
|
-
backend=DoclingParseDocumentBackend,
|
|
155
|
-
pipeline_options=ThreadedPdfPipelineOptions(
|
|
156
|
-
ocr_options=EasyOcrOptions(), generate_picture_images=True
|
|
157
|
-
),
|
|
158
|
-
),
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
class DoclingPipelineConfig(PipelineConfig):
|
|
163
|
-
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
|
|
26
|
+
from icij_common.registrable import FromConfig
|
|
164
27
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
)
|
|
28
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
29
|
+
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
|
|
168
30
|
|
|
169
|
-
|
|
170
|
-
@cache
|
|
171
|
-
def supported_exts(cls) -> set[SupportedExt]:
|
|
172
|
-
unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
|
|
173
|
-
supported = set()
|
|
174
|
-
for f in InputFormat:
|
|
175
|
-
if f in unsupported:
|
|
176
|
-
continue
|
|
177
|
-
for ext in FormatToExtensions[f]:
|
|
178
|
-
supported.add(SupportedExt(f".{ext.lower()}"))
|
|
179
|
-
return supported
|
|
31
|
+
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
180
32
|
|
|
181
33
|
|
|
182
34
|
@Pipeline.register(PipelineType.DOCLING)
|
|
183
35
|
class DoclingPipeline(Pipeline):
|
|
184
36
|
def __init__(
|
|
185
|
-
self, format_options: dict["InputFormat",
|
|
37
|
+
self, format_options: dict["InputFormat", DoclingFormatOption] | None = None
|
|
186
38
|
):
|
|
187
|
-
|
|
39
|
+
format_options = {k: v.to_docling() for k, v in format_options.items()}
|
|
188
40
|
allowed_format = [
|
|
189
41
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
190
42
|
]
|
extract_python/marker_.py
CHANGED
|
@@ -5,10 +5,8 @@ from functools import cache
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import TYPE_CHECKING, Any, ClassVar, Self
|
|
7
7
|
|
|
8
|
-
from
|
|
9
|
-
|
|
10
|
-
from .constants import ARTIFACTS
|
|
11
|
-
from .objects import (
|
|
8
|
+
from extract_core import BasePipelineConfig, Pipeline, PipelineType
|
|
9
|
+
from extract_core.objects import (
|
|
12
10
|
InputDoc,
|
|
13
11
|
MarkdownDoc,
|
|
14
12
|
OutputFormat,
|
|
@@ -17,7 +15,9 @@ from .objects import (
|
|
|
17
15
|
Status,
|
|
18
16
|
SupportedExt,
|
|
19
17
|
)
|
|
20
|
-
from
|
|
18
|
+
from pydantic import Field
|
|
19
|
+
|
|
20
|
+
from .constants import ARTIFACTS
|
|
21
21
|
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
22
22
|
|
|
23
23
|
if TYPE_CHECKING:
|
|
@@ -25,10 +25,10 @@ if TYPE_CHECKING:
|
|
|
25
25
|
from PIL import Image
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
class MarkerPipelineConfig(
|
|
28
|
+
class MarkerPipelineConfig(BasePipelineConfig):
|
|
29
29
|
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
|
|
30
30
|
|
|
31
|
-
config: dict[str, Any] = dict
|
|
31
|
+
config: dict[str, Any] = Field(default_factory=dict)
|
|
32
32
|
|
|
33
33
|
@classmethod
|
|
34
34
|
@cache
|
extract_python/miner_u.py
CHANGED
|
@@ -1,96 +1,32 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import shutil
|
|
3
3
|
from collections.abc import AsyncGenerator, Callable, Iterable
|
|
4
|
-
from
|
|
5
|
-
from enum import StrEnum
|
|
6
|
-
from functools import cache, partial
|
|
4
|
+
from functools import partial
|
|
7
5
|
from pathlib import Path
|
|
8
6
|
from tempfile import TemporaryDirectory
|
|
9
|
-
from typing import
|
|
7
|
+
from typing import Self
|
|
10
8
|
|
|
11
|
-
from
|
|
12
|
-
from pydantic_extra_types.language_code import LanguageAlpha2
|
|
13
|
-
|
|
14
|
-
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
15
|
-
from .objects import (
|
|
16
|
-
BaseModel,
|
|
9
|
+
from extract_core import (
|
|
17
10
|
ConversionOutput,
|
|
18
11
|
InputDoc,
|
|
12
|
+
MinerUBackend,
|
|
13
|
+
MinerUConfig,
|
|
14
|
+
MinerUPipelineConfig,
|
|
19
15
|
OutputFormat,
|
|
20
16
|
PageIndexes,
|
|
17
|
+
Pipeline,
|
|
18
|
+
PipelineType,
|
|
21
19
|
Result,
|
|
22
20
|
Status,
|
|
23
|
-
SupportedExt,
|
|
24
21
|
)
|
|
25
|
-
|
|
22
|
+
|
|
23
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
26
24
|
from .utils import path_to_artifacts_dirname
|
|
27
25
|
|
|
28
26
|
_MINER_U_CONVERSION_ERRORS = tuple()
|
|
29
27
|
MDMakeFunction = Callable[[list, str, str], str | None]
|
|
30
28
|
|
|
31
29
|
|
|
32
|
-
class MinerUBackend(StrEnum):
|
|
33
|
-
PIPELINE = "pipeline"
|
|
34
|
-
VLM = "vlm"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class MinerUConfig(BaseModel):
|
|
38
|
-
backend: MinerUBackend = MinerUBackend.PIPELINE
|
|
39
|
-
enable_formula_extraction: bool = True
|
|
40
|
-
enable_table_extraction: bool = True
|
|
41
|
-
# TODO: use enum or literal here
|
|
42
|
-
parse_method: str = "auto"
|
|
43
|
-
|
|
44
|
-
def as_parse_kwargs(self) -> dict[str, Any]:
|
|
45
|
-
kwargs = copy(self._get_default_kwargs())
|
|
46
|
-
kwargs["backend"] = self.backend
|
|
47
|
-
kwargs["parse_method"] = self.parse_method
|
|
48
|
-
kwargs["formula_enable"] = self.enable_formula_extraction
|
|
49
|
-
kwargs["table_enable"] = self.enable_table_extraction
|
|
50
|
-
return kwargs
|
|
51
|
-
|
|
52
|
-
@classmethod
|
|
53
|
-
@cache
|
|
54
|
-
def _get_default_kwargs(cls) -> dict[str, Any]:
|
|
55
|
-
from mineru.utils.enum_class import MakeMode # noqa: PLC0415
|
|
56
|
-
|
|
57
|
-
return {
|
|
58
|
-
"server_url": None,
|
|
59
|
-
# We don't dump md directly we process, we dump the middle json in order
|
|
60
|
-
# to be able to get page indexes
|
|
61
|
-
"parse_method": "auto",
|
|
62
|
-
"dump_md": False,
|
|
63
|
-
"dump_middle_json": True,
|
|
64
|
-
"f_draw_layout_bbox": False,
|
|
65
|
-
"f_draw_span_bbox": False,
|
|
66
|
-
"f_dump_model_output": False, # might be useful for debug though
|
|
67
|
-
"f_dump_orig_pdf": False,
|
|
68
|
-
"f_dump_content_list": False, # might be useful for debug though
|
|
69
|
-
"start_page_id": 0,
|
|
70
|
-
"f_make_md_mode": MakeMode.MM_MD,
|
|
71
|
-
"image_analysis": True,
|
|
72
|
-
"end_page_id": None,
|
|
73
|
-
"client_side_output_generation": False,
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
78
|
-
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
|
|
79
|
-
|
|
80
|
-
config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
|
|
81
|
-
language: LanguageAlpha2 = Field(frozen=True, default="en")
|
|
82
|
-
|
|
83
|
-
@classmethod
|
|
84
|
-
@cache
|
|
85
|
-
def supported_exts(cls) -> set[SupportedExt]:
|
|
86
|
-
return {
|
|
87
|
-
SupportedExt.PDF,
|
|
88
|
-
SupportedExt.DOCX,
|
|
89
|
-
SupportedExt.PPTX,
|
|
90
|
-
SupportedExt.XLSX,
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
|
|
94
30
|
@Pipeline.register(PipelineType.MINER_U)
|
|
95
31
|
class MinerUPipeline(Pipeline):
|
|
96
32
|
def __init__(self, config: MinerUConfig, language: str):
|
extract_python/utils.py
CHANGED
|
@@ -6,26 +6,20 @@ from itertools import tee
|
|
|
6
6
|
from pathlib import Path, PurePath
|
|
7
7
|
from typing import Protocol, TypeVar
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from extract_core import Error, InputDoc, Result, Status
|
|
10
10
|
|
|
11
11
|
R = TypeVar("R")
|
|
12
|
-
|
|
12
|
+
In = TypeVar("In")
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
def map_and_preserve(
|
|
16
|
-
fn: Callable[[Iterable[
|
|
17
|
-
) -> tuple[Iterable[
|
|
16
|
+
fn: Callable[[Iterable[In]], Iterator[R]], inputs: Iterable[In]
|
|
17
|
+
) -> tuple[Iterable[In], Iterator[R]]:
|
|
18
18
|
save_inputs, function_inputs = tee(inputs)
|
|
19
19
|
outputs = iter(fn(function_inputs))
|
|
20
20
|
return save_inputs, outputs
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
def all_subclasses(cls: type[T]) -> set[type[T]]:
|
|
24
|
-
return set(cls.__subclasses__()).union(
|
|
25
|
-
[s for c in cls.__subclasses__() for s in all_subclasses(c)]
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
23
|
def path_to_artifacts_dirname(path: PurePath, sep: str = "_") -> str:
|
|
30
24
|
dirname = f"{path.name[: -len(path.suffix)]}"
|
|
31
25
|
ext = path.suffix
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-python
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.5
|
|
4
4
|
Summary: Structured content extraction
|
|
5
5
|
Project-URL: Homepage, https://github.com/ICIJ/extract-python
|
|
6
6
|
Project-URL: Repository, https://github.com/ICIJ/extract-python
|
|
7
7
|
Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
|
|
8
8
|
Author-email: Clément Doumouro <cdoumouro@icij.org>
|
|
9
9
|
Requires-Python: <3.14,>=3.11
|
|
10
|
+
Requires-Dist: extract-core~=0.1
|
|
10
11
|
Requires-Dist: icij-common~=0.8.2
|
|
11
12
|
Provides-Extra: benches
|
|
12
13
|
Requires-Dist: html2image~=2.0.7; extra == 'benches'
|
|
@@ -21,4 +22,5 @@ Provides-Extra: mineru
|
|
|
21
22
|
Requires-Dist: mineru[mlx]~=3.2; (sys_platform == 'darwin') and extra == 'mineru'
|
|
22
23
|
Requires-Dist: mineru[pipeline,vlm]~=3.2; extra == 'mineru'
|
|
23
24
|
Requires-Dist: pydantic-extra-types[pycountry]~=2.11; extra == 'mineru'
|
|
25
|
+
Requires-Dist: python-pptx~=1.0; extra == 'mineru'
|
|
24
26
|
Requires-Dist: six~=1.17; extra == 'mineru'
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
|
|
2
|
+
extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
|
|
3
|
+
extract_python/docling_.py,sha256=C4WP1AJrvS2n-KytlGc_1CShjdTGM077I6b9tvw4NhY,4727
|
|
4
|
+
extract_python/marker_.py,sha256=mLJA1m9G4JQtBs1wz8rmshdbaH81DhIwkRzDKZPJH8A,5058
|
|
5
|
+
extract_python/miner_u.py,sha256=jjHqHx7-2w0LSxYNcjvgWoLDTXsv_y1eeyteSfXqjk4,5771
|
|
6
|
+
extract_python/utils.py,sha256=NiYf65iCF7QO4loh7u4t38Ww3eVJUdBpWStL4eX_DqE,1781
|
|
7
|
+
extract_python-0.5.5.dist-info/METADATA,sha256=iENRXysGcLOtZ3mJNPpGwHiixhkYJYaYQ-sj8j26q2o,1216
|
|
8
|
+
extract_python-0.5.5.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
extract_python-0.5.5.dist-info/RECORD,,
|
extract_python/objects.py
DELETED
|
@@ -1,323 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
import os
|
|
5
|
-
import traceback
|
|
6
|
-
import uuid
|
|
7
|
-
from abc import ABC
|
|
8
|
-
from enum import StrEnum
|
|
9
|
-
from functools import cache
|
|
10
|
-
from io import BytesIO
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
from typing import Annotated, Any, NoReturn, Self
|
|
13
|
-
|
|
14
|
-
from icij_common.pydantic_utils import (
|
|
15
|
-
icij_config,
|
|
16
|
-
merge_configs,
|
|
17
|
-
no_enum_values_config,
|
|
18
|
-
safe_copy,
|
|
19
|
-
)
|
|
20
|
-
from pydantic import AfterValidator, RootModel, TypeAdapter
|
|
21
|
-
from pydantic import BaseModel as _BaseModel
|
|
22
|
-
|
|
23
|
-
try:
|
|
24
|
-
from docling.datamodel.base_models import (
|
|
25
|
-
ConversionStatus,
|
|
26
|
-
ErrorItem,
|
|
27
|
-
FormatToExtensions,
|
|
28
|
-
InputFormat,
|
|
29
|
-
)
|
|
30
|
-
from docling.datamodel.document import InputDocument
|
|
31
|
-
from docling_core.types.io import DocumentStream
|
|
32
|
-
except ImportError:
|
|
33
|
-
ConversionStatus, ErrorItem, InputFormat = None, None, None
|
|
34
|
-
InputDocument = None
|
|
35
|
-
DocumentStream = None
|
|
36
|
-
|
|
37
|
-
logger = logging.getLogger(__name__)
|
|
38
|
-
base_config = merge_configs(icij_config(), no_enum_values_config())
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@cache
|
|
42
|
-
def _ext_to_docling_input_format() -> dict:
|
|
43
|
-
from .docling_ import DoclingPipelineConfig # noqa: PLC0415
|
|
44
|
-
|
|
45
|
-
mapping = dict()
|
|
46
|
-
supported = DoclingPipelineConfig.supported_exts()
|
|
47
|
-
for input_f, exts in FormatToExtensions.items():
|
|
48
|
-
for ext in exts:
|
|
49
|
-
try:
|
|
50
|
-
ext = SupportedExt(f".{ext.lower()}") # noqa: PLW2901
|
|
51
|
-
except ValueError:
|
|
52
|
-
continue
|
|
53
|
-
if ext in supported:
|
|
54
|
-
mapping[ext] = input_f
|
|
55
|
-
return mapping
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
class BaseModel(_BaseModel):
|
|
59
|
-
model_config = base_config
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
class SupportedExt(StrEnum):
|
|
63
|
-
ADOC = ".adoc"
|
|
64
|
-
ASC = ".asc"
|
|
65
|
-
ASCIIDOC = ".asciidoc"
|
|
66
|
-
BMP = ".bmp"
|
|
67
|
-
CSV = ".csv"
|
|
68
|
-
DOC = ".doc"
|
|
69
|
-
DOCX = ".docx"
|
|
70
|
-
DOTX = ".dotx"
|
|
71
|
-
DOTM = ".dotm"
|
|
72
|
-
DOCM = ".docm"
|
|
73
|
-
EPUB = ".epub"
|
|
74
|
-
EML = ".eml"
|
|
75
|
-
GIF = ".gif"
|
|
76
|
-
HTLM = ".html"
|
|
77
|
-
HTM = ".htm"
|
|
78
|
-
JPEG = ".jpeg"
|
|
79
|
-
JPG = ".jpg"
|
|
80
|
-
JSON = ".json"
|
|
81
|
-
LATEX = ".latex"
|
|
82
|
-
MD = ".md"
|
|
83
|
-
NXML = ".nxml"
|
|
84
|
-
ODP = ".odp"
|
|
85
|
-
ODS = ".ods"
|
|
86
|
-
ODT = ".odt"
|
|
87
|
-
PDF = ".pdf"
|
|
88
|
-
PNG = ".png"
|
|
89
|
-
PPSX = ".ppsx"
|
|
90
|
-
PPT = ".ppt"
|
|
91
|
-
PPTM = ".pptm"
|
|
92
|
-
PPSM = ".ppsm"
|
|
93
|
-
POTX = ".potx"
|
|
94
|
-
POTM = ".potm"
|
|
95
|
-
PPTX = ".pptx"
|
|
96
|
-
QMD = ".qmd"
|
|
97
|
-
RMD = ".rmd"
|
|
98
|
-
TEX = ".tex"
|
|
99
|
-
TIF = ".tif"
|
|
100
|
-
TIFF = ".tiff"
|
|
101
|
-
TXT = ".txt"
|
|
102
|
-
TEXT = ".text"
|
|
103
|
-
WEBP = ".webp"
|
|
104
|
-
XBRL = ".xbrl"
|
|
105
|
-
XHTML = ".xhtml"
|
|
106
|
-
XLS = ".xls"
|
|
107
|
-
XLSM = ".xlsm"
|
|
108
|
-
XLSX = ".xlsx"
|
|
109
|
-
XLTX = ".xltx"
|
|
110
|
-
XML = ".xml"
|
|
111
|
-
|
|
112
|
-
def to_docling(self) -> InputFormat:
|
|
113
|
-
return _ext_to_docling_input_format()[self]
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
class OutputFormat(StrEnum):
|
|
117
|
-
MARKDOWN = ".md"
|
|
118
|
-
|
|
119
|
-
@property
|
|
120
|
-
def suffix(self) -> str:
|
|
121
|
-
return self.value[1:]
|
|
122
|
-
|
|
123
|
-
def to_marker(self) -> str:
|
|
124
|
-
match self:
|
|
125
|
-
case OutputFormat.MARKDOWN:
|
|
126
|
-
return "markdown"
|
|
127
|
-
case _:
|
|
128
|
-
raise ValueError(f"{self} is unsupported by marker")
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
class Status(StrEnum):
|
|
132
|
-
FAILURE = "failure"
|
|
133
|
-
SUCCESS = "success"
|
|
134
|
-
PARTIAL_SUCCESS = "partial_success"
|
|
135
|
-
|
|
136
|
-
@classmethod
|
|
137
|
-
def from_docling(cls, v: Any) -> Self:
|
|
138
|
-
from docling.datamodel.base_models import ConversionStatus # noqa: PLC0415
|
|
139
|
-
|
|
140
|
-
if v is ConversionStatus.SUCCESS:
|
|
141
|
-
return cls.SUCCESS
|
|
142
|
-
if v is ConversionStatus.PARTIAL_SUCCESS:
|
|
143
|
-
return cls.PARTIAL_SUCCESS
|
|
144
|
-
if isinstance(v, ConversionStatus):
|
|
145
|
-
return cls.FAILURE
|
|
146
|
-
raise TypeError(f"can't convert {v!r} to {cls.__name__!r}")
|
|
147
|
-
|
|
148
|
-
@property
|
|
149
|
-
def allows_conversion(self) -> bool:
|
|
150
|
-
return self is Status.SUCCESS or self is Status.PARTIAL_SUCCESS
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
class Error(BaseModel):
|
|
154
|
-
id: str
|
|
155
|
-
title: str
|
|
156
|
-
detail: str
|
|
157
|
-
|
|
158
|
-
@classmethod
|
|
159
|
-
def from_exception(cls, exception: BaseException) -> Self:
|
|
160
|
-
title = exception.__class__.__name__
|
|
161
|
-
trace_lines = traceback.format_exception(
|
|
162
|
-
None, value=exception, tb=exception.__traceback__
|
|
163
|
-
)
|
|
164
|
-
detail = f"{exception}\n{''.join(trace_lines)}"
|
|
165
|
-
error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
|
|
166
|
-
error = cls(id=error_id, title=title, detail=detail)
|
|
167
|
-
return error
|
|
168
|
-
|
|
169
|
-
@classmethod
|
|
170
|
-
def from_docling(cls, docling_error: ErrorItem) -> Self:
|
|
171
|
-
title = "DoclingConversionError"
|
|
172
|
-
error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
|
|
173
|
-
detail = (
|
|
174
|
-
f"error in module {docling_error.module_name} of"
|
|
175
|
-
f" {docling_error.component_type}:\n{docling_error.error_message}"
|
|
176
|
-
)
|
|
177
|
-
return cls(id=error_id, title=title, detail=detail)
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
def _id_title(title: str) -> str:
|
|
181
|
-
id_title = []
|
|
182
|
-
for i, letter in enumerate(title):
|
|
183
|
-
if i and letter.isupper():
|
|
184
|
-
id_title.append("-")
|
|
185
|
-
id_title.append(letter.lower())
|
|
186
|
-
return "".join(id_title)
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
class InputDoc(BaseModel):
|
|
190
|
-
ext: SupportedExt
|
|
191
|
-
path: Path
|
|
192
|
-
content: bytes | None = None
|
|
193
|
-
|
|
194
|
-
@classmethod
|
|
195
|
-
def from_path(cls, path: str | Path) -> Self:
|
|
196
|
-
if isinstance(path, str):
|
|
197
|
-
path = Path(path)
|
|
198
|
-
ext = SupportedExt(path.suffix)
|
|
199
|
-
return cls(path=path, ext=ext)
|
|
200
|
-
|
|
201
|
-
def to_docling(self) -> Path | DocumentStream:
|
|
202
|
-
if self.content is not None:
|
|
203
|
-
return DocumentStream(name=str(self.path), stream=BytesIO(self.content))
|
|
204
|
-
if not self.path.suffix:
|
|
205
|
-
return DocumentStream(
|
|
206
|
-
name=str(self.path), stream=BytesIO(self.path.read_bytes())
|
|
207
|
-
)
|
|
208
|
-
return self.path
|
|
209
|
-
|
|
210
|
-
def without_content(self) -> Self:
|
|
211
|
-
return safe_copy(self, update={"content": None})
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
class PageIndexes(RootModel[list[tuple[int, int]]]):
|
|
215
|
-
# Stores page end index
|
|
216
|
-
@classmethod
|
|
217
|
-
def from_page_end_indices(cls, lengths: list[int]) -> Self:
|
|
218
|
-
return [
|
|
219
|
-
((lengths[p - 1] if p > 0 else 0), lengths[p]) for p in range(len(lengths))
|
|
220
|
-
]
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
class ConversionOutput(BaseModel):
|
|
224
|
-
path: Path
|
|
225
|
-
pages: PageIndexes = []
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
class MarkdownDoc(ConversionOutput):
|
|
229
|
-
@classmethod
|
|
230
|
-
@property
|
|
231
|
-
@cache
|
|
232
|
-
def _valid_conversion_statuses(cls) -> set[ConversionStatus]:
|
|
233
|
-
from docling.datamodel.base_models import ConversionStatus # noqa: PLC0415
|
|
234
|
-
|
|
235
|
-
return {ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS}
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def _input_should_not_have_content(value: InputDoc) -> InputDoc:
|
|
239
|
-
if value.content is not None:
|
|
240
|
-
raise ValueError(f"response input can't have content, but got {value}")
|
|
241
|
-
return value
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
class _BaseResult(BaseModel, ABC):
|
|
245
|
-
input: InputDoc
|
|
246
|
-
status: Status
|
|
247
|
-
errors: list[Error] = []
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
class Result(_BaseResult):
|
|
251
|
-
# TODO: we could also use generics here when we add more output formats
|
|
252
|
-
output: ConversionOutput | None
|
|
253
|
-
|
|
254
|
-
def to_response(self) -> ResponseResult:
|
|
255
|
-
return ResponseResult(
|
|
256
|
-
input=self.input.without_content(),
|
|
257
|
-
status=self.status,
|
|
258
|
-
errors=self.errors,
|
|
259
|
-
output_path=self.output.path,
|
|
260
|
-
)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
class ResponseResult(_BaseResult):
|
|
264
|
-
input: Annotated[InputDoc, AfterValidator(func=_input_should_not_have_content)]
|
|
265
|
-
output_path: Path
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
class ExtractionResponse(BaseModel):
|
|
269
|
-
results: list[ResponseResult]
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
_INPUT_DOCS_ADAPTER = TypeAdapter(list[InputDoc | Path])
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
def parse_extraction_request(
|
|
276
|
-
docs: str | list[dict | str], *, data_dir: Path
|
|
277
|
-
) -> list[InputDoc]:
|
|
278
|
-
if isinstance(docs, str):
|
|
279
|
-
logger.debug("exploring files in %s", data_dir.absolute())
|
|
280
|
-
docs_dir = Path(data_dir) / docs
|
|
281
|
-
docs = _as_input_docs(docs_dir)
|
|
282
|
-
msg = "found %s"
|
|
283
|
-
if len(docs) > 10:
|
|
284
|
-
msg = msg + ", and more..."
|
|
285
|
-
logger.debug("found %s", docs[:10])
|
|
286
|
-
return docs
|
|
287
|
-
docs = _INPUT_DOCS_ADAPTER.validate_python(docs)
|
|
288
|
-
if not docs:
|
|
289
|
-
return []
|
|
290
|
-
if isinstance(docs[0], Path):
|
|
291
|
-
doc_meta = []
|
|
292
|
-
unknown_exts = []
|
|
293
|
-
for doc in docs:
|
|
294
|
-
_, ext = os.path.splitext(str(doc))
|
|
295
|
-
if not ext:
|
|
296
|
-
unknown_exts.append(doc)
|
|
297
|
-
else:
|
|
298
|
-
doc_meta.append(InputDoc.from_path(path=doc.relative_to(data_dir)))
|
|
299
|
-
if unknown_exts:
|
|
300
|
-
raise ValueError(f"found files with unknown extensions {unknown_exts}")
|
|
301
|
-
return doc_meta
|
|
302
|
-
return docs
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
def _raise(err: OSError) -> NoReturn:
|
|
306
|
-
raise err
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
def _as_input_docs(
|
|
310
|
-
docs_dir: Path, *, supported_ext: set[str] | None = None
|
|
311
|
-
) -> list[InputDoc]:
|
|
312
|
-
if supported_ext is None:
|
|
313
|
-
supported_ext = {v.value for v in SupportedExt}
|
|
314
|
-
docs = []
|
|
315
|
-
for root, _, files in os.walk(docs_dir, onerror=_raise):
|
|
316
|
-
root = Path(root) # noqa: PLW2901
|
|
317
|
-
for f in files:
|
|
318
|
-
ext = Path(f).suffix
|
|
319
|
-
if not ext or ext not in supported_ext:
|
|
320
|
-
continue
|
|
321
|
-
docs.append(InputDoc.from_path(path=root / f))
|
|
322
|
-
docs = sorted(docs, key=lambda x: x.path)
|
|
323
|
-
return docs
|
extract_python/pipeline.py
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
from collections.abc import AsyncGenerator, Iterable
|
|
3
|
-
from enum import StrEnum
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import ClassVar
|
|
6
|
-
|
|
7
|
-
from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_values_config
|
|
8
|
-
from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
|
|
9
|
-
from pydantic import Field
|
|
10
|
-
|
|
11
|
-
from .objects import InputDoc, OutputFormat, Result, SupportedExt
|
|
12
|
-
|
|
13
|
-
StructuredContent = str
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class PipelineType(StrEnum):
|
|
17
|
-
DOCLING = "docling"
|
|
18
|
-
MARKER = "marker"
|
|
19
|
-
MINER_U = "miner_u"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class PipelineConfig(RegistrableConfig, ABC):
|
|
23
|
-
# TODO: move this icij_config() to RegistrableConfig
|
|
24
|
-
model_config = merge_configs(icij_config(), no_enum_values_config())
|
|
25
|
-
|
|
26
|
-
registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
|
|
27
|
-
pipeline: ClassVar[PipelineType]
|
|
28
|
-
|
|
29
|
-
@classmethod
|
|
30
|
-
@abstractmethod
|
|
31
|
-
def supported_exts(cls) -> set[SupportedExt]: ...
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
class Pipeline(RegistrableFromConfig, ABC):
|
|
35
|
-
@abstractmethod
|
|
36
|
-
async def extract_content(
|
|
37
|
-
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
38
|
-
) -> AsyncGenerator[Result, None]: ...
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
-
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
-
extract_python/docling_.py,sha256=ys2vK4zgpWsPObIZWRFhHM4fNkojMYUa9QRevl8bd3c,9342
|
|
4
|
-
extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
|
|
5
|
-
extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
|
|
6
|
-
extract_python/objects.py,sha256=MHCUZ9L8LVXlSlHyDMnbuWV1KHWMhUEJQMEDTc9hYD0,8761
|
|
7
|
-
extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
|
|
8
|
-
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
-
extract_python-0.4.2.dist-info/METADATA,sha256=95THYq0jZgY2-1X2s8hDoFEo9_aNeukdHPxlcd8_rmI,1132
|
|
10
|
-
extract_python-0.4.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
-
extract_python-0.4.2.dist-info/RECORD,,
|
|
File without changes
|