extract-python 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/docling_.py +128 -53
- extract_python/marker_.py +2 -4
- extract_python/miner_u.py +2 -4
- extract_python/pipeline.py +1 -3
- {extract_python-0.3.2.dist-info → extract_python-0.4.1.dist-info}/METADATA +1 -1
- extract_python-0.4.1.dist-info/RECORD +11 -0
- extract_python-0.3.2.dist-info/RECORD +0 -11
- {extract_python-0.3.2.dist-info → extract_python-0.4.1.dist-info}/WHEEL +0 -0
extract_python/docling_.py
CHANGED
|
@@ -1,14 +1,42 @@
|
|
|
1
|
+
import importlib
|
|
1
2
|
import shutil
|
|
2
3
|
import tempfile
|
|
3
4
|
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
5
|
from functools import cache
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from typing import
|
|
7
|
+
from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
|
|
8
|
+
|
|
9
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
10
|
+
from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
|
|
11
|
+
|
|
12
|
+
# Data model imports are quick, so it's OK to leave them here
|
|
13
|
+
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
14
|
+
from docling.datamodel.document import ConversionResult
|
|
15
|
+
from docling.datamodel.pipeline_options import (
|
|
16
|
+
EasyOcrOptions,
|
|
17
|
+
PdfPipelineOptions,
|
|
18
|
+
PipelineOptions,
|
|
19
|
+
ThreadedPdfPipelineOptions,
|
|
20
|
+
)
|
|
21
|
+
from docling.document_converter import DocumentConverter, FormatOption
|
|
22
|
+
from docling.pipeline.base_pipeline import BasePipeline
|
|
7
23
|
|
|
24
|
+
# TODO: this is slow to load; improve it
|
|
25
|
+
from docling_core.types.doc import ImageRefMode
|
|
26
|
+
from docling_core.types.io import DocumentStream
|
|
27
|
+
from icij_common.pydantic_utils import to_lower_snake_case
|
|
8
28
|
from icij_common.registrable import FromConfig
|
|
9
|
-
from pydantic import
|
|
29
|
+
from pydantic import (
|
|
30
|
+
AfterValidator,
|
|
31
|
+
BeforeValidator,
|
|
32
|
+
Field,
|
|
33
|
+
PlainSerializer,
|
|
34
|
+
WrapSerializer,
|
|
35
|
+
model_validator,
|
|
36
|
+
)
|
|
37
|
+
from pydantic_core.core_schema import SerializerFunctionWrapHandler
|
|
10
38
|
|
|
11
|
-
from .constants import ARTIFACTS,
|
|
39
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
12
40
|
from .objects import (
|
|
13
41
|
Error,
|
|
14
42
|
InputDoc,
|
|
@@ -24,73 +52,123 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
|
|
|
24
52
|
|
|
25
53
|
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
26
54
|
|
|
27
|
-
if TYPE_CHECKING:
|
|
28
|
-
from docling.datamodel.base_models import InputFormat
|
|
29
|
-
from docling.datamodel.pipeline_options import PipelineOptions
|
|
30
|
-
from docling.document_converter import ConversionResult, FormatOption
|
|
31
|
-
from docling_core.types.io import DocumentStream
|
|
32
|
-
|
|
33
55
|
|
|
34
|
-
def _validate_pipeline_opts(
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
|
|
56
|
+
def _validate_pipeline_opts(v: "PipelineOptions") -> None:
|
|
57
|
+
if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
|
|
38
58
|
msg = "generate_picture_images should be set to true"
|
|
39
59
|
raise ValueError(msg)
|
|
60
|
+
return v
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
T = TypeVar("T")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _find_subcls(cls: type[T], name: str) -> type[T]:
|
|
67
|
+
# Check if the class is available
|
|
68
|
+
for c in all_subclasses(cls):
|
|
69
|
+
if c.__name__ == name:
|
|
70
|
+
return c
|
|
71
|
+
# Then apply ad-hoc search
|
|
72
|
+
if "pipeline" in cls.__name__.lower():
|
|
73
|
+
module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
|
|
74
|
+
try:
|
|
75
|
+
module = importlib.import_module(module_name)
|
|
76
|
+
return getattr(module, name)
|
|
77
|
+
except (ModuleNotFoundError, AttributeError):
|
|
78
|
+
pass
|
|
79
|
+
raise ValueError(f"unknown {cls.__name__} subclass {name}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _find_init_arg_type(cls: type[Any], arg: str) -> type:
|
|
83
|
+
hints = get_type_hints(cls.__init__)
|
|
84
|
+
return hints[arg]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _resolve_pipeline_cls(v: Any) -> Any:
|
|
88
|
+
if isinstance(v, str):
|
|
89
|
+
return _find_subcls(BasePipeline, v)
|
|
90
|
+
return v
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _ser_class_as_str(v: Any) -> Any:
|
|
94
|
+
if isinstance(v, type):
|
|
95
|
+
return v.__name__
|
|
96
|
+
return v
|
|
40
97
|
|
|
41
98
|
|
|
42
|
-
def
|
|
43
|
-
|
|
44
|
-
) ->
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
99
|
+
def _ser_with_backend_option_kind(
|
|
100
|
+
v: Any, handler: SerializerFunctionWrapHandler
|
|
101
|
+
) -> Any:
|
|
102
|
+
serialized = handler(v)
|
|
103
|
+
if isinstance(v, BaseBackendOptions):
|
|
104
|
+
kind = getattr(v, "kind", None)
|
|
105
|
+
if kind is not None:
|
|
106
|
+
serialized["kind"] = kind
|
|
107
|
+
return serialized
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _resolve_backend(v: Any) -> Any:
|
|
111
|
+
if isinstance(v, str):
|
|
112
|
+
return _find_subcls(AbstractDocumentBackend, v)
|
|
113
|
+
return v
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class DoclingFormatOption(FormatOption):
|
|
117
|
+
pipeline_cls: Annotated[
|
|
118
|
+
str | type[BasePipeline],
|
|
119
|
+
BeforeValidator(_resolve_pipeline_cls),
|
|
120
|
+
PlainSerializer(_ser_class_as_str),
|
|
121
|
+
]
|
|
122
|
+
pipeline_options: Annotated[
|
|
123
|
+
dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
|
|
124
|
+
] = None
|
|
125
|
+
backend: Annotated[
|
|
126
|
+
str | type[AbstractDocumentBackend],
|
|
127
|
+
BeforeValidator(_resolve_backend),
|
|
128
|
+
PlainSerializer(_ser_class_as_str),
|
|
129
|
+
]
|
|
130
|
+
backend_options: Annotated[
|
|
131
|
+
BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
|
|
132
|
+
] = None
|
|
133
|
+
|
|
134
|
+
@model_validator(mode="after")
|
|
135
|
+
def _resolve_pipeline_options(self) -> Self:
|
|
136
|
+
if isinstance(self.pipeline_options, dict):
|
|
137
|
+
option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
|
|
138
|
+
self.pipeline_options = option_cls.model_validate(self.pipeline_options)
|
|
139
|
+
return self
|
|
48
140
|
|
|
49
141
|
|
|
50
142
|
@cache
|
|
51
|
-
def _default_format_opts() -> dict[
|
|
52
|
-
from docling.
|
|
53
|
-
|
|
54
|
-
|
|
143
|
+
def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
|
|
144
|
+
from docling.backend.docling_parse_backend import ( # noqa: PLC0415
|
|
145
|
+
DoclingParseDocumentBackend,
|
|
146
|
+
)
|
|
147
|
+
from docling.pipeline.standard_pdf_pipeline import ( # noqa: PLC0415
|
|
148
|
+
StandardPdfPipeline,
|
|
55
149
|
)
|
|
56
|
-
from docling.document_converter import PdfFormatOption # noqa: PLC0415
|
|
57
150
|
|
|
58
151
|
return {
|
|
59
|
-
InputFormat.PDF:
|
|
60
|
-
|
|
152
|
+
InputFormat.PDF: DoclingFormatOption(
|
|
153
|
+
pipeline_cls=StandardPdfPipeline,
|
|
154
|
+
backend=DoclingParseDocumentBackend,
|
|
155
|
+
pipeline_options=ThreadedPdfPipelineOptions(
|
|
61
156
|
ocr_options=EasyOcrOptions(), generate_picture_images=True
|
|
62
|
-
)
|
|
157
|
+
),
|
|
63
158
|
),
|
|
64
159
|
}
|
|
65
160
|
|
|
66
161
|
|
|
67
|
-
T = TypeVar("T")
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def _find_subcls(cls: type[T], name: str) -> type[T]:
|
|
71
|
-
for c in all_subclasses(cls):
|
|
72
|
-
if c.__name__ == name:
|
|
73
|
-
return c
|
|
74
|
-
raise ValueError(f"unknown {cls.__name__} subclass {name}")
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
@PipelineConfig.register()
|
|
78
162
|
class DoclingPipelineConfig(PipelineConfig):
|
|
79
|
-
pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
|
|
80
|
-
task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
|
|
163
|
+
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
|
|
81
164
|
|
|
82
|
-
format_options:
|
|
83
|
-
|
|
84
|
-
|
|
165
|
+
format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
|
|
166
|
+
default_factory=_default_format_opts
|
|
167
|
+
)
|
|
85
168
|
|
|
86
169
|
@classmethod
|
|
87
170
|
@cache
|
|
88
171
|
def supported_exts(cls) -> set[SupportedExt]:
|
|
89
|
-
from docling.datamodel.base_models import ( # noqa: PLC0415
|
|
90
|
-
FormatToExtensions,
|
|
91
|
-
InputFormat,
|
|
92
|
-
)
|
|
93
|
-
|
|
94
172
|
unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
|
|
95
173
|
supported = set()
|
|
96
174
|
for f in InputFormat:
|
|
@@ -106,7 +184,6 @@ class DoclingPipeline(Pipeline):
|
|
|
106
184
|
def __init__(
|
|
107
185
|
self, format_options: dict["InputFormat", "FormatOption"] | None = None
|
|
108
186
|
):
|
|
109
|
-
from docling.document_converter import DocumentConverter # noqa: PLC0415
|
|
110
187
|
|
|
111
188
|
allowed_format = [
|
|
112
189
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
@@ -134,7 +211,7 @@ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
|
|
134
211
|
|
|
135
212
|
|
|
136
213
|
def _to_result(
|
|
137
|
-
res:
|
|
214
|
+
res: ConversionResult,
|
|
138
215
|
input_document: InputDoc,
|
|
139
216
|
output_format: OutputFormat,
|
|
140
217
|
output_path: Path,
|
|
@@ -155,13 +232,11 @@ def _to_result(
|
|
|
155
232
|
|
|
156
233
|
|
|
157
234
|
def _to_markdown_doc(
|
|
158
|
-
res:
|
|
235
|
+
res: ConversionResult,
|
|
159
236
|
output_path: Path,
|
|
160
237
|
page_sep: str = DEFAULT_MD_PAGE_SEP,
|
|
161
238
|
**kwargs,
|
|
162
239
|
) -> MarkdownDoc:
|
|
163
|
-
from docling_core.types.doc import ImageRefMode # noqa: PLC0415
|
|
164
|
-
|
|
165
240
|
# TODO: Should we add a hash to avoid collisions between files with the same name
|
|
166
241
|
# nested in the tree structure?
|
|
167
242
|
md_dir_name = path_to_artifacts_dirname(res.input.file)
|
extract_python/marker_.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
|
|
|
7
7
|
|
|
8
8
|
from pydantic import Field
|
|
9
9
|
|
|
10
|
-
from .constants import ARTIFACTS
|
|
10
|
+
from .constants import ARTIFACTS
|
|
11
11
|
from .objects import (
|
|
12
12
|
InputDoc,
|
|
13
13
|
MarkdownDoc,
|
|
@@ -25,10 +25,8 @@ if TYPE_CHECKING:
|
|
|
25
25
|
from PIL import Image
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
@PipelineConfig.register()
|
|
29
28
|
class MarkerPipelineConfig(PipelineConfig):
|
|
30
|
-
pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
|
|
31
|
-
task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
|
|
29
|
+
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
|
|
32
30
|
|
|
33
31
|
config: dict[str, Any] = dict()
|
|
34
32
|
|
extract_python/miner_u.py
CHANGED
|
@@ -11,7 +11,7 @@ from typing import Any, ClassVar, Self
|
|
|
11
11
|
from pydantic import Field
|
|
12
12
|
from pydantic_extra_types.language_code import LanguageAlpha2
|
|
13
13
|
|
|
14
|
-
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
14
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
15
15
|
from .objects import (
|
|
16
16
|
BaseModel,
|
|
17
17
|
ConversionOutput,
|
|
@@ -74,10 +74,8 @@ class MinerUConfig(BaseModel):
|
|
|
74
74
|
}
|
|
75
75
|
|
|
76
76
|
|
|
77
|
-
@PipelineConfig.register() # noqa: F821
|
|
78
77
|
class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
79
|
-
pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
|
|
80
|
-
task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
|
|
78
|
+
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
|
|
81
79
|
|
|
82
80
|
config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
|
|
83
81
|
language: LanguageAlpha2 = Field(frozen=True, default="en")
|
extract_python/pipeline.py
CHANGED
|
@@ -24,9 +24,7 @@ class PipelineConfig(RegistrableConfig, ABC):
|
|
|
24
24
|
model_config = merge_configs(icij_config(), no_enum_values_config())
|
|
25
25
|
|
|
26
26
|
registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
|
|
27
|
-
pipeline: PipelineType
|
|
28
|
-
|
|
29
|
-
task_group: ClassVar[str] = Field(frozen=True)
|
|
27
|
+
pipeline: ClassVar[PipelineType]
|
|
30
28
|
|
|
31
29
|
@classmethod
|
|
32
30
|
@abstractmethod
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
+
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
+
extract_python/docling_.py,sha256=ys2vK4zgpWsPObIZWRFhHM4fNkojMYUa9QRevl8bd3c,9342
|
|
4
|
+
extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
|
|
5
|
+
extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
|
|
6
|
+
extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
|
|
7
|
+
extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
|
|
8
|
+
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
+
extract_python-0.4.1.dist-info/METADATA,sha256=tjxWkMOJ4mhT6eF-HmZmJl_HJgNT2fluq2sZUPWfE7o,1132
|
|
10
|
+
extract_python-0.4.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
extract_python-0.4.1.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
-
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
-
extract_python/docling_.py,sha256=JD5lLFSRo6KC7LMF6rH2MVNJaQAwsVwzFd_WIRQhEWQ,7112
|
|
4
|
-
extract_python/marker_.py,sha256=GM1GB0gp8TkeyPGn7S5tCKkfEqcQdKjIu1CtYs2zt2g,5112
|
|
5
|
-
extract_python/miner_u.py,sha256=i7JKcoKvU3G_fB_0ffsTaLdRYAPvuK6zwohgjOVIBTY,8127
|
|
6
|
-
extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
|
|
7
|
-
extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
|
|
8
|
-
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
-
extract_python-0.3.2.dist-info/METADATA,sha256=BbUayvHGHkr9HZ-Pq1iUcxvtEq7QSZjCWTYS-iiWOWg,1132
|
|
10
|
-
extract_python-0.3.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
-
extract_python-0.3.2.dist-info/RECORD,,
|
|
File without changes
|