extract-python 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/docling_.py +97 -53
- extract_python/marker_.py +2 -4
- extract_python/miner_u.py +2 -4
- extract_python/pipeline.py +1 -3
- {extract_python-0.3.2.dist-info → extract_python-0.4.0.dist-info}/METADATA +1 -1
- extract_python-0.4.0.dist-info/RECORD +11 -0
- extract_python-0.3.2.dist-info/RECORD +0 -11
- {extract_python-0.3.2.dist-info → extract_python-0.4.0.dist-info}/WHEEL +0 -0
extract_python/docling_.py
CHANGED
|
@@ -1,14 +1,34 @@
|
|
|
1
|
+
import importlib
|
|
1
2
|
import shutil
|
|
2
3
|
import tempfile
|
|
3
4
|
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
5
|
from functools import cache
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from typing import
|
|
7
|
+
from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
|
|
8
|
+
|
|
9
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
10
|
+
from docling.datamodel.backend_options import BackendOptions
|
|
11
|
+
|
|
12
|
+
# Data model import are quick it's ok to leave it there
|
|
13
|
+
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
14
|
+
from docling.datamodel.document import ConversionResult
|
|
15
|
+
from docling.datamodel.pipeline_options import (
|
|
16
|
+
EasyOcrOptions,
|
|
17
|
+
PdfPipelineOptions,
|
|
18
|
+
PipelineOptions,
|
|
19
|
+
ThreadedPdfPipelineOptions,
|
|
20
|
+
)
|
|
21
|
+
from docling.document_converter import DocumentConverter, FormatOption
|
|
22
|
+
from docling.pipeline.base_pipeline import BasePipeline
|
|
7
23
|
|
|
24
|
+
# TODO: this is long to load improve it
|
|
25
|
+
from docling_core.types.doc import ImageRefMode
|
|
26
|
+
from docling_core.types.io import DocumentStream
|
|
27
|
+
from icij_common.pydantic_utils import to_lower_snake_case
|
|
8
28
|
from icij_common.registrable import FromConfig
|
|
9
|
-
from pydantic import AfterValidator, Field
|
|
29
|
+
from pydantic import AfterValidator, BeforeValidator, Field, model_validator
|
|
10
30
|
|
|
11
|
-
from .constants import ARTIFACTS,
|
|
31
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
12
32
|
from .objects import (
|
|
13
33
|
Error,
|
|
14
34
|
InputDoc,
|
|
@@ -24,73 +44,100 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
|
|
|
24
44
|
|
|
25
45
|
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
26
46
|
|
|
27
|
-
if TYPE_CHECKING:
|
|
28
|
-
from docling.datamodel.base_models import InputFormat
|
|
29
|
-
from docling.datamodel.pipeline_options import PipelineOptions
|
|
30
|
-
from docling.document_converter import ConversionResult, FormatOption
|
|
31
|
-
from docling_core.types.io import DocumentStream
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
|
|
35
|
-
from docling.datamodel.pipeline_options import PdfPipelineOptions # noqa: PLC0415
|
|
36
47
|
|
|
37
|
-
|
|
48
|
+
def _validate_pipeline_opts(v: "PipelineOptions") -> None:
|
|
49
|
+
if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
|
|
38
50
|
msg = "generate_picture_images should be set to true"
|
|
39
51
|
raise ValueError(msg)
|
|
52
|
+
return v
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
T = TypeVar("T")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _find_subcls(cls: type[T], name: str) -> type[T]:
|
|
59
|
+
# Check if the class available
|
|
60
|
+
for c in all_subclasses(cls):
|
|
61
|
+
if c.__name__ == name:
|
|
62
|
+
return c
|
|
63
|
+
# Then apply ad-hoc search
|
|
64
|
+
if "pipeline" in cls.__name__.lower():
|
|
65
|
+
module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
|
|
66
|
+
try:
|
|
67
|
+
module = importlib.import_module(module_name)
|
|
68
|
+
return getattr(module, name)
|
|
69
|
+
except (ModuleNotFoundError, AttributeError):
|
|
70
|
+
pass
|
|
71
|
+
raise ValueError(f"unknown {cls.__name__} subclass {name}")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _find_init_arg_type(cls: type[Any], arg: str) -> type:
|
|
75
|
+
hints = get_type_hints(cls.__init__)
|
|
76
|
+
return hints[arg].__class__
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _resolve_pipeline_cls(v: Any) -> Any:
|
|
80
|
+
if isinstance(v, str):
|
|
81
|
+
return _find_subcls(BasePipeline, v)
|
|
82
|
+
return v
|
|
83
|
+
|
|
40
84
|
|
|
85
|
+
def _resolve_backend(v: Any) -> Any:
|
|
86
|
+
if isinstance(v, str):
|
|
87
|
+
return _find_subcls(AbstractDocumentBackend, v)
|
|
88
|
+
return v
|
|
41
89
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
90
|
+
|
|
91
|
+
class DoclingFormatOption(FormatOption):
|
|
92
|
+
pipeline_cls: Annotated[
|
|
93
|
+
str | type[BasePipeline], BeforeValidator(_resolve_pipeline_cls)
|
|
94
|
+
]
|
|
95
|
+
pipeline_options: Annotated[
|
|
96
|
+
dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
|
|
97
|
+
] = None
|
|
98
|
+
backend: Annotated[
|
|
99
|
+
str | type[AbstractDocumentBackend], BeforeValidator(_resolve_backend)
|
|
100
|
+
]
|
|
101
|
+
backend_options: BackendOptions | None = None
|
|
102
|
+
|
|
103
|
+
@model_validator(mode="after")
|
|
104
|
+
def _resolve_pipeline_options(self) -> Self:
|
|
105
|
+
if isinstance(self.pipeline_options, dict):
|
|
106
|
+
option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
|
|
107
|
+
self.pipeline_options = option_cls.model_validate(self.pipeline_options)
|
|
108
|
+
return self
|
|
48
109
|
|
|
49
110
|
|
|
50
111
|
@cache
|
|
51
|
-
def _default_format_opts() -> dict[
|
|
52
|
-
from docling.
|
|
53
|
-
|
|
54
|
-
|
|
112
|
+
def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
|
|
113
|
+
from docling.backend.docling_parse_backend import ( # noqa: PLC0415
|
|
114
|
+
DoclingParseDocumentBackend,
|
|
115
|
+
)
|
|
116
|
+
from docling.pipeline.standard_pdf_pipeline import ( # noqa: PLC0415
|
|
117
|
+
StandardPdfPipeline,
|
|
55
118
|
)
|
|
56
|
-
from docling.document_converter import PdfFormatOption # noqa: PLC0415
|
|
57
119
|
|
|
58
120
|
return {
|
|
59
|
-
InputFormat.PDF:
|
|
60
|
-
|
|
121
|
+
InputFormat.PDF: DoclingFormatOption(
|
|
122
|
+
pipeline_cls=StandardPdfPipeline,
|
|
123
|
+
backend=DoclingParseDocumentBackend,
|
|
124
|
+
pipeline_options=ThreadedPdfPipelineOptions(
|
|
61
125
|
ocr_options=EasyOcrOptions(), generate_picture_images=True
|
|
62
|
-
)
|
|
126
|
+
),
|
|
63
127
|
),
|
|
64
128
|
}
|
|
65
129
|
|
|
66
130
|
|
|
67
|
-
T = TypeVar("T")
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def _find_subcls(cls: type[T], name: str) -> type[T]:
|
|
71
|
-
for c in all_subclasses(cls):
|
|
72
|
-
if c.__name__ == name:
|
|
73
|
-
return c
|
|
74
|
-
raise ValueError(f"unknown {cls.__name__} subclass {name}")
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
@PipelineConfig.register()
|
|
78
131
|
class DoclingPipelineConfig(PipelineConfig):
|
|
79
|
-
pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
|
|
80
|
-
task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
|
|
132
|
+
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
|
|
81
133
|
|
|
82
|
-
format_options:
|
|
83
|
-
|
|
84
|
-
|
|
134
|
+
format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
|
|
135
|
+
default_factory=_default_format_opts
|
|
136
|
+
)
|
|
85
137
|
|
|
86
138
|
@classmethod
|
|
87
139
|
@cache
|
|
88
140
|
def supported_exts(cls) -> set[SupportedExt]:
|
|
89
|
-
from docling.datamodel.base_models import ( # noqa: PLC0415
|
|
90
|
-
FormatToExtensions,
|
|
91
|
-
InputFormat,
|
|
92
|
-
)
|
|
93
|
-
|
|
94
141
|
unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
|
|
95
142
|
supported = set()
|
|
96
143
|
for f in InputFormat:
|
|
@@ -106,7 +153,6 @@ class DoclingPipeline(Pipeline):
|
|
|
106
153
|
def __init__(
|
|
107
154
|
self, format_options: dict["InputFormat", "FormatOption"] | None = None
|
|
108
155
|
):
|
|
109
|
-
from docling.document_converter import DocumentConverter # noqa: PLC0415
|
|
110
156
|
|
|
111
157
|
allowed_format = [
|
|
112
158
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
@@ -134,7 +180,7 @@ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
|
|
134
180
|
|
|
135
181
|
|
|
136
182
|
def _to_result(
|
|
137
|
-
res:
|
|
183
|
+
res: ConversionResult,
|
|
138
184
|
input_document: InputDoc,
|
|
139
185
|
output_format: OutputFormat,
|
|
140
186
|
output_path: Path,
|
|
@@ -155,13 +201,11 @@ def _to_result(
|
|
|
155
201
|
|
|
156
202
|
|
|
157
203
|
def _to_markdown_doc(
|
|
158
|
-
res:
|
|
204
|
+
res: ConversionResult,
|
|
159
205
|
output_path: Path,
|
|
160
206
|
page_sep: str = DEFAULT_MD_PAGE_SEP,
|
|
161
207
|
**kwargs,
|
|
162
208
|
) -> MarkdownDoc:
|
|
163
|
-
from docling_core.types.doc import ImageRefMode # noqa: PLC0415
|
|
164
|
-
|
|
165
209
|
# TODO: Should we add a hash to avoid collision between files with same names
|
|
166
210
|
# nested in the tree structured
|
|
167
211
|
md_dir_name = path_to_artifacts_dirname(res.input.file)
|
extract_python/marker_.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
|
|
|
7
7
|
|
|
8
8
|
from pydantic import Field
|
|
9
9
|
|
|
10
|
-
from .constants import ARTIFACTS
|
|
10
|
+
from .constants import ARTIFACTS
|
|
11
11
|
from .objects import (
|
|
12
12
|
InputDoc,
|
|
13
13
|
MarkdownDoc,
|
|
@@ -25,10 +25,8 @@ if TYPE_CHECKING:
|
|
|
25
25
|
from PIL import Image
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
@PipelineConfig.register()
|
|
29
28
|
class MarkerPipelineConfig(PipelineConfig):
|
|
30
|
-
pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
|
|
31
|
-
task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
|
|
29
|
+
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
|
|
32
30
|
|
|
33
31
|
config: dict[str, Any] = dict()
|
|
34
32
|
|
extract_python/miner_u.py
CHANGED
|
@@ -11,7 +11,7 @@ from typing import Any, ClassVar, Self
|
|
|
11
11
|
from pydantic import Field
|
|
12
12
|
from pydantic_extra_types.language_code import LanguageAlpha2
|
|
13
13
|
|
|
14
|
-
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
14
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
15
15
|
from .objects import (
|
|
16
16
|
BaseModel,
|
|
17
17
|
ConversionOutput,
|
|
@@ -74,10 +74,8 @@ class MinerUConfig(BaseModel):
|
|
|
74
74
|
}
|
|
75
75
|
|
|
76
76
|
|
|
77
|
-
@PipelineConfig.register() # noqa: F821
|
|
78
77
|
class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
79
|
-
pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
|
|
80
|
-
task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
|
|
78
|
+
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
|
|
81
79
|
|
|
82
80
|
config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
|
|
83
81
|
language: LanguageAlpha2 = Field(frozen=True, default="en")
|
extract_python/pipeline.py
CHANGED
|
@@ -24,9 +24,7 @@ class PipelineConfig(RegistrableConfig, ABC):
|
|
|
24
24
|
model_config = merge_configs(icij_config(), no_enum_values_config())
|
|
25
25
|
|
|
26
26
|
registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
|
|
27
|
-
pipeline: PipelineType
|
|
28
|
-
|
|
29
|
-
task_group: ClassVar[str] = Field(frozen=True)
|
|
27
|
+
pipeline: ClassVar[PipelineType]
|
|
30
28
|
|
|
31
29
|
@classmethod
|
|
32
30
|
@abstractmethod
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
+
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
+
extract_python/docling_.py,sha256=ZGlOVrgQw50bDh4B4DiRiRQSv5rGX-EFi8Z51mnAHpY,8620
|
|
4
|
+
extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
|
|
5
|
+
extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
|
|
6
|
+
extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
|
|
7
|
+
extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
|
|
8
|
+
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
+
extract_python-0.4.0.dist-info/METADATA,sha256=_cFyQr6erjdP5CxXtFI9lbyMIDJ8fVuU2LM-h1oyv7k,1132
|
|
10
|
+
extract_python-0.4.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
extract_python-0.4.0.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
-
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
-
extract_python/docling_.py,sha256=JD5lLFSRo6KC7LMF6rH2MVNJaQAwsVwzFd_WIRQhEWQ,7112
|
|
4
|
-
extract_python/marker_.py,sha256=GM1GB0gp8TkeyPGn7S5tCKkfEqcQdKjIu1CtYs2zt2g,5112
|
|
5
|
-
extract_python/miner_u.py,sha256=i7JKcoKvU3G_fB_0ffsTaLdRYAPvuK6zwohgjOVIBTY,8127
|
|
6
|
-
extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
|
|
7
|
-
extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
|
|
8
|
-
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
-
extract_python-0.3.2.dist-info/METADATA,sha256=BbUayvHGHkr9HZ-Pq1iUcxvtEq7QSZjCWTYS-iiWOWg,1132
|
|
10
|
-
extract_python-0.3.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
-
extract_python-0.3.2.dist-info/RECORD,,
|
|
File without changes
|