extract-python 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/docling_.py +37 -6
- extract_python/objects.py +1 -0
- {extract_python-0.4.0.dist-info → extract_python-0.4.2.dist-info}/METADATA +1 -1
- {extract_python-0.4.0.dist-info → extract_python-0.4.2.dist-info}/RECORD +5 -5
- {extract_python-0.4.0.dist-info → extract_python-0.4.2.dist-info}/WHEEL +0 -0
extract_python/docling_.py
CHANGED
|
@@ -7,7 +7,7 @@ from pathlib import Path
|
|
|
7
7
|
from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
|
|
8
8
|
|
|
9
9
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
10
|
-
from docling.datamodel.backend_options import BackendOptions
|
|
10
|
+
from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
|
|
11
11
|
|
|
12
12
|
# Data model import are quick it's ok to leave it there
|
|
13
13
|
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
@@ -26,7 +26,15 @@ from docling_core.types.doc import ImageRefMode
|
|
|
26
26
|
from docling_core.types.io import DocumentStream
|
|
27
27
|
from icij_common.pydantic_utils import to_lower_snake_case
|
|
28
28
|
from icij_common.registrable import FromConfig
|
|
29
|
-
from pydantic import
|
|
29
|
+
from pydantic import (
|
|
30
|
+
AfterValidator,
|
|
31
|
+
BeforeValidator,
|
|
32
|
+
Field,
|
|
33
|
+
PlainSerializer,
|
|
34
|
+
WrapSerializer,
|
|
35
|
+
model_validator,
|
|
36
|
+
)
|
|
37
|
+
from pydantic_core.core_schema import SerializerFunctionWrapHandler
|
|
30
38
|
|
|
31
39
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
32
40
|
from .objects import (
|
|
@@ -73,7 +81,7 @@ def _find_subcls(cls: type[T], name: str) -> type[T]:
|
|
|
73
81
|
|
|
74
82
|
def _find_init_arg_type(cls: type[Any], arg: str) -> type:
|
|
75
83
|
hints = get_type_hints(cls.__init__)
|
|
76
|
-
return hints[arg]
|
|
84
|
+
return hints[arg]
|
|
77
85
|
|
|
78
86
|
|
|
79
87
|
def _resolve_pipeline_cls(v: Any) -> Any:
|
|
@@ -82,6 +90,23 @@ def _resolve_pipeline_cls(v: Any) -> Any:
|
|
|
82
90
|
return v
|
|
83
91
|
|
|
84
92
|
|
|
93
|
+
def _ser_class_as_str(v: Any) -> Any:
|
|
94
|
+
if isinstance(v, type):
|
|
95
|
+
return v.__name__
|
|
96
|
+
return v
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _ser_with_backend_option_kind(
|
|
100
|
+
v: Any, handler: SerializerFunctionWrapHandler
|
|
101
|
+
) -> Any:
|
|
102
|
+
serialized = handler(v)
|
|
103
|
+
if isinstance(v, BaseBackendOptions):
|
|
104
|
+
kind = getattr(v, "kind", None)
|
|
105
|
+
if kind is not None:
|
|
106
|
+
serialized["kind"] = kind
|
|
107
|
+
return serialized
|
|
108
|
+
|
|
109
|
+
|
|
85
110
|
def _resolve_backend(v: Any) -> Any:
|
|
86
111
|
if isinstance(v, str):
|
|
87
112
|
return _find_subcls(AbstractDocumentBackend, v)
|
|
@@ -90,15 +115,21 @@ def _resolve_backend(v: Any) -> Any:
|
|
|
90
115
|
|
|
91
116
|
class DoclingFormatOption(FormatOption):
|
|
92
117
|
pipeline_cls: Annotated[
|
|
93
|
-
str | type[BasePipeline],
|
|
118
|
+
str | type[BasePipeline],
|
|
119
|
+
BeforeValidator(_resolve_pipeline_cls),
|
|
120
|
+
PlainSerializer(_ser_class_as_str),
|
|
94
121
|
]
|
|
95
122
|
pipeline_options: Annotated[
|
|
96
123
|
dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
|
|
97
124
|
] = None
|
|
98
125
|
backend: Annotated[
|
|
99
|
-
str | type[AbstractDocumentBackend],
|
|
126
|
+
str | type[AbstractDocumentBackend],
|
|
127
|
+
BeforeValidator(_resolve_backend),
|
|
128
|
+
PlainSerializer(_ser_class_as_str),
|
|
100
129
|
]
|
|
101
|
-
backend_options:
|
|
130
|
+
backend_options: Annotated[
|
|
131
|
+
BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
|
|
132
|
+
] = None
|
|
102
133
|
|
|
103
134
|
@model_validator(mode="after")
|
|
104
135
|
def _resolve_pipeline_options(self) -> Self:
|
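extract_python/objects.py
CHANGED
|
(The +1 -0 hunk for extract_python/objects.py and the +1 -1 hunk for METADATA were not captured in this extract; the hunk below belongs to the RECORD file.)
|
{extract_python-0.4.0.dist-info → extract_python-0.4.2.dist-info}/RECORD
CHANGED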
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
2
|
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
-
extract_python/docling_.py,sha256=
|
|
3
|
+
extract_python/docling_.py,sha256=ys2vK4zgpWsPObIZWRFhHM4fNkojMYUa9QRevl8bd3c,9342
|
|
4
4
|
extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
|
|
5
5
|
extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
|
|
6
|
-
extract_python/objects.py,sha256=
|
|
6
|
+
extract_python/objects.py,sha256=MHCUZ9L8LVXlSlHyDMnbuWV1KHWMhUEJQMEDTc9hYD0,8761
|
|
7
7
|
extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
|
|
8
8
|
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
-
extract_python-0.4.
|
|
10
|
-
extract_python-0.4.
|
|
11
|
-
extract_python-0.4.
|
|
9
|
+
extract_python-0.4.2.dist-info/METADATA,sha256=95THYq0jZgY2-1X2s8hDoFEo9_aNeukdHPxlcd8_rmI,1132
|
|
10
|
+
extract_python-0.4.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
extract_python-0.4.2.dist-info/RECORD,,
|
|
File without changes
|