extract-python 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,42 @@
1
+ import importlib
1
2
  import shutil
2
3
  import tempfile
3
4
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
5
  from functools import cache
5
6
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
7
+ from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
8
+
9
+ from docling.backend.abstract_backend import AbstractDocumentBackend
10
+ from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
11
+
12
+ # Data model imports are quick, so it's OK to leave them here
13
+ from docling.datamodel.base_models import FormatToExtensions, InputFormat
14
+ from docling.datamodel.document import ConversionResult
15
+ from docling.datamodel.pipeline_options import (
16
+ EasyOcrOptions,
17
+ PdfPipelineOptions,
18
+ PipelineOptions,
19
+ ThreadedPdfPipelineOptions,
20
+ )
21
+ from docling.document_converter import DocumentConverter, FormatOption
22
+ from docling.pipeline.base_pipeline import BasePipeline
7
23
 
24
+ # TODO: this is slow to load; improve it
25
+ from docling_core.types.doc import ImageRefMode
26
+ from docling_core.types.io import DocumentStream
27
+ from icij_common.pydantic_utils import to_lower_snake_case
8
28
  from icij_common.registrable import FromConfig
9
- from pydantic import AfterValidator, Field
29
+ from pydantic import (
30
+ AfterValidator,
31
+ BeforeValidator,
32
+ Field,
33
+ PlainSerializer,
34
+ WrapSerializer,
35
+ model_validator,
36
+ )
37
+ from pydantic_core.core_schema import SerializerFunctionWrapHandler
10
38
 
11
- from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
39
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
12
40
  from .objects import (
13
41
  Error,
14
42
  InputDoc,
@@ -24,73 +52,123 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
24
52
 
25
53
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
26
54
 
27
- if TYPE_CHECKING:
28
- from docling.datamodel.base_models import InputFormat
29
- from docling.datamodel.pipeline_options import PipelineOptions
30
- from docling.document_converter import ConversionResult, FormatOption
31
- from docling_core.types.io import DocumentStream
32
-
33
55
 
34
- def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
35
- from docling.datamodel.pipeline_options import PdfPipelineOptions # noqa: PLC0415
36
-
37
- if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
56
+ def _validate_pipeline_opts(v: "PipelineOptions") -> None:
57
+ if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
38
58
  msg = "generate_picture_images should be set to true"
39
59
  raise ValueError(msg)
60
+ return v
61
+
62
+
63
+ T = TypeVar("T")
64
+
65
+
66
+ def _find_subcls(cls: type[T], name: str) -> type[T]:
67
+ # Check if the class is available
68
+ for c in all_subclasses(cls):
69
+ if c.__name__ == name:
70
+ return c
71
+ # Then apply ad-hoc search
72
+ if "pipeline" in cls.__name__.lower():
73
+ module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
74
+ try:
75
+ module = importlib.import_module(module_name)
76
+ return getattr(module, name)
77
+ except (ModuleNotFoundError, AttributeError):
78
+ pass
79
+ raise ValueError(f"unknown {cls.__name__} subclass {name}")
80
+
81
+
82
+ def _find_init_arg_type(cls: type[Any], arg: str) -> type:
83
+ hints = get_type_hints(cls.__init__)
84
+ return hints[arg]
85
+
86
+
87
+ def _resolve_pipeline_cls(v: Any) -> Any:
88
+ if isinstance(v, str):
89
+ return _find_subcls(BasePipeline, v)
90
+ return v
91
+
92
+
93
+ def _ser_class_as_str(v: Any) -> Any:
94
+ if isinstance(v, type):
95
+ return v.__name__
96
+ return v
40
97
 
41
98
 
42
- def _validate_options(
43
- data: dict["InputFormat", "FormatOption"],
44
- ) -> dict["InputFormat", "FormatOption"]:
45
- for opts in data.values():
46
- _validate_pipeline_opts(opts.pipeline_options)
47
- return data
99
+ def _ser_with_backend_option_kind(
100
+ v: Any, handler: SerializerFunctionWrapHandler
101
+ ) -> Any:
102
+ serialized = handler(v)
103
+ if isinstance(v, BaseBackendOptions):
104
+ kind = getattr(v, "kind", None)
105
+ if kind is not None:
106
+ serialized["kind"] = kind
107
+ return serialized
108
+
109
+
110
+ def _resolve_backend(v: Any) -> Any:
111
+ if isinstance(v, str):
112
+ return _find_subcls(AbstractDocumentBackend, v)
113
+ return v
114
+
115
+
116
+ class DoclingFormatOption(FormatOption):
117
+ pipeline_cls: Annotated[
118
+ str | type[BasePipeline],
119
+ BeforeValidator(_resolve_pipeline_cls),
120
+ PlainSerializer(_ser_class_as_str),
121
+ ]
122
+ pipeline_options: Annotated[
123
+ dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
124
+ ] = None
125
+ backend: Annotated[
126
+ str | type[AbstractDocumentBackend],
127
+ BeforeValidator(_resolve_backend),
128
+ PlainSerializer(_ser_class_as_str),
129
+ ]
130
+ backend_options: Annotated[
131
+ BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
132
+ ] = None
133
+
134
+ @model_validator(mode="after")
135
+ def _resolve_pipeline_options(self) -> Self:
136
+ if isinstance(self.pipeline_options, dict):
137
+ option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
138
+ self.pipeline_options = option_cls.model_validate(self.pipeline_options)
139
+ return self
48
140
 
49
141
 
50
142
  @cache
51
- def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
52
- from docling.datamodel.pipeline_options import ( # noqa: PLC0415
53
- EasyOcrOptions,
54
- PdfPipelineOptions,
143
+ def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
144
+ from docling.backend.docling_parse_backend import ( # noqa: PLC0415
145
+ DoclingParseDocumentBackend,
146
+ )
147
+ from docling.pipeline.standard_pdf_pipeline import ( # noqa: PLC0415
148
+ StandardPdfPipeline,
55
149
  )
56
- from docling.document_converter import PdfFormatOption # noqa: PLC0415
57
150
 
58
151
  return {
59
- InputFormat.PDF: PdfFormatOption(
60
- pipeline_options=PdfPipelineOptions(
152
+ InputFormat.PDF: DoclingFormatOption(
153
+ pipeline_cls=StandardPdfPipeline,
154
+ backend=DoclingParseDocumentBackend,
155
+ pipeline_options=ThreadedPdfPipelineOptions(
61
156
  ocr_options=EasyOcrOptions(), generate_picture_images=True
62
- )
157
+ ),
63
158
  ),
64
159
  }
65
160
 
66
161
 
67
- T = TypeVar("T")
68
-
69
-
70
- def _find_subcls(cls: type[T], name: str) -> type[T]:
71
- for c in all_subclasses(cls):
72
- if c.__name__ == name:
73
- return c
74
- raise ValueError(f"unknown {cls.__name__} subclass {name}")
75
-
76
-
77
- @PipelineConfig.register()
78
162
  class DoclingPipelineConfig(PipelineConfig):
79
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
80
- task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
163
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
81
164
 
82
- format_options: Annotated[
83
- dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
84
- ] = Field(default_factory=_default_format_opts)
165
+ format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
166
+ default_factory=_default_format_opts
167
+ )
85
168
 
86
169
  @classmethod
87
170
  @cache
88
171
  def supported_exts(cls) -> set[SupportedExt]:
89
- from docling.datamodel.base_models import ( # noqa: PLC0415
90
- FormatToExtensions,
91
- InputFormat,
92
- )
93
-
94
172
  unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
95
173
  supported = set()
96
174
  for f in InputFormat:
@@ -106,7 +184,6 @@ class DoclingPipeline(Pipeline):
106
184
  def __init__(
107
185
  self, format_options: dict["InputFormat", "FormatOption"] | None = None
108
186
  ):
109
- from docling.document_converter import DocumentConverter # noqa: PLC0415
110
187
 
111
188
  allowed_format = [
112
189
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
@@ -134,7 +211,7 @@ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
134
211
 
135
212
 
136
213
  def _to_result(
137
- res: "ConversionResult",
214
+ res: ConversionResult,
138
215
  input_document: InputDoc,
139
216
  output_format: OutputFormat,
140
217
  output_path: Path,
@@ -155,13 +232,11 @@ def _to_result(
155
232
 
156
233
 
157
234
  def _to_markdown_doc(
158
- res: "ConversionResult",
235
+ res: ConversionResult,
159
236
  output_path: Path,
160
237
  page_sep: str = DEFAULT_MD_PAGE_SEP,
161
238
  **kwargs,
162
239
  ) -> MarkdownDoc:
163
- from docling_core.types.doc import ImageRefMode # noqa: PLC0415
164
-
165
240
  # TODO: Should we add a hash to avoid collision between files with same names
166
241
  # nested in the tree structure?
167
242
  md_dir_name = path_to_artifacts_dirname(res.input.file)
extract_python/marker_.py CHANGED
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
7
 
8
8
  from pydantic import Field
9
9
 
10
- from .constants import ARTIFACTS, CPU_GROUP
10
+ from .constants import ARTIFACTS
11
11
  from .objects import (
12
12
  InputDoc,
13
13
  MarkdownDoc,
@@ -25,10 +25,8 @@ if TYPE_CHECKING:
25
25
  from PIL import Image
26
26
 
27
27
 
28
- @PipelineConfig.register()
29
28
  class MarkerPipelineConfig(PipelineConfig):
30
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
31
- task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
29
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
32
30
 
33
31
  config: dict[str, Any] = dict()
34
32
 
extract_python/miner_u.py CHANGED
@@ -11,7 +11,7 @@ from typing import Any, ClassVar, Self
11
11
  from pydantic import Field
12
12
  from pydantic_extra_types.language_code import LanguageAlpha2
13
13
 
14
- from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP, MINER_U_GROUP
14
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
15
15
  from .objects import (
16
16
  BaseModel,
17
17
  ConversionOutput,
@@ -74,10 +74,8 @@ class MinerUConfig(BaseModel):
74
74
  }
75
75
 
76
76
 
77
- @PipelineConfig.register() # noqa: F821
78
77
  class MinerUPipelineConfig(PipelineConfig): # noqa: F821
79
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
80
- task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
78
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
81
79
 
82
80
  config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
83
81
  language: LanguageAlpha2 = Field(frozen=True, default="en")
@@ -24,9 +24,7 @@ class PipelineConfig(RegistrableConfig, ABC):
24
24
  model_config = merge_configs(icij_config(), no_enum_values_config())
25
25
 
26
26
  registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
27
- pipeline: PipelineType
28
-
29
- task_group: ClassVar[str] = Field(frozen=True)
27
+ pipeline: ClassVar[PipelineType]
30
28
 
31
29
  @classmethod
32
30
  @abstractmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.3.2
3
+ Version: 0.4.1
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=ys2vK4zgpWsPObIZWRFhHM4fNkojMYUa9QRevl8bd3c,9342
4
+ extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
5
+ extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
6
+ extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
+ extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.4.1.dist-info/METADATA,sha256=tjxWkMOJ4mhT6eF-HmZmJl_HJgNT2fluq2sZUPWfE7o,1132
10
+ extract_python-0.4.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.4.1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=JD5lLFSRo6KC7LMF6rH2MVNJaQAwsVwzFd_WIRQhEWQ,7112
4
- extract_python/marker_.py,sha256=GM1GB0gp8TkeyPGn7S5tCKkfEqcQdKjIu1CtYs2zt2g,5112
5
- extract_python/miner_u.py,sha256=i7JKcoKvU3G_fB_0ffsTaLdRYAPvuK6zwohgjOVIBTY,8127
6
- extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
- extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.3.2.dist-info/METADATA,sha256=BbUayvHGHkr9HZ-Pq1iUcxvtEq7QSZjCWTYS-iiWOWg,1132
10
- extract_python-0.3.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.3.2.dist-info/RECORD,,