extract-python 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,34 @@
1
+ import importlib
1
2
  import shutil
2
3
  import tempfile
3
4
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
5
  from functools import cache
5
6
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
7
+ from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
8
+
9
+ from docling.backend.abstract_backend import AbstractDocumentBackend
10
+ from docling.datamodel.backend_options import BackendOptions
11
+
12
+ # Data model imports are quick, so it's OK to leave them here
13
+ from docling.datamodel.base_models import FormatToExtensions, InputFormat
14
+ from docling.datamodel.document import ConversionResult
15
+ from docling.datamodel.pipeline_options import (
16
+ EasyOcrOptions,
17
+ PdfPipelineOptions,
18
+ PipelineOptions,
19
+ ThreadedPdfPipelineOptions,
20
+ )
21
+ from docling.document_converter import DocumentConverter, FormatOption
22
+ from docling.pipeline.base_pipeline import BasePipeline
7
23
 
24
+ # TODO: this is slow to load; improve it
25
+ from docling_core.types.doc import ImageRefMode
26
+ from docling_core.types.io import DocumentStream
27
+ from icij_common.pydantic_utils import to_lower_snake_case
8
28
  from icij_common.registrable import FromConfig
9
- from pydantic import AfterValidator, Field
29
+ from pydantic import AfterValidator, BeforeValidator, Field, model_validator
10
30
 
11
- from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
31
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
12
32
  from .objects import (
13
33
  Error,
14
34
  InputDoc,
@@ -24,73 +44,100 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
24
44
 
25
45
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
26
46
 
27
- if TYPE_CHECKING:
28
- from docling.datamodel.base_models import InputFormat
29
- from docling.datamodel.pipeline_options import PipelineOptions
30
- from docling.document_converter import ConversionResult, FormatOption
31
- from docling_core.types.io import DocumentStream
32
-
33
-
34
- def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
35
- from docling.datamodel.pipeline_options import PdfPipelineOptions # noqa: PLC0415
36
47
 
37
- if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
48
+ def _validate_pipeline_opts(v: "PipelineOptions") -> None:
49
+ if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
38
50
  msg = "generate_picture_images should be set to true"
39
51
  raise ValueError(msg)
52
+ return v
53
+
54
+
55
+ T = TypeVar("T")
56
+
57
+
58
+ def _find_subcls(cls: type[T], name: str) -> type[T]:
59
+ # Check whether the class is available
60
+ for c in all_subclasses(cls):
61
+ if c.__name__ == name:
62
+ return c
63
+ # Then apply ad-hoc search
64
+ if "pipeline" in cls.__name__.lower():
65
+ module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
66
+ try:
67
+ module = importlib.import_module(module_name)
68
+ return getattr(module, name)
69
+ except (ModuleNotFoundError, AttributeError):
70
+ pass
71
+ raise ValueError(f"unknown {cls.__name__} subclass {name}")
72
+
73
+
74
+ def _find_init_arg_type(cls: type[Any], arg: str) -> type:
75
+ hints = get_type_hints(cls.__init__)
76
+ return hints[arg].__class__
77
+
78
+
79
+ def _resolve_pipeline_cls(v: Any) -> Any:
80
+ if isinstance(v, str):
81
+ return _find_subcls(BasePipeline, v)
82
+ return v
83
+
40
84
 
85
+ def _resolve_backend(v: Any) -> Any:
86
+ if isinstance(v, str):
87
+ return _find_subcls(AbstractDocumentBackend, v)
88
+ return v
41
89
 
42
- def _validate_options(
43
- data: dict["InputFormat", "FormatOption"],
44
- ) -> dict["InputFormat", "FormatOption"]:
45
- for opts in data.values():
46
- _validate_pipeline_opts(opts.pipeline_options)
47
- return data
90
+
91
+ class DoclingFormatOption(FormatOption):
92
+ pipeline_cls: Annotated[
93
+ str | type[BasePipeline], BeforeValidator(_resolve_pipeline_cls)
94
+ ]
95
+ pipeline_options: Annotated[
96
+ dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
97
+ ] = None
98
+ backend: Annotated[
99
+ str | type[AbstractDocumentBackend], BeforeValidator(_resolve_backend)
100
+ ]
101
+ backend_options: BackendOptions | None = None
102
+
103
+ @model_validator(mode="after")
104
+ def _resolve_pipeline_options(self) -> Self:
105
+ if isinstance(self.pipeline_options, dict):
106
+ option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
107
+ self.pipeline_options = option_cls.model_validate(self.pipeline_options)
108
+ return self
48
109
 
49
110
 
50
111
  @cache
51
- def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
52
- from docling.datamodel.pipeline_options import ( # noqa: PLC0415
53
- EasyOcrOptions,
54
- PdfPipelineOptions,
112
+ def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
113
+ from docling.backend.docling_parse_backend import ( # noqa: PLC0415
114
+ DoclingParseDocumentBackend,
115
+ )
116
+ from docling.pipeline.standard_pdf_pipeline import ( # noqa: PLC0415
117
+ StandardPdfPipeline,
55
118
  )
56
- from docling.document_converter import PdfFormatOption # noqa: PLC0415
57
119
 
58
120
  return {
59
- InputFormat.PDF: PdfFormatOption(
60
- pipeline_options=PdfPipelineOptions(
121
+ InputFormat.PDF: DoclingFormatOption(
122
+ pipeline_cls=StandardPdfPipeline,
123
+ backend=DoclingParseDocumentBackend,
124
+ pipeline_options=ThreadedPdfPipelineOptions(
61
125
  ocr_options=EasyOcrOptions(), generate_picture_images=True
62
- )
126
+ ),
63
127
  ),
64
128
  }
65
129
 
66
130
 
67
- T = TypeVar("T")
68
-
69
-
70
- def _find_subcls(cls: type[T], name: str) -> type[T]:
71
- for c in all_subclasses(cls):
72
- if c.__name__ == name:
73
- return c
74
- raise ValueError(f"unknown {cls.__name__} subclass {name}")
75
-
76
-
77
- @PipelineConfig.register()
78
131
  class DoclingPipelineConfig(PipelineConfig):
79
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
80
- task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
132
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
81
133
 
82
- format_options: Annotated[
83
- dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
84
- ] = Field(default_factory=_default_format_opts)
134
+ format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
135
+ default_factory=_default_format_opts
136
+ )
85
137
 
86
138
  @classmethod
87
139
  @cache
88
140
  def supported_exts(cls) -> set[SupportedExt]:
89
- from docling.datamodel.base_models import ( # noqa: PLC0415
90
- FormatToExtensions,
91
- InputFormat,
92
- )
93
-
94
141
  unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
95
142
  supported = set()
96
143
  for f in InputFormat:
@@ -106,7 +153,6 @@ class DoclingPipeline(Pipeline):
106
153
  def __init__(
107
154
  self, format_options: dict["InputFormat", "FormatOption"] | None = None
108
155
  ):
109
- from docling.document_converter import DocumentConverter # noqa: PLC0415
110
156
 
111
157
  allowed_format = [
112
158
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
@@ -134,7 +180,7 @@ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
134
180
 
135
181
 
136
182
  def _to_result(
137
- res: "ConversionResult",
183
+ res: ConversionResult,
138
184
  input_document: InputDoc,
139
185
  output_format: OutputFormat,
140
186
  output_path: Path,
@@ -155,13 +201,11 @@ def _to_result(
155
201
 
156
202
 
157
203
  def _to_markdown_doc(
158
- res: "ConversionResult",
204
+ res: ConversionResult,
159
205
  output_path: Path,
160
206
  page_sep: str = DEFAULT_MD_PAGE_SEP,
161
207
  **kwargs,
162
208
  ) -> MarkdownDoc:
163
- from docling_core.types.doc import ImageRefMode # noqa: PLC0415
164
-
165
209
  # TODO: Should we add a hash to avoid collision between files with same names
166
210
  # nested in the tree structure?
167
211
  md_dir_name = path_to_artifacts_dirname(res.input.file)
extract_python/marker_.py CHANGED
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
7
 
8
8
  from pydantic import Field
9
9
 
10
- from .constants import ARTIFACTS, CPU_GROUP
10
+ from .constants import ARTIFACTS
11
11
  from .objects import (
12
12
  InputDoc,
13
13
  MarkdownDoc,
@@ -25,10 +25,8 @@ if TYPE_CHECKING:
25
25
  from PIL import Image
26
26
 
27
27
 
28
- @PipelineConfig.register()
29
28
  class MarkerPipelineConfig(PipelineConfig):
30
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
31
- task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
29
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
32
30
 
33
31
  config: dict[str, Any] = dict()
34
32
 
extract_python/miner_u.py CHANGED
@@ -11,7 +11,7 @@ from typing import Any, ClassVar, Self
11
11
  from pydantic import Field
12
12
  from pydantic_extra_types.language_code import LanguageAlpha2
13
13
 
14
- from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP, MINER_U_GROUP
14
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
15
15
  from .objects import (
16
16
  BaseModel,
17
17
  ConversionOutput,
@@ -74,10 +74,8 @@ class MinerUConfig(BaseModel):
74
74
  }
75
75
 
76
76
 
77
- @PipelineConfig.register() # noqa: F821
78
77
  class MinerUPipelineConfig(PipelineConfig): # noqa: F821
79
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
80
- task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
78
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
81
79
 
82
80
  config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
83
81
  language: LanguageAlpha2 = Field(frozen=True, default="en")
@@ -24,9 +24,7 @@ class PipelineConfig(RegistrableConfig, ABC):
24
24
  model_config = merge_configs(icij_config(), no_enum_values_config())
25
25
 
26
26
  registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
27
- pipeline: PipelineType
28
-
29
- task_group: ClassVar[str] = Field(frozen=True)
27
+ pipeline: ClassVar[PipelineType]
30
28
 
31
29
  @classmethod
32
30
  @abstractmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.3.2
3
+ Version: 0.4.0
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=ZGlOVrgQw50bDh4B4DiRiRQSv5rGX-EFi8Z51mnAHpY,8620
4
+ extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
5
+ extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
6
+ extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
+ extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.4.0.dist-info/METADATA,sha256=_cFyQr6erjdP5CxXtFI9lbyMIDJ8fVuU2LM-h1oyv7k,1132
10
+ extract_python-0.4.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.4.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=JD5lLFSRo6KC7LMF6rH2MVNJaQAwsVwzFd_WIRQhEWQ,7112
4
- extract_python/marker_.py,sha256=GM1GB0gp8TkeyPGn7S5tCKkfEqcQdKjIu1CtYs2zt2g,5112
5
- extract_python/miner_u.py,sha256=i7JKcoKvU3G_fB_0ffsTaLdRYAPvuK6zwohgjOVIBTY,8127
6
- extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
- extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.3.2.dist-info/METADATA,sha256=BbUayvHGHkr9HZ-Pq1iUcxvtEq7QSZjCWTYS-iiWOWg,1132
10
- extract_python-0.3.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.3.2.dist-info/RECORD,,