extract-python 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,34 @@
1
+ import importlib
1
2
  import shutil
2
3
  import tempfile
3
4
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
5
  from functools import cache
5
6
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
7
+ from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
8
+
9
+ from docling.backend.abstract_backend import AbstractDocumentBackend
10
+ from docling.datamodel.backend_options import BackendOptions
11
+
12
+ # Data model imports are quick, so it's OK to leave them here
13
+ from docling.datamodel.base_models import FormatToExtensions, InputFormat
14
+ from docling.datamodel.document import ConversionResult
15
+ from docling.datamodel.pipeline_options import (
16
+ EasyOcrOptions,
17
+ PdfPipelineOptions,
18
+ PipelineOptions,
19
+ ThreadedPdfPipelineOptions,
20
+ )
21
+ from docling.document_converter import DocumentConverter, FormatOption
22
+ from docling.pipeline.base_pipeline import BasePipeline
7
23
 
24
+ # TODO: this is slow to load; improve it
25
+ from docling_core.types.doc import ImageRefMode
26
+ from docling_core.types.io import DocumentStream
27
+ from icij_common.pydantic_utils import to_lower_snake_case
8
28
  from icij_common.registrable import FromConfig
9
- from pydantic import AfterValidator, Field
29
+ from pydantic import AfterValidator, BeforeValidator, Field, model_validator
10
30
 
11
- from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
31
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
12
32
  from .objects import (
13
33
  Error,
14
34
  InputDoc,
@@ -24,79 +44,104 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
24
44
 
25
45
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
26
46
 
27
- if TYPE_CHECKING:
28
- from docling.datamodel.base_models import InputFormat
29
- from docling.datamodel.pipeline_options import PipelineOptions
30
- from docling.document_converter import ConversionResult, FormatOption
31
- from docling_core.types.io import DocumentStream
32
-
33
-
34
- def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
35
- from docling.datamodel.pipeline_options import PdfPipelineOptions
36
47
 
37
- if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
48
+ def _validate_pipeline_opts(v: "PipelineOptions") -> None:
49
+ if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
38
50
  msg = "generate_picture_images should be set to true"
39
51
  raise ValueError(msg)
40
-
41
-
42
- def _validate_options(
43
- data: dict["InputFormat", "FormatOption"],
44
- ) -> dict["InputFormat", "FormatOption"]:
45
- for opts in data.values():
46
- _validate_pipeline_opts(opts.pipeline_options)
47
- return data
48
-
49
-
50
- @cache
51
- def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
52
- from docling.datamodel.pipeline_options import (
53
- EasyOcrOptions,
54
- PdfPipelineOptions,
55
- )
56
- from docling.document_converter import PdfFormatOption
57
-
58
- return {
59
- InputFormat.PDF: PdfFormatOption(
60
- pipeline_options=PdfPipelineOptions(
61
- ocr_options=EasyOcrOptions(), generate_picture_images=True
62
- )
63
- ),
64
- }
52
+ return v
65
53
 
66
54
 
67
55
  T = TypeVar("T")
68
56
 
69
57
 
70
58
  def _find_subcls(cls: type[T], name: str) -> type[T]:
59
+ # Check if the class is already available
71
60
  for c in all_subclasses(cls):
72
61
  if c.__name__ == name:
73
62
  return c
63
+ # Then apply ad-hoc search
64
+ if "pipeline" in cls.__name__.lower():
65
+ module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
66
+ try:
67
+ module = importlib.import_module(module_name)
68
+ return getattr(module, name)
69
+ except (ModuleNotFoundError, AttributeError):
70
+ pass
74
71
  raise ValueError(f"unknown {cls.__name__} subclass {name}")
75
72
 
76
73
 
77
- @PipelineConfig.register()
78
- class DoclingPipelineConfig(PipelineConfig):
79
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
80
- task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
74
+ def _find_init_arg_type(cls: type[Any], arg: str) -> type:
75
+ hints = get_type_hints(cls.__init__)
76
+ return hints[arg].__class__
77
+
78
+
79
+ def _resolve_pipeline_cls(v: Any) -> Any:
80
+ if isinstance(v, str):
81
+ return _find_subcls(BasePipeline, v)
82
+ return v
83
+
81
84
 
82
- format_options: Annotated[
83
- dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
84
- ] = Field(default_factory=_default_format_opts)
85
+ def _resolve_backend(v: Any) -> Any:
86
+ if isinstance(v, str):
87
+ return _find_subcls(AbstractDocumentBackend, v)
88
+ return v
85
89
 
86
- _unsupported_input_formats: ClassVar[set[InputFormat]] = {
87
- InputFormat.AUDIO,
88
- InputFormat.METS_GBS,
89
- InputFormat.VTT,
90
+
91
+ class DoclingFormatOption(FormatOption):
92
+ pipeline_cls: Annotated[
93
+ str | type[BasePipeline], BeforeValidator(_resolve_pipeline_cls)
94
+ ]
95
+ pipeline_options: Annotated[
96
+ dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
97
+ ] = None
98
+ backend: Annotated[
99
+ str | type[AbstractDocumentBackend], BeforeValidator(_resolve_backend)
100
+ ]
101
+ backend_options: BackendOptions | None = None
102
+
103
+ @model_validator(mode="after")
104
+ def _resolve_pipeline_options(self) -> Self:
105
+ if isinstance(self.pipeline_options, dict):
106
+ option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
107
+ self.pipeline_options = option_cls.model_validate(self.pipeline_options)
108
+ return self
109
+
110
+
111
+ @cache
112
+ def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
113
+ from docling.backend.docling_parse_backend import ( # noqa: PLC0415
114
+ DoclingParseDocumentBackend,
115
+ )
116
+ from docling.pipeline.standard_pdf_pipeline import ( # noqa: PLC0415
117
+ StandardPdfPipeline,
118
+ )
119
+
120
+ return {
121
+ InputFormat.PDF: DoclingFormatOption(
122
+ pipeline_cls=StandardPdfPipeline,
123
+ backend=DoclingParseDocumentBackend,
124
+ pipeline_options=ThreadedPdfPipelineOptions(
125
+ ocr_options=EasyOcrOptions(), generate_picture_images=True
126
+ ),
127
+ ),
90
128
  }
91
129
 
130
+
131
+ class DoclingPipelineConfig(PipelineConfig):
132
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
133
+
134
+ format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
135
+ default_factory=_default_format_opts
136
+ )
137
+
92
138
  @classmethod
93
139
  @cache
94
140
  def supported_exts(cls) -> set[SupportedExt]:
95
- from docling.datamodel.base_models import FormatToExtensions, InputFormat
96
-
141
+ unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
97
142
  supported = set()
98
143
  for f in InputFormat:
99
- if f in cls._unsupported_input_formats:
144
+ if f in unsupported:
100
145
  continue
101
146
  for ext in FormatToExtensions[f]:
102
147
  supported.add(SupportedExt(f".{ext.lower()}"))
@@ -105,8 +150,9 @@ class DoclingPipelineConfig(PipelineConfig):
105
150
 
106
151
  @Pipeline.register(PipelineType.DOCLING)
107
152
  class DoclingPipeline(Pipeline):
108
- def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
109
- from docling.document_converter import DocumentConverter
153
+ def __init__(
154
+ self, format_options: dict["InputFormat", "FormatOption"] | None = None
155
+ ):
110
156
 
111
157
  allowed_format = [
112
158
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
@@ -128,13 +174,13 @@ class DoclingPipeline(Pipeline):
128
174
  return cls(config.format_options)
129
175
 
130
176
 
131
- def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | "DocumentStream"]:
177
+ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
132
178
  for d in docs:
133
179
  yield d.to_docling()
134
180
 
135
181
 
136
182
  def _to_result(
137
- res: "ConversionResult",
183
+ res: ConversionResult,
138
184
  input_document: InputDoc,
139
185
  output_format: OutputFormat,
140
186
  output_path: Path,
@@ -155,13 +201,11 @@ def _to_result(
155
201
 
156
202
 
157
203
  def _to_markdown_doc(
158
- res: "ConversionResult",
204
+ res: ConversionResult,
159
205
  output_path: Path,
160
206
  page_sep: str = DEFAULT_MD_PAGE_SEP,
161
207
  **kwargs,
162
208
  ) -> MarkdownDoc:
163
- from docling_core.types.doc import ImageRefMode
164
-
165
209
  # TODO: Should we add a hash to avoid collision between files with same names
166
210
  # nested in the tree structure?
167
211
  md_dir_name = path_to_artifacts_dirname(res.input.file)
extract_python/marker_.py CHANGED
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
7
 
8
8
  from pydantic import Field
9
9
 
10
- from .constants import ARTIFACTS, CPU_GROUP
10
+ from .constants import ARTIFACTS
11
11
  from .objects import (
12
12
  InputDoc,
13
13
  MarkdownDoc,
@@ -25,10 +25,8 @@ if TYPE_CHECKING:
25
25
  from PIL import Image
26
26
 
27
27
 
28
- @PipelineConfig.register()
29
28
  class MarkerPipelineConfig(PipelineConfig):
30
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
31
- task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
29
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
32
30
 
33
31
  config: dict[str, Any] = dict()
34
32
 
@@ -73,9 +71,9 @@ class MarkerPipeline(Pipeline):
73
71
  async def extract_content(
74
72
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
75
73
  ) -> AsyncGenerator[Result, None]:
76
- from marker.config.parser import ConfigParser
77
- from marker.converters.pdf import PdfConverter
78
- from marker.models import create_model_dict
74
+ from marker.config.parser import ConfigParser # noqa: PLC0415
75
+ from marker.converters.pdf import PdfConverter # noqa: PLC0415
76
+ from marker.models import create_model_dict # noqa: PLC0415
79
77
 
80
78
  config = deepcopy(self._marker_config)
81
79
  config["output_format"] = output_format.to_marker()
@@ -102,7 +100,7 @@ def _process_doc(
102
100
  output_format: OutputFormat,
103
101
  output_path: Path,
104
102
  ) -> Result:
105
- from marker.output import text_from_rendered
103
+ from marker.output import text_from_rendered # noqa: PLC0415
106
104
 
107
105
  rendered = converter(str(doc.path))
108
106
  content, _, images = text_from_rendered(rendered)
@@ -118,7 +116,7 @@ def _process_doc(
118
116
  def _to_markdown_doc(
119
117
  input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
120
118
  ) -> MarkdownDoc:
121
- from marker.renderers.markdown import MarkdownRenderer
119
+ from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
122
120
 
123
121
  # TODO: Should we add a hash to avoid collision between files with same names
124
122
  # nested in the tree structure?
extract_python/miner_u.py CHANGED
@@ -11,7 +11,7 @@ from typing import Any, ClassVar, Self
11
11
  from pydantic import Field
12
12
  from pydantic_extra_types.language_code import LanguageAlpha2
13
13
 
14
- from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP, MINER_U_GROUP
14
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
15
15
  from .objects import (
16
16
  BaseModel,
17
17
  ConversionOutput,
@@ -52,13 +52,12 @@ class MinerUConfig(BaseModel):
52
52
  @classmethod
53
53
  @cache
54
54
  def _get_default_kwargs(cls) -> dict[str, Any]:
55
-
56
- from mineru.utils.enum_class import MakeMode
55
+ from mineru.utils.enum_class import MakeMode # noqa: PLC0415
57
56
 
58
57
  return {
59
58
  "server_url": None,
60
- # We don't dump md directly we process, we dump the middle json in order to be
61
- # able to get page indexes
59
+ # We don't dump md directly we process, we dump the middle json in order
60
+ # to be able to get page indexes
62
61
  "parse_method": "auto",
63
62
  "dump_md": False,
64
63
  "dump_middle_json": True,
@@ -75,10 +74,8 @@ class MinerUConfig(BaseModel):
75
74
  }
76
75
 
77
76
 
78
- @PipelineConfig.register() # noqa: F821
79
77
  class MinerUPipelineConfig(PipelineConfig): # noqa: F821
80
- pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
81
- task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
78
+ pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
82
79
 
83
80
  config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
84
81
  language: LanguageAlpha2 = Field(frozen=True, default="en")
@@ -104,7 +101,7 @@ class MinerUPipeline(Pipeline):
104
101
  async def extract_content(
105
102
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
106
103
  ) -> AsyncGenerator[Result, None]:
107
- from mineru.cli.common import aio_do_parse
104
+ from mineru.cli.common import aio_do_parse # noqa: PLC0415
108
105
 
109
106
  docs = list(docs)
110
107
  # TODO: exclude files which are not pdf and return an error
@@ -154,13 +151,15 @@ def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
154
151
 
155
152
  match backend:
156
153
  case MinerUBackend.PIPELINE:
157
- from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
154
+ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import ( # noqa: PLC0415
158
155
  union_make,
159
156
  )
160
157
 
161
158
  return union_make
162
159
  case MinerUBackend.VLM:
163
- from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
160
+ from mineru.backend.vlm.vlm_middle_json_mkcontent import ( # noqa: PLC0415
161
+ union_make,
162
+ )
164
163
 
165
164
  return union_make
166
165
  case _:
@@ -212,7 +211,7 @@ def _dump_md_content(
212
211
  im_dir: Path,
213
212
  md_make_mode: str | None = None,
214
213
  ) -> ConversionOutput:
215
- from mineru.utils.enum_class import MakeMode
214
+ from mineru.utils.enum_class import MakeMode # noqa: PLC0415
216
215
 
217
216
  if md_make_mode is None:
218
217
  md_make_mode = MakeMode.MM_MD
@@ -24,9 +24,7 @@ class PipelineConfig(RegistrableConfig, ABC):
24
24
  model_config = merge_configs(icij_config(), no_enum_values_config())
25
25
 
26
26
  registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
27
- pipeline: PipelineType
28
-
29
- task_group: ClassVar[str] = Field(frozen=True)
27
+ pipeline: ClassVar[PipelineType]
30
28
 
31
29
  @classmethod
32
30
  @abstractmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.3.1
3
+ Version: 0.4.0
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=ZGlOVrgQw50bDh4B4DiRiRQSv5rGX-EFi8Z51mnAHpY,8620
4
+ extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
5
+ extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
6
+ extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
+ extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.4.0.dist-info/METADATA,sha256=_cFyQr6erjdP5CxXtFI9lbyMIDJ8fVuU2LM-h1oyv7k,1132
10
+ extract_python-0.4.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.4.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=lWWQ2PT5qOFUcJkeKw8ibF4JxzxQBgf93_CfvNcykDg,7041
4
- extract_python/marker_.py,sha256=ocRFxWX__A-M31z7Qr67OMcWRvgGO_C3tyZpiKc-bXw,5027
5
- extract_python/miner_u.py,sha256=hwRFTvtWGN_mRuv0p6H7nKS89dTErQxI1yOrvh6238M,8010
6
- extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
- extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.3.1.dist-info/METADATA,sha256=qtfZpwEIKgWzkfbxGYMVP-pNFMFAbLrZo1-hmDXcgvE,1132
10
- extract_python-0.3.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.3.1.dist-info/RECORD,,