extract-python 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,28 +3,23 @@ import tempfile
3
3
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Any, ClassVar, Literal, TypeVar
6
+ from typing import Annotated, ClassVar, TypeVar
7
7
 
8
- from docling.backend.abstract_backend import AbstractDocumentBackend
9
- from docling.datamodel.base_models import InputFormat
8
+ from docling.datamodel.base_models import FormatToExtensions, InputFormat
10
9
  from docling.datamodel.document import ConversionResult
11
10
  from docling.datamodel.pipeline_options import (
12
11
  EasyOcrOptions,
13
12
  PdfPipelineOptions,
14
13
  PipelineOptions,
15
- VlmPipelineOptions,
16
14
  )
17
- from docling.document_converter import DocumentConverter, FormatOption
18
- from docling.models.factories import get_ocr_factory
19
- from docling.pipeline.base_pipeline import BasePipeline
15
+ from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
20
16
  from docling_core.types.doc import ImageRefMode
21
17
  from docling_core.types.io import DocumentStream
22
18
  from icij_common.registrable import FromConfig
23
- from pydantic import Field, model_validator
19
+ from pydantic import AfterValidator, Field
24
20
 
25
21
  from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
26
22
  from .objects import (
27
- BaseModel,
28
23
  Error,
29
24
  InputDoc,
30
25
  MarkdownDoc,
@@ -32,6 +27,7 @@ from .objects import (
32
27
  PageIndexes,
33
28
  Result,
34
29
  Status,
30
+ SupportedExt,
35
31
  )
36
32
  from .pipeline import Pipeline, PipelineConfig, PipelineType
37
33
  from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
@@ -39,73 +35,27 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
39
35
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
40
36
 
41
37
 
42
- class _PdfPipelineOptions(PdfPipelineOptions):
43
- generate_picture_images: bool = Field(default=True, frozen=True)
38
+ def _validate_pipeline_opts(opts: PipelineOptions) -> None:
39
+ if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
40
+ msg = "generate_picture_images should be set to true"
41
+ raise ValueError(msg)
44
42
 
45
- @model_validator(mode="before")
46
- @classmethod
47
- def validate_ocr_options(cls, data: Any) -> Any:
48
- if isinstance(data, dict):
49
- ocr_options = data.get("ocr_options")
50
- if not isinstance(ocr_options, dict):
51
- return data
52
- allow_external_plugins = ocr_options.get("allow_external_plugins", False)
53
- ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
54
- kind = ocr_options.pop("kind")
55
- data["ocr_options"] = ocr_factory.create_options(kind=kind, **ocr_options)
56
- return data
57
-
58
-
59
- OptionsByPipeline = list[
60
- tuple[Literal["pdf"], _PdfPipelineOptions]
61
- | tuple[Literal["vlm"], VlmPipelineOptions]
62
- ]
63
-
64
-
65
- def _default_pipeline_options() -> OptionsByPipeline:
66
- pipeline_options = _PdfPipelineOptions(ocr_options=EasyOcrOptions())
67
- return [("pdf", pipeline_options), ("vlm", VlmPipelineOptions())]
68
-
69
-
70
- class DoclingFormatOption(BaseModel):
71
- pipeline_cls: str
72
- backend_cls: str
73
-
74
- def to_docling(
75
- self, pipeline_options: dict[Literal["pdf", "vlm"], PipelineOptions]
76
- ) -> FormatOption:
77
- pipeline_cls = _find_subcls(BasePipeline, self.pipeline_cls)
78
- backend_cls = _find_subcls(AbstractDocumentBackend, self.backend_cls)
79
- if "vlm" in self.pipeline_cls.lower():
80
- pipeline_options = pipeline_options.get("vlm")
81
- if pipeline_options is not None:
82
- pipeline_options = VlmPipelineOptions.model_validate(pipeline_options)
83
- elif "pdf" in self.pipeline_cls.lower():
84
- pipeline_options = pipeline_options.get("pdf")
85
- if pipeline_options is not None:
86
- pipeline_options = _PdfPipelineOptions.model_validate(pipeline_options)
87
- else:
88
- raise ValueError(
89
- f"invalid pipeline_cls: {pipeline_cls}, expected a VLM or PDF pipeline"
90
- )
91
- return FormatOption(
92
- pipeline_cls=pipeline_cls,
93
- pipeline_options=pipeline_options,
94
- backend=backend_cls,
95
- )
96
43
 
44
+ def _validate_options(
45
+ data: dict[InputFormat, FormatOption],
46
+ ) -> dict[InputFormat, FormatOption]:
47
+ for opts in data.values():
48
+ _validate_pipeline_opts(opts.pipeline_options)
49
+ return data
97
50
 
98
- @cache
99
- def _default_format_options() -> dict[InputFormat, DoclingFormatOption]:
100
- supported_fmt = {InputFormat.PDF}
101
- return {
102
- fmt: DoclingFormatOption(
103
- pipeline_cls=opt.pipeline_cls.__name__, backend_cls=opt.backend.__name__
104
- )
105
- for fmt, opt in DocumentConverter().format_to_options.items()
106
- if fmt in supported_fmt
107
- }
108
51
 
52
+ _DEFAULT_FORMAT_OPTS = {
53
+ InputFormat.PDF: PdfFormatOption(
54
+ pipeline_options=PdfPipelineOptions(
55
+ ocr_options=EasyOcrOptions(), generate_picture_images=True
56
+ )
57
+ ),
58
+ }
109
59
 
110
60
  T = TypeVar("T")
111
61
 
@@ -122,30 +72,37 @@ class DoclingPipelineConfig(PipelineConfig):
122
72
  pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
123
73
  task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
124
74
 
125
- pipeline_options: OptionsByPipeline = Field(
126
- default_factory=_default_pipeline_options
127
- )
128
- format_options: dict[InputFormat, DoclingFormatOption] = Field(
129
- default_factory=_default_format_options
130
- )
131
-
132
- def to_format_options(self) -> dict[InputFormat, FormatOption]:
133
- pipeline_options = dict(self.pipeline_options)
134
- return {
135
- InputFormat(f): opt.to_docling(pipeline_options)
136
- for f, opt in self.format_options.items()
137
- }
75
+ format_options: Annotated[
76
+ dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
77
+ ] = _DEFAULT_FORMAT_OPTS
138
78
 
79
+ _unsupported_input_formats: ClassVar[set[InputFormat]] = {
80
+ InputFormat.AUDIO,
81
+ InputFormat.METS_GBS,
82
+ InputFormat.VTT,
83
+ }
139
84
 
140
- DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
85
+ @classmethod
86
+ @cache
87
+ def supported_exts(cls) -> set[SupportedExt]:
88
+ supported = set()
89
+ for f in InputFormat:
90
+ if f in cls._unsupported_input_formats:
91
+ continue
92
+ for ext in FormatToExtensions[f]:
93
+ supported.add(SupportedExt(f".{ext.lower()}"))
94
+ return supported
141
95
 
142
96
 
143
97
  @Pipeline.register(PipelineType.DOCLING)
144
98
  class DoclingPipeline(Pipeline):
145
99
  def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
146
- if format_options is None:
147
- format_options = DEFAULT_FORMAT_OPTIONS
148
- self._converter = DocumentConverter(format_options=format_options)
100
+ allowed_format = [
101
+ f.to_docling() for f in DoclingPipelineConfig.supported_exts()
102
+ ]
103
+ self._converter = DocumentConverter(
104
+ allowed_formats=allowed_format, format_options=format_options
105
+ )
149
106
 
150
107
  async def extract_content(
151
108
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
@@ -157,7 +114,7 @@ class DoclingPipeline(Pipeline):
157
114
 
158
115
  @classmethod
159
116
  def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
160
- return cls(config.to_format_options())
117
+ return cls(config.format_options)
161
118
 
162
119
 
163
120
  def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
extract_python/marker_.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import gc
2
2
  from collections.abc import AsyncGenerator, Iterable
3
3
  from copy import deepcopy
4
+ from functools import cache
4
5
  from pathlib import Path
5
6
  from typing import Any, ClassVar, Self
6
7
 
@@ -20,6 +21,7 @@ from .objects import (
20
21
  PageIndexes,
21
22
  Result,
22
23
  Status,
24
+ SupportedExt,
23
25
  )
24
26
  from .pipeline import Pipeline, PipelineConfig, PipelineType
25
27
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
@@ -32,6 +34,33 @@ class MarkerPipelineConfig(PipelineConfig):
32
34
 
33
35
  config: dict[str, Any] = dict()
34
36
 
37
+ @classmethod
38
+ @cache
39
+ def supported_exts(cls) -> set[SupportedExt]:
40
+ # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
41
+ return {
42
+ SupportedExt.PDF,
43
+ SupportedExt.XLS,
44
+ SupportedExt.XLSX,
45
+ SupportedExt.XLSM,
46
+ SupportedExt.CSV,
47
+ SupportedExt.ODS,
48
+ SupportedExt.DOC,
49
+ SupportedExt.DOCX,
50
+ SupportedExt.ODT,
51
+ SupportedExt.PPT,
52
+ SupportedExt.PPTX,
53
+ SupportedExt.ODP,
54
+ SupportedExt.HTLM,
55
+ SupportedExt.EPUB,
56
+ SupportedExt.PNG,
57
+ SupportedExt.JPG,
58
+ SupportedExt.JPEG,
59
+ SupportedExt.WEBP,
60
+ SupportedExt.GIF,
61
+ SupportedExt.TIFF,
62
+ }
63
+
35
64
 
36
65
  _MARKER_CONVERSION_ERRORS = tuple()
37
66
 
extract_python/miner_u.py CHANGED
@@ -3,7 +3,7 @@ import shutil
3
3
  from collections.abc import AsyncGenerator, Callable, Iterable
4
4
  from copy import copy
5
5
  from enum import StrEnum
6
- from functools import partial
6
+ from functools import cache, partial
7
7
  from pathlib import Path
8
8
  from tempfile import TemporaryDirectory
9
9
  from typing import Any, ClassVar, Self
@@ -26,6 +26,7 @@ from .objects import (
26
26
  PageIndexes,
27
27
  Result,
28
28
  Status,
29
+ SupportedExt,
29
30
  )
30
31
  from .pipeline import Pipeline, PipelineConfig, PipelineType
31
32
  from .utils import path_to_artifacts_dirname
@@ -82,6 +83,16 @@ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
82
83
  config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
83
84
  language: LanguageAlpha2 = Field(frozen=True, default="en")
84
85
 
86
+ @classmethod
87
+ @cache
88
+ def supported_exts(cls) -> set[SupportedExt]:
89
+ return {
90
+ SupportedExt.PDF,
91
+ SupportedExt.DOCX,
92
+ SupportedExt.PPTX,
93
+ SupportedExt.XLSX,
94
+ }
95
+
85
96
 
86
97
  @Pipeline.register(PipelineType.MINER_U)
87
98
  class MinerUPipeline(Pipeline):
extract_python/objects.py CHANGED
@@ -21,7 +21,12 @@ from pydantic import AfterValidator, RootModel, TypeAdapter
21
21
  from pydantic import BaseModel as _BaseModel
22
22
 
23
23
  try:
24
- from docling.datamodel.base_models import ConversionStatus, ErrorItem, InputFormat
24
+ from docling.datamodel.base_models import (
25
+ ConversionStatus,
26
+ ErrorItem,
27
+ FormatToExtensions,
28
+ InputFormat,
29
+ )
25
30
  from docling.datamodel.document import InputDocument
26
31
  from docling_core.types.io import DocumentStream
27
32
  except ImportError:
@@ -33,15 +38,78 @@ logger = logging.getLogger(__name__)
33
38
  base_config = merge_configs(icij_config(), no_enum_values_config())
34
39
 
35
40
 
41
+ @cache
42
+ def _ext_to_docling_input_format() -> dict:
43
+ from .docling_ import DoclingPipelineConfig # noqa: PLC0415
44
+
45
+ mapping = dict()
46
+ supported = DoclingPipelineConfig.supported_exts()
47
+ for input_f, exts in FormatToExtensions.items():
48
+ for ext in exts:
49
+ try:
50
+ ext = SupportedExt(f".{ext.lower()}") # noqa: PLW2901
51
+ except ValueError:
52
+ continue
53
+ if ext in supported:
54
+ mapping[ext] = input_f
55
+ return mapping
56
+
57
+
36
58
  class BaseModel(_BaseModel):
37
59
  model_config = base_config
38
60
 
39
61
 
40
62
  class SupportedExt(StrEnum):
63
+ ADOC = ".adoc"
64
+ ASC = ".asc"
65
+ ASCIIDOC = ".asciidoc"
66
+ BMP = ".bmp"
67
+ CSV = ".csv"
68
+ DOC = ".doc"
69
+ DOCX = ".docx"
70
+ DOTX = ".dotx"
71
+ DOTM = ".dotm"
72
+ DOCM = ".docm"
73
+ EPUB = ".epub"
74
+ GIF = ".gif"
75
+ HTLM = ".html"
76
+ HTM = ".htm"
77
+ JPEG = ".jpeg"
78
+ JPG = ".jpg"
79
+ JSON = ".json"
80
+ LATEX = ".latex"
81
+ MD = ".md"
82
+ NXML = ".nxml"
83
+ ODP = ".odp"
84
+ ODS = ".ods"
85
+ ODT = ".odt"
41
86
  PDF = ".pdf"
87
+ PNG = ".png"
88
+ PPSX = ".ppsx"
89
+ PPT = ".ppt"
90
+ PPTM = ".pptm"
91
+ PPSM = ".ppsm"
92
+ POTX = ".potx"
93
+ POTM = ".potm"
94
+ PPTX = ".pptx"
95
+ QMD = ".qmd"
96
+ RMD = ".rmd"
97
+ TEX = ".tex"
98
+ TIF = ".tif"
99
+ TIFF = ".tiff"
100
+ TXT = ".txt"
101
+ TEXT = ".text"
102
+ WEBP = ".webp"
103
+ XBRL = ".xbrl"
104
+ XHTML = ".xhtml"
105
+ XLS = ".xls"
106
+ XLSM = ".xlsm"
107
+ XLSX = ".xlsx"
108
+ XLTX = ".xltx"
109
+ XML = ".xml"
42
110
 
43
111
  def to_docling(self) -> InputFormat:
44
- return InputFormat(self.value[1:])
112
+ return _ext_to_docling_input_format()[self]
45
113
 
46
114
 
47
115
  class OutputFormat(StrEnum):
@@ -8,7 +8,7 @@ from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_value
8
8
  from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
9
9
  from pydantic import Field
10
10
 
11
- from .objects import InputDoc, OutputFormat, Result
11
+ from .objects import InputDoc, OutputFormat, Result, SupportedExt
12
12
 
13
13
  StructuredContent = str
14
14
 
@@ -28,6 +28,10 @@ class PipelineConfig(RegistrableConfig, ABC):
28
28
 
29
29
  task_group: ClassVar[str] = Field(frozen=True)
30
30
 
31
+ @classmethod
32
+ @abstractmethod
33
+ def supported_exts(cls) -> set[SupportedExt]: ...
34
+
31
35
 
32
36
  class Pipeline(RegistrableFromConfig, ABC):
33
37
  @abstractmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=00g7RV33iftjvMLoKaEs2lUZP3LslSCKMpGeSys1Suc,6616
4
+ extract_python/marker_.py,sha256=3Q8H-TeM2_GenB6OOqIuytqgI1VE93Ek99_kW0cJHEw,4905
5
+ extract_python/miner_u.py,sha256=WdaftyINZdnALqSuu1qKaZJKKyIHVRn-wBke-Na78O0,7747
6
+ extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
+ extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.3.0.dist-info/METADATA,sha256=MwFN5PsmkUEv8sbhuS6joh7r17W6rrikpJb4Yr-rdKk,1132
10
+ extract_python-0.3.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.3.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=FMDsPVz05sGMPvIOX90lOLygWp6nC5DEjRfgx_ESPJ4,8530
4
- extract_python/marker_.py,sha256=z3PkUUStC-E78HhqByMwJ7re6-I7YUQzSxWToegHrUQ,4060
5
- extract_python/miner_u.py,sha256=f5pvLvay1ThBXNOI1R276aWSWsk5mhIPzWVjCy2u_lw,7493
6
- extract_python/objects.py,sha256=gTyGA5gaMAmW5P_PbAO2LNMqtP69CxlknebBFTojiwQ,7322
7
- extract_python/pipeline.py,sha256=qUgGar1rlYQgNz78BcUT1nQRsG3hy5UwpCl0e-0V77I,1098
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.1.0.dist-info/METADATA,sha256=wyYMrleKk9yUU1UaTYT0EsGpw_e3qbE8LOBanyLv0Qg,1132
10
- extract_python-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.1.0.dist-info/RECORD,,