extract-python 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,6 +32,7 @@ from .objects import (
32
32
  PageIndexes,
33
33
  Result,
34
34
  Status,
35
+ SupportedExt,
35
36
  )
36
37
  from .pipeline import Pipeline, PipelineConfig, PipelineType
37
38
  from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
@@ -136,6 +137,30 @@ class DoclingPipelineConfig(PipelineConfig):
136
137
  for f, opt in self.format_options.items()
137
138
  }
138
139
 
140
+ @classmethod
141
+ @cache
142
+ def supported_formats(cls) -> set[SupportedExt]:
143
+ # Subset of https://docling-project.github.io/docling/usage/supported_formats/
144
+ return {
145
+ SupportedExt.ADOC,
146
+ SupportedExt.ASCIIDOC,
147
+ SupportedExt.BMP,
148
+ SupportedExt.CSV,
149
+ SupportedExt.DOCX,
150
+ SupportedExt.HTLM,
151
+ SupportedExt.JPG,
152
+ SupportedExt.MD,
153
+ SupportedExt.PDF,
154
+ SupportedExt.PNG,
155
+ SupportedExt.PPTX,
156
+ SupportedExt.TEX,
157
+ SupportedExt.TIFF,
158
+ SupportedExt.TXT,
159
+ SupportedExt.WEBP,
160
+ SupportedExt.XHTML,
161
+ SupportedExt.XLSX,
162
+ }
163
+
139
164
 
140
165
  DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
141
166
 
extract_python/marker_.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import gc
2
2
  from collections.abc import AsyncGenerator, Iterable
3
3
  from copy import deepcopy
4
+ from functools import cache
4
5
  from pathlib import Path
5
6
  from typing import Any, ClassVar, Self
6
7
 
@@ -20,6 +21,7 @@ from .objects import (
20
21
  PageIndexes,
21
22
  Result,
22
23
  Status,
24
+ SupportedExt,
23
25
  )
24
26
  from .pipeline import Pipeline, PipelineConfig, PipelineType
25
27
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
@@ -32,6 +34,33 @@ class MarkerPipelineConfig(PipelineConfig):
32
34
 
33
35
  config: dict[str, Any] = dict()
34
36
 
37
+ @classmethod
38
+ @cache
39
+ def supported_formats(cls) -> set[SupportedExt]:
40
+ # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
41
+ return {
42
+ SupportedExt.PDF,
43
+ SupportedExt.XLS,
44
+ SupportedExt.XLSX,
45
+ SupportedExt.XLSM,
46
+ SupportedExt.CSV,
47
+ SupportedExt.ODS,
48
+ SupportedExt.DOC,
49
+ SupportedExt.DOCX,
50
+ SupportedExt.ODT,
51
+ SupportedExt.PPT,
52
+ SupportedExt.PPTX,
53
+ SupportedExt.ODP,
54
+ SupportedExt.HTLM,
55
+ SupportedExt.EPUB,
56
+ SupportedExt.PNG,
57
+ SupportedExt.JPG,
58
+ SupportedExt.JPEG,
59
+ SupportedExt.WEBP,
60
+ SupportedExt.GIF,
61
+ SupportedExt.TIFF,
62
+ }
63
+
35
64
 
36
65
  _MARKER_CONVERSION_ERRORS = tuple()
37
66
 
extract_python/miner_u.py CHANGED
@@ -3,7 +3,7 @@ import shutil
3
3
  from collections.abc import AsyncGenerator, Callable, Iterable
4
4
  from copy import copy
5
5
  from enum import StrEnum
6
- from functools import partial
6
+ from functools import cache, partial
7
7
  from pathlib import Path
8
8
  from tempfile import TemporaryDirectory
9
9
  from typing import Any, ClassVar, Self
@@ -26,6 +26,7 @@ from .objects import (
26
26
  PageIndexes,
27
27
  Result,
28
28
  Status,
29
+ SupportedExt,
29
30
  )
30
31
  from .pipeline import Pipeline, PipelineConfig, PipelineType
31
32
  from .utils import path_to_artifacts_dirname
@@ -82,6 +83,16 @@ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
82
83
  config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
83
84
  language: LanguageAlpha2 = Field(frozen=True, default="en")
84
85
 
86
+ @classmethod
87
+ @cache
88
+ def supported_formats(cls) -> set[SupportedExt]:
89
+ return {
90
+ SupportedExt.PDF,
91
+ SupportedExt.DOCX,
92
+ SupportedExt.PPTX,
93
+ SupportedExt.XLSX,
94
+ }
95
+
85
96
 
86
97
  @Pipeline.register(PipelineType.MINER_U)
87
98
  class MinerUPipeline(Pipeline):
extract_python/objects.py CHANGED
@@ -38,7 +38,34 @@ class BaseModel(_BaseModel):
38
38
 
39
39
 
40
40
  class SupportedExt(StrEnum):
41
+ ADOC = ".adoc"
42
+ ASCIIDOC = ".asciidoc"
43
+ BMP = ".bmp"
44
+ CSV = ".csv"
45
+ DOC = ".doc"
46
+ DOCX = ".docx"
47
+ EPUB = ".epub"
48
+ GIF = ".gif"
49
+ HTLM = ".html"
50
+ JPEG = ".jpeg"
51
+ JPG = ".jpg"
52
+ MD = ".md"
53
+ ODP = ".odp"
54
+ ODS = ".ods"
55
+ ODT = ".odt"
41
56
  PDF = ".pdf"
57
+ PNG = ".png"
58
+ PPT = ".ppt"
59
+ PPTX = ".pptx"
60
+ TEX = ".tex"
61
+ TIFF = ".tiff"
62
+ TXT = ".txt"
63
+ WEBP = ".webp"
64
+ XHTML = ".xhtml"
65
+ XLS = ".xls"
66
+ XLSM = ".xlsm"
67
+ XLSX = ".xlsx"
68
+ XLTX = ".xltx"
42
69
 
43
70
  def to_docling(self) -> InputFormat:
44
71
  return InputFormat(self.value[1:])
@@ -8,7 +8,7 @@ from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_value
8
8
  from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
9
9
  from pydantic import Field
10
10
 
11
- from .objects import InputDoc, OutputFormat, Result
11
+ from .objects import InputDoc, OutputFormat, Result, SupportedExt
12
12
 
13
13
  StructuredContent = str
14
14
 
@@ -28,6 +28,10 @@ class PipelineConfig(RegistrableConfig, ABC):
28
28
 
29
29
  task_group: ClassVar[str] = Field(frozen=True)
30
30
 
31
+ @classmethod
32
+ @abstractmethod
33
+ def supported_formats(cls) -> set[SupportedExt]: ...
34
+
31
35
 
32
36
  class Pipeline(RegistrableFromConfig, ABC):
33
37
  @abstractmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.1.0
3
+ Version: 0.2.1
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=L7RPOYTN4kuodsgfi7NVsXILRerbAZUDaYxqVofxOog,9267
4
+ extract_python/marker_.py,sha256=EUGpBRAe9mE0QbSdMFdvE16_m-c-DeAtwZ8F79w2Mcg,4908
5
+ extract_python/miner_u.py,sha256=Bse66I5Yj-PiOgejr3JjXXDkjCh46M9KuwTEB8QK5g4,7750
6
+ extract_python/objects.py,sha256=hqa9ONk9KwylvQa6DvKIEQnnCgfy-T-d5SU2LpfmTcQ,7815
7
+ extract_python/pipeline.py,sha256=0qkuqEcxEbc3_sy8gNbLPwq8IIlC8cfGaqk_5fNpOCM,1207
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.2.1.dist-info/METADATA,sha256=iaIXzaha4s-kqJzkedwJn5VsvQGZhy3KJQJpDns8pR4,1132
10
+ extract_python-0.2.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.2.1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=FMDsPVz05sGMPvIOX90lOLygWp6nC5DEjRfgx_ESPJ4,8530
4
- extract_python/marker_.py,sha256=z3PkUUStC-E78HhqByMwJ7re6-I7YUQzSxWToegHrUQ,4060
5
- extract_python/miner_u.py,sha256=f5pvLvay1ThBXNOI1R276aWSWsk5mhIPzWVjCy2u_lw,7493
6
- extract_python/objects.py,sha256=gTyGA5gaMAmW5P_PbAO2LNMqtP69CxlknebBFTojiwQ,7322
7
- extract_python/pipeline.py,sha256=qUgGar1rlYQgNz78BcUT1nQRsG3hy5UwpCl0e-0V77I,1098
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.1.0.dist-info/METADATA,sha256=wyYMrleKk9yUU1UaTYT0EsGpw_e3qbE8LOBanyLv0Qg,1132
10
- extract_python-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.1.0.dist-info/RECORD,,