extract-python 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/docling_.py +25 -0
- extract_python/marker_.py +29 -0
- extract_python/miner_u.py +12 -1
- extract_python/objects.py +27 -0
- extract_python/pipeline.py +5 -1
- {extract_python-0.1.0.dist-info → extract_python-0.2.1.dist-info}/METADATA +1 -1
- extract_python-0.2.1.dist-info/RECORD +11 -0
- extract_python-0.1.0.dist-info/RECORD +0 -11
- {extract_python-0.1.0.dist-info → extract_python-0.2.1.dist-info}/WHEEL +0 -0
extract_python/docling_.py
CHANGED
|
@@ -32,6 +32,7 @@ from .objects import (
|
|
|
32
32
|
PageIndexes,
|
|
33
33
|
Result,
|
|
34
34
|
Status,
|
|
35
|
+
SupportedExt,
|
|
35
36
|
)
|
|
36
37
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
37
38
|
from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
|
|
@@ -136,6 +137,30 @@ class DoclingPipelineConfig(PipelineConfig):
|
|
|
136
137
|
for f, opt in self.format_options.items()
|
|
137
138
|
}
|
|
138
139
|
|
|
140
|
+
@classmethod
|
|
141
|
+
@cache
|
|
142
|
+
def supported_formats(cls) -> set[SupportedExt]:
|
|
143
|
+
# Subset of https://docling-project.github.io/docling/usage/supported_formats/
|
|
144
|
+
return {
|
|
145
|
+
SupportedExt.ADOC,
|
|
146
|
+
SupportedExt.ASCIIDOC,
|
|
147
|
+
SupportedExt.BMP,
|
|
148
|
+
SupportedExt.CSV,
|
|
149
|
+
SupportedExt.DOCX,
|
|
150
|
+
SupportedExt.HTLM,
|
|
151
|
+
SupportedExt.JPG,
|
|
152
|
+
SupportedExt.MD,
|
|
153
|
+
SupportedExt.PDF,
|
|
154
|
+
SupportedExt.PNG,
|
|
155
|
+
SupportedExt.PPTX,
|
|
156
|
+
SupportedExt.TEX,
|
|
157
|
+
SupportedExt.TIFF,
|
|
158
|
+
SupportedExt.TXT,
|
|
159
|
+
SupportedExt.WEBP,
|
|
160
|
+
SupportedExt.XHTML,
|
|
161
|
+
SupportedExt.XLSX,
|
|
162
|
+
}
|
|
163
|
+
|
|
139
164
|
|
|
140
165
|
DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
|
|
141
166
|
|
extract_python/marker_.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import gc
|
|
2
2
|
from collections.abc import AsyncGenerator, Iterable
|
|
3
3
|
from copy import deepcopy
|
|
4
|
+
from functools import cache
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any, ClassVar, Self
|
|
6
7
|
|
|
@@ -20,6 +21,7 @@ from .objects import (
|
|
|
20
21
|
PageIndexes,
|
|
21
22
|
Result,
|
|
22
23
|
Status,
|
|
24
|
+
SupportedExt,
|
|
23
25
|
)
|
|
24
26
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
25
27
|
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
@@ -32,6 +34,33 @@ class MarkerPipelineConfig(PipelineConfig):
|
|
|
32
34
|
|
|
33
35
|
config: dict[str, Any] = dict()
|
|
34
36
|
|
|
37
|
+
@classmethod
|
|
38
|
+
@cache
|
|
39
|
+
def supported_formats(cls) -> set[SupportedExt]:
|
|
40
|
+
# Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
|
|
41
|
+
return {
|
|
42
|
+
SupportedExt.PDF,
|
|
43
|
+
SupportedExt.XLS,
|
|
44
|
+
SupportedExt.XLSX,
|
|
45
|
+
SupportedExt.XLSM,
|
|
46
|
+
SupportedExt.CSV,
|
|
47
|
+
SupportedExt.ODS,
|
|
48
|
+
SupportedExt.DOC,
|
|
49
|
+
SupportedExt.DOCX,
|
|
50
|
+
SupportedExt.ODT,
|
|
51
|
+
SupportedExt.PPT,
|
|
52
|
+
SupportedExt.PPTX,
|
|
53
|
+
SupportedExt.ODP,
|
|
54
|
+
SupportedExt.HTLM,
|
|
55
|
+
SupportedExt.EPUB,
|
|
56
|
+
SupportedExt.PNG,
|
|
57
|
+
SupportedExt.JPG,
|
|
58
|
+
SupportedExt.JPEG,
|
|
59
|
+
SupportedExt.WEBP,
|
|
60
|
+
SupportedExt.GIF,
|
|
61
|
+
SupportedExt.TIFF,
|
|
62
|
+
}
|
|
63
|
+
|
|
35
64
|
|
|
36
65
|
_MARKER_CONVERSION_ERRORS = tuple()
|
|
37
66
|
|
extract_python/miner_u.py
CHANGED
|
@@ -3,7 +3,7 @@ import shutil
|
|
|
3
3
|
from collections.abc import AsyncGenerator, Callable, Iterable
|
|
4
4
|
from copy import copy
|
|
5
5
|
from enum import StrEnum
|
|
6
|
-
from functools import partial
|
|
6
|
+
from functools import cache, partial
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from tempfile import TemporaryDirectory
|
|
9
9
|
from typing import Any, ClassVar, Self
|
|
@@ -26,6 +26,7 @@ from .objects import (
|
|
|
26
26
|
PageIndexes,
|
|
27
27
|
Result,
|
|
28
28
|
Status,
|
|
29
|
+
SupportedExt,
|
|
29
30
|
)
|
|
30
31
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
31
32
|
from .utils import path_to_artifacts_dirname
|
|
@@ -82,6 +83,16 @@ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
|
82
83
|
config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
|
|
83
84
|
language: LanguageAlpha2 = Field(frozen=True, default="en")
|
|
84
85
|
|
|
86
|
+
@classmethod
|
|
87
|
+
@cache
|
|
88
|
+
def supported_formats(cls) -> set[SupportedExt]:
|
|
89
|
+
return {
|
|
90
|
+
SupportedExt.PDF,
|
|
91
|
+
SupportedExt.DOCX,
|
|
92
|
+
SupportedExt.PPTX,
|
|
93
|
+
SupportedExt.XLSX,
|
|
94
|
+
}
|
|
95
|
+
|
|
85
96
|
|
|
86
97
|
@Pipeline.register(PipelineType.MINER_U)
|
|
87
98
|
class MinerUPipeline(Pipeline):
|
extract_python/objects.py
CHANGED
|
@@ -38,7 +38,34 @@ class BaseModel(_BaseModel):
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
class SupportedExt(StrEnum):
|
|
41
|
+
ADOC = ".adoc"
|
|
42
|
+
ASCIIDOC = ".asciidoc"
|
|
43
|
+
BMP = ".bmp"
|
|
44
|
+
CSV = ".csv"
|
|
45
|
+
DOC = ".doc"
|
|
46
|
+
DOCX = ".docx"
|
|
47
|
+
EPUB = ".epub"
|
|
48
|
+
GIF = ".gif"
|
|
49
|
+
HTLM = ".html"
|
|
50
|
+
JPEG = ".jpeg"
|
|
51
|
+
JPG = ".jpg"
|
|
52
|
+
MD = ".md"
|
|
53
|
+
ODP = ".odp"
|
|
54
|
+
ODS = ".ods"
|
|
55
|
+
ODT = ".odt"
|
|
41
56
|
PDF = ".pdf"
|
|
57
|
+
PNG = ".png"
|
|
58
|
+
PPT = ".ppt"
|
|
59
|
+
PPTX = ".pptx"
|
|
60
|
+
TEX = ".tex"
|
|
61
|
+
TIFF = ".tiff"
|
|
62
|
+
TXT = ".txt"
|
|
63
|
+
WEBP = ".webp"
|
|
64
|
+
XHTML = ".xhtml"
|
|
65
|
+
XLS = ".xls"
|
|
66
|
+
XLSM = ".xlsm"
|
|
67
|
+
XLSX = ".xlsx"
|
|
68
|
+
XLTX = ".xltx"
|
|
42
69
|
|
|
43
70
|
def to_docling(self) -> InputFormat:
|
|
44
71
|
return InputFormat(self.value[1:])
|
extract_python/pipeline.py
CHANGED
|
@@ -8,7 +8,7 @@ from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_value
|
|
|
8
8
|
from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
|
|
9
9
|
from pydantic import Field
|
|
10
10
|
|
|
11
|
-
from .objects import InputDoc, OutputFormat, Result
|
|
11
|
+
from .objects import InputDoc, OutputFormat, Result, SupportedExt
|
|
12
12
|
|
|
13
13
|
StructuredContent = str
|
|
14
14
|
|
|
@@ -28,6 +28,10 @@ class PipelineConfig(RegistrableConfig, ABC):
|
|
|
28
28
|
|
|
29
29
|
task_group: ClassVar[str] = Field(frozen=True)
|
|
30
30
|
|
|
31
|
+
@classmethod
|
|
32
|
+
@abstractmethod
|
|
33
|
+
def supported_formats(cls) -> set[SupportedExt]: ...
|
|
34
|
+
|
|
31
35
|
|
|
32
36
|
class Pipeline(RegistrableFromConfig, ABC):
|
|
33
37
|
@abstractmethod
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
+
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
+
extract_python/docling_.py,sha256=L7RPOYTN4kuodsgfi7NVsXILRerbAZUDaYxqVofxOog,9267
|
|
4
|
+
extract_python/marker_.py,sha256=EUGpBRAe9mE0QbSdMFdvE16_m-c-DeAtwZ8F79w2Mcg,4908
|
|
5
|
+
extract_python/miner_u.py,sha256=Bse66I5Yj-PiOgejr3JjXXDkjCh46M9KuwTEB8QK5g4,7750
|
|
6
|
+
extract_python/objects.py,sha256=hqa9ONk9KwylvQa6DvKIEQnnCgfy-T-d5SU2LpfmTcQ,7815
|
|
7
|
+
extract_python/pipeline.py,sha256=0qkuqEcxEbc3_sy8gNbLPwq8IIlC8cfGaqk_5fNpOCM,1207
|
|
8
|
+
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
+
extract_python-0.2.1.dist-info/METADATA,sha256=iaIXzaha4s-kqJzkedwJn5VsvQGZhy3KJQJpDns8pR4,1132
|
|
10
|
+
extract_python-0.2.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
extract_python-0.2.1.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
-
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
-
extract_python/docling_.py,sha256=FMDsPVz05sGMPvIOX90lOLygWp6nC5DEjRfgx_ESPJ4,8530
|
|
4
|
-
extract_python/marker_.py,sha256=z3PkUUStC-E78HhqByMwJ7re6-I7YUQzSxWToegHrUQ,4060
|
|
5
|
-
extract_python/miner_u.py,sha256=f5pvLvay1ThBXNOI1R276aWSWsk5mhIPzWVjCy2u_lw,7493
|
|
6
|
-
extract_python/objects.py,sha256=gTyGA5gaMAmW5P_PbAO2LNMqtP69CxlknebBFTojiwQ,7322
|
|
7
|
-
extract_python/pipeline.py,sha256=qUgGar1rlYQgNz78BcUT1nQRsG3hy5UwpCl0e-0V77I,1098
|
|
8
|
-
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
-
extract_python-0.1.0.dist-info/METADATA,sha256=wyYMrleKk9yUU1UaTYT0EsGpw_e3qbE8LOBanyLv0Qg,1132
|
|
10
|
-
extract_python-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
-
extract_python-0.1.0.dist-info/RECORD,,
|
|
File without changes
|