extract-python 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/docling_.py +47 -90
- extract_python/marker_.py +29 -0
- extract_python/miner_u.py +12 -1
- extract_python/objects.py +70 -2
- extract_python/pipeline.py +5 -1
- {extract_python-0.1.0.dist-info → extract_python-0.3.0.dist-info}/METADATA +1 -1
- extract_python-0.3.0.dist-info/RECORD +11 -0
- extract_python-0.1.0.dist-info/RECORD +0 -11
- {extract_python-0.1.0.dist-info → extract_python-0.3.0.dist-info}/WHEEL +0 -0
extract_python/docling_.py
CHANGED
|
@@ -3,28 +3,23 @@ import tempfile
|
|
|
3
3
|
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
4
|
from functools import cache
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import Annotated, ClassVar, TypeVar
|
|
7
7
|
|
|
8
|
-
from docling.
|
|
9
|
-
from docling.datamodel.base_models import InputFormat
|
|
8
|
+
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
10
9
|
from docling.datamodel.document import ConversionResult
|
|
11
10
|
from docling.datamodel.pipeline_options import (
|
|
12
11
|
EasyOcrOptions,
|
|
13
12
|
PdfPipelineOptions,
|
|
14
13
|
PipelineOptions,
|
|
15
|
-
VlmPipelineOptions,
|
|
16
14
|
)
|
|
17
|
-
from docling.document_converter import DocumentConverter, FormatOption
|
|
18
|
-
from docling.models.factories import get_ocr_factory
|
|
19
|
-
from docling.pipeline.base_pipeline import BasePipeline
|
|
15
|
+
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
|
20
16
|
from docling_core.types.doc import ImageRefMode
|
|
21
17
|
from docling_core.types.io import DocumentStream
|
|
22
18
|
from icij_common.registrable import FromConfig
|
|
23
|
-
from pydantic import
|
|
19
|
+
from pydantic import AfterValidator, Field
|
|
24
20
|
|
|
25
21
|
from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
|
|
26
22
|
from .objects import (
|
|
27
|
-
BaseModel,
|
|
28
23
|
Error,
|
|
29
24
|
InputDoc,
|
|
30
25
|
MarkdownDoc,
|
|
@@ -32,6 +27,7 @@ from .objects import (
|
|
|
32
27
|
PageIndexes,
|
|
33
28
|
Result,
|
|
34
29
|
Status,
|
|
30
|
+
SupportedExt,
|
|
35
31
|
)
|
|
36
32
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
37
33
|
from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
|
|
@@ -39,73 +35,27 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
|
|
|
39
35
|
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
40
36
|
|
|
41
37
|
|
|
42
|
-
|
|
43
|
-
|
|
38
|
+
def _validate_pipeline_opts(opts: PipelineOptions) -> None:
|
|
39
|
+
if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
|
|
40
|
+
msg = "generate_picture_images should be set to true"
|
|
41
|
+
raise ValueError(msg)
|
|
44
42
|
|
|
45
|
-
@model_validator(mode="before")
|
|
46
|
-
@classmethod
|
|
47
|
-
def validate_ocr_options(cls, data: Any) -> Any:
|
|
48
|
-
if isinstance(data, dict):
|
|
49
|
-
ocr_options = data.get("ocr_options")
|
|
50
|
-
if not isinstance(ocr_options, dict):
|
|
51
|
-
return data
|
|
52
|
-
allow_external_plugins = ocr_options.get("allow_external_plugins", False)
|
|
53
|
-
ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
|
|
54
|
-
kind = ocr_options.pop("kind")
|
|
55
|
-
data["ocr_options"] = ocr_factory.create_options(kind=kind, **ocr_options)
|
|
56
|
-
return data
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
OptionsByPipeline = list[
|
|
60
|
-
tuple[Literal["pdf"], _PdfPipelineOptions]
|
|
61
|
-
| tuple[Literal["vlm"], VlmPipelineOptions]
|
|
62
|
-
]
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def _default_pipeline_options() -> OptionsByPipeline:
|
|
66
|
-
pipeline_options = _PdfPipelineOptions(ocr_options=EasyOcrOptions())
|
|
67
|
-
return [("pdf", pipeline_options), ("vlm", VlmPipelineOptions())]
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class DoclingFormatOption(BaseModel):
|
|
71
|
-
pipeline_cls: str
|
|
72
|
-
backend_cls: str
|
|
73
|
-
|
|
74
|
-
def to_docling(
|
|
75
|
-
self, pipeline_options: dict[Literal["pdf", "vlm"], PipelineOptions]
|
|
76
|
-
) -> FormatOption:
|
|
77
|
-
pipeline_cls = _find_subcls(BasePipeline, self.pipeline_cls)
|
|
78
|
-
backend_cls = _find_subcls(AbstractDocumentBackend, self.backend_cls)
|
|
79
|
-
if "vlm" in self.pipeline_cls.lower():
|
|
80
|
-
pipeline_options = pipeline_options.get("vlm")
|
|
81
|
-
if pipeline_options is not None:
|
|
82
|
-
pipeline_options = VlmPipelineOptions.model_validate(pipeline_options)
|
|
83
|
-
elif "pdf" in self.pipeline_cls.lower():
|
|
84
|
-
pipeline_options = pipeline_options.get("pdf")
|
|
85
|
-
if pipeline_options is not None:
|
|
86
|
-
pipeline_options = _PdfPipelineOptions.model_validate(pipeline_options)
|
|
87
|
-
else:
|
|
88
|
-
raise ValueError(
|
|
89
|
-
f"invalid pipeline_cls: {pipeline_cls}, expected a VLM or PDF pipeline"
|
|
90
|
-
)
|
|
91
|
-
return FormatOption(
|
|
92
|
-
pipeline_cls=pipeline_cls,
|
|
93
|
-
pipeline_options=pipeline_options,
|
|
94
|
-
backend=backend_cls,
|
|
95
|
-
)
|
|
96
43
|
|
|
44
|
+
def _validate_options(
|
|
45
|
+
data: dict[InputFormat, FormatOption],
|
|
46
|
+
) -> dict[InputFormat, FormatOption]:
|
|
47
|
+
for opts in data.values():
|
|
48
|
+
_validate_pipeline_opts(opts.pipeline_options)
|
|
49
|
+
return data
|
|
97
50
|
|
|
98
|
-
@cache
|
|
99
|
-
def _default_format_options() -> dict[InputFormat, DoclingFormatOption]:
|
|
100
|
-
supported_fmt = {InputFormat.PDF}
|
|
101
|
-
return {
|
|
102
|
-
fmt: DoclingFormatOption(
|
|
103
|
-
pipeline_cls=opt.pipeline_cls.__name__, backend_cls=opt.backend.__name__
|
|
104
|
-
)
|
|
105
|
-
for fmt, opt in DocumentConverter().format_to_options.items()
|
|
106
|
-
if fmt in supported_fmt
|
|
107
|
-
}
|
|
108
51
|
|
|
52
|
+
_DEFAULT_FORMAT_OPTS = {
|
|
53
|
+
InputFormat.PDF: PdfFormatOption(
|
|
54
|
+
pipeline_options=PdfPipelineOptions(
|
|
55
|
+
ocr_options=EasyOcrOptions(), generate_picture_images=True
|
|
56
|
+
)
|
|
57
|
+
),
|
|
58
|
+
}
|
|
109
59
|
|
|
110
60
|
T = TypeVar("T")
|
|
111
61
|
|
|
@@ -122,30 +72,37 @@ class DoclingPipelineConfig(PipelineConfig):
|
|
|
122
72
|
pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
|
|
123
73
|
task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
|
|
124
74
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
format_options: dict[InputFormat, DoclingFormatOption] = Field(
|
|
129
|
-
default_factory=_default_format_options
|
|
130
|
-
)
|
|
131
|
-
|
|
132
|
-
def to_format_options(self) -> dict[InputFormat, FormatOption]:
|
|
133
|
-
pipeline_options = dict(self.pipeline_options)
|
|
134
|
-
return {
|
|
135
|
-
InputFormat(f): opt.to_docling(pipeline_options)
|
|
136
|
-
for f, opt in self.format_options.items()
|
|
137
|
-
}
|
|
75
|
+
format_options: Annotated[
|
|
76
|
+
dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
|
|
77
|
+
] = _DEFAULT_FORMAT_OPTS
|
|
138
78
|
|
|
79
|
+
_unsupported_input_formats: ClassVar[set[InputFormat]] = {
|
|
80
|
+
InputFormat.AUDIO,
|
|
81
|
+
InputFormat.METS_GBS,
|
|
82
|
+
InputFormat.VTT,
|
|
83
|
+
}
|
|
139
84
|
|
|
140
|
-
|
|
85
|
+
@classmethod
|
|
86
|
+
@cache
|
|
87
|
+
def supported_exts(cls) -> set[SupportedExt]:
|
|
88
|
+
supported = set()
|
|
89
|
+
for f in InputFormat:
|
|
90
|
+
if f in cls._unsupported_input_formats:
|
|
91
|
+
continue
|
|
92
|
+
for ext in FormatToExtensions[f]:
|
|
93
|
+
supported.add(SupportedExt(f".{ext.lower()}"))
|
|
94
|
+
return supported
|
|
141
95
|
|
|
142
96
|
|
|
143
97
|
@Pipeline.register(PipelineType.DOCLING)
|
|
144
98
|
class DoclingPipeline(Pipeline):
|
|
145
99
|
def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
100
|
+
allowed_format = [
|
|
101
|
+
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
102
|
+
]
|
|
103
|
+
self._converter = DocumentConverter(
|
|
104
|
+
allowed_formats=allowed_format, format_options=format_options
|
|
105
|
+
)
|
|
149
106
|
|
|
150
107
|
async def extract_content(
|
|
151
108
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
@@ -157,7 +114,7 @@ class DoclingPipeline(Pipeline):
|
|
|
157
114
|
|
|
158
115
|
@classmethod
|
|
159
116
|
def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
|
|
160
|
-
return cls(config.
|
|
117
|
+
return cls(config.format_options)
|
|
161
118
|
|
|
162
119
|
|
|
163
120
|
def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
|
extract_python/marker_.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import gc
|
|
2
2
|
from collections.abc import AsyncGenerator, Iterable
|
|
3
3
|
from copy import deepcopy
|
|
4
|
+
from functools import cache
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any, ClassVar, Self
|
|
6
7
|
|
|
@@ -20,6 +21,7 @@ from .objects import (
|
|
|
20
21
|
PageIndexes,
|
|
21
22
|
Result,
|
|
22
23
|
Status,
|
|
24
|
+
SupportedExt,
|
|
23
25
|
)
|
|
24
26
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
25
27
|
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
@@ -32,6 +34,33 @@ class MarkerPipelineConfig(PipelineConfig):
|
|
|
32
34
|
|
|
33
35
|
config: dict[str, Any] = dict()
|
|
34
36
|
|
|
37
|
+
@classmethod
|
|
38
|
+
@cache
|
|
39
|
+
def supported_exts(cls) -> set[SupportedExt]:
|
|
40
|
+
# Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
|
|
41
|
+
return {
|
|
42
|
+
SupportedExt.PDF,
|
|
43
|
+
SupportedExt.XLS,
|
|
44
|
+
SupportedExt.XLSX,
|
|
45
|
+
SupportedExt.XLSM,
|
|
46
|
+
SupportedExt.CSV,
|
|
47
|
+
SupportedExt.ODS,
|
|
48
|
+
SupportedExt.DOC,
|
|
49
|
+
SupportedExt.DOCX,
|
|
50
|
+
SupportedExt.ODT,
|
|
51
|
+
SupportedExt.PPT,
|
|
52
|
+
SupportedExt.PPTX,
|
|
53
|
+
SupportedExt.ODP,
|
|
54
|
+
SupportedExt.HTLM,
|
|
55
|
+
SupportedExt.EPUB,
|
|
56
|
+
SupportedExt.PNG,
|
|
57
|
+
SupportedExt.JPG,
|
|
58
|
+
SupportedExt.JPEG,
|
|
59
|
+
SupportedExt.WEBP,
|
|
60
|
+
SupportedExt.GIF,
|
|
61
|
+
SupportedExt.TIFF,
|
|
62
|
+
}
|
|
63
|
+
|
|
35
64
|
|
|
36
65
|
_MARKER_CONVERSION_ERRORS = tuple()
|
|
37
66
|
|
extract_python/miner_u.py
CHANGED
|
@@ -3,7 +3,7 @@ import shutil
|
|
|
3
3
|
from collections.abc import AsyncGenerator, Callable, Iterable
|
|
4
4
|
from copy import copy
|
|
5
5
|
from enum import StrEnum
|
|
6
|
-
from functools import partial
|
|
6
|
+
from functools import cache, partial
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from tempfile import TemporaryDirectory
|
|
9
9
|
from typing import Any, ClassVar, Self
|
|
@@ -26,6 +26,7 @@ from .objects import (
|
|
|
26
26
|
PageIndexes,
|
|
27
27
|
Result,
|
|
28
28
|
Status,
|
|
29
|
+
SupportedExt,
|
|
29
30
|
)
|
|
30
31
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
31
32
|
from .utils import path_to_artifacts_dirname
|
|
@@ -82,6 +83,16 @@ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
|
82
83
|
config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
|
|
83
84
|
language: LanguageAlpha2 = Field(frozen=True, default="en")
|
|
84
85
|
|
|
86
|
+
@classmethod
|
|
87
|
+
@cache
|
|
88
|
+
def supported_exts(cls) -> set[SupportedExt]:
|
|
89
|
+
return {
|
|
90
|
+
SupportedExt.PDF,
|
|
91
|
+
SupportedExt.DOCX,
|
|
92
|
+
SupportedExt.PPTX,
|
|
93
|
+
SupportedExt.XLSX,
|
|
94
|
+
}
|
|
95
|
+
|
|
85
96
|
|
|
86
97
|
@Pipeline.register(PipelineType.MINER_U)
|
|
87
98
|
class MinerUPipeline(Pipeline):
|
extract_python/objects.py
CHANGED
|
@@ -21,7 +21,12 @@ from pydantic import AfterValidator, RootModel, TypeAdapter
|
|
|
21
21
|
from pydantic import BaseModel as _BaseModel
|
|
22
22
|
|
|
23
23
|
try:
|
|
24
|
-
from docling.datamodel.base_models import
|
|
24
|
+
from docling.datamodel.base_models import (
|
|
25
|
+
ConversionStatus,
|
|
26
|
+
ErrorItem,
|
|
27
|
+
FormatToExtensions,
|
|
28
|
+
InputFormat,
|
|
29
|
+
)
|
|
25
30
|
from docling.datamodel.document import InputDocument
|
|
26
31
|
from docling_core.types.io import DocumentStream
|
|
27
32
|
except ImportError:
|
|
@@ -33,15 +38,78 @@ logger = logging.getLogger(__name__)
|
|
|
33
38
|
base_config = merge_configs(icij_config(), no_enum_values_config())
|
|
34
39
|
|
|
35
40
|
|
|
41
|
+
@cache
|
|
42
|
+
def _ext_to_docling_input_format() -> dict:
|
|
43
|
+
from .docling_ import DoclingPipelineConfig # noqa: PLC0415
|
|
44
|
+
|
|
45
|
+
mapping = dict()
|
|
46
|
+
supported = DoclingPipelineConfig.supported_exts()
|
|
47
|
+
for input_f, exts in FormatToExtensions.items():
|
|
48
|
+
for ext in exts:
|
|
49
|
+
try:
|
|
50
|
+
ext = SupportedExt(f".{ext.lower()}") # noqa: PLW2901
|
|
51
|
+
except ValueError:
|
|
52
|
+
continue
|
|
53
|
+
if ext in supported:
|
|
54
|
+
mapping[ext] = input_f
|
|
55
|
+
return mapping
|
|
56
|
+
|
|
57
|
+
|
|
36
58
|
class BaseModel(_BaseModel):
|
|
37
59
|
model_config = base_config
|
|
38
60
|
|
|
39
61
|
|
|
40
62
|
class SupportedExt(StrEnum):
|
|
63
|
+
ADOC = ".adoc"
|
|
64
|
+
ASC = ".asc"
|
|
65
|
+
ASCIIDOC = ".asciidoc"
|
|
66
|
+
BMP = ".bmp"
|
|
67
|
+
CSV = ".csv"
|
|
68
|
+
DOC = ".doc"
|
|
69
|
+
DOCX = ".docx"
|
|
70
|
+
DOTX = ".dotx"
|
|
71
|
+
DOTM = ".dotm"
|
|
72
|
+
DOCM = ".docm"
|
|
73
|
+
EPUB = ".epub"
|
|
74
|
+
GIF = ".gif"
|
|
75
|
+
HTLM = ".html"
|
|
76
|
+
HTM = ".htm"
|
|
77
|
+
JPEG = ".jpeg"
|
|
78
|
+
JPG = ".jpg"
|
|
79
|
+
JSON = ".json"
|
|
80
|
+
LATEX = ".latex"
|
|
81
|
+
MD = ".md"
|
|
82
|
+
NXML = ".nxml"
|
|
83
|
+
ODP = ".odp"
|
|
84
|
+
ODS = ".ods"
|
|
85
|
+
ODT = ".odt"
|
|
41
86
|
PDF = ".pdf"
|
|
87
|
+
PNG = ".png"
|
|
88
|
+
PPSX = ".ppsx"
|
|
89
|
+
PPT = ".ppt"
|
|
90
|
+
PPTM = ".pptm"
|
|
91
|
+
PPSM = ".ppsm"
|
|
92
|
+
POTX = ".potx"
|
|
93
|
+
POTM = ".potm"
|
|
94
|
+
PPTX = ".pptx"
|
|
95
|
+
QMD = ".qmd"
|
|
96
|
+
RMD = ".rmd"
|
|
97
|
+
TEX = ".tex"
|
|
98
|
+
TIF = ".tif"
|
|
99
|
+
TIFF = ".tiff"
|
|
100
|
+
TXT = ".txt"
|
|
101
|
+
TEXT = ".text"
|
|
102
|
+
WEBP = ".webp"
|
|
103
|
+
XBRL = ".xbrl"
|
|
104
|
+
XHTML = ".xhtml"
|
|
105
|
+
XLS = ".xls"
|
|
106
|
+
XLSM = ".xlsm"
|
|
107
|
+
XLSX = ".xlsx"
|
|
108
|
+
XLTX = ".xltx"
|
|
109
|
+
XML = ".xml"
|
|
42
110
|
|
|
43
111
|
def to_docling(self) -> InputFormat:
|
|
44
|
-
return
|
|
112
|
+
return _ext_to_docling_input_format()[self]
|
|
45
113
|
|
|
46
114
|
|
|
47
115
|
class OutputFormat(StrEnum):
|
extract_python/pipeline.py
CHANGED
|
@@ -8,7 +8,7 @@ from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_value
|
|
|
8
8
|
from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
|
|
9
9
|
from pydantic import Field
|
|
10
10
|
|
|
11
|
-
from .objects import InputDoc, OutputFormat, Result
|
|
11
|
+
from .objects import InputDoc, OutputFormat, Result, SupportedExt
|
|
12
12
|
|
|
13
13
|
StructuredContent = str
|
|
14
14
|
|
|
@@ -28,6 +28,10 @@ class PipelineConfig(RegistrableConfig, ABC):
|
|
|
28
28
|
|
|
29
29
|
task_group: ClassVar[str] = Field(frozen=True)
|
|
30
30
|
|
|
31
|
+
@classmethod
|
|
32
|
+
@abstractmethod
|
|
33
|
+
def supported_exts(cls) -> set[SupportedExt]: ...
|
|
34
|
+
|
|
31
35
|
|
|
32
36
|
class Pipeline(RegistrableFromConfig, ABC):
|
|
33
37
|
@abstractmethod
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
+
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
+
extract_python/docling_.py,sha256=00g7RV33iftjvMLoKaEs2lUZP3LslSCKMpGeSys1Suc,6616
|
|
4
|
+
extract_python/marker_.py,sha256=3Q8H-TeM2_GenB6OOqIuytqgI1VE93Ek99_kW0cJHEw,4905
|
|
5
|
+
extract_python/miner_u.py,sha256=WdaftyINZdnALqSuu1qKaZJKKyIHVRn-wBke-Na78O0,7747
|
|
6
|
+
extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
|
|
7
|
+
extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
|
|
8
|
+
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
+
extract_python-0.3.0.dist-info/METADATA,sha256=MwFN5PsmkUEv8sbhuS6joh7r17W6rrikpJb4Yr-rdKk,1132
|
|
10
|
+
extract_python-0.3.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
extract_python-0.3.0.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
|
|
2
|
-
extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
|
|
3
|
-
extract_python/docling_.py,sha256=FMDsPVz05sGMPvIOX90lOLygWp6nC5DEjRfgx_ESPJ4,8530
|
|
4
|
-
extract_python/marker_.py,sha256=z3PkUUStC-E78HhqByMwJ7re6-I7YUQzSxWToegHrUQ,4060
|
|
5
|
-
extract_python/miner_u.py,sha256=f5pvLvay1ThBXNOI1R276aWSWsk5mhIPzWVjCy2u_lw,7493
|
|
6
|
-
extract_python/objects.py,sha256=gTyGA5gaMAmW5P_PbAO2LNMqtP69CxlknebBFTojiwQ,7322
|
|
7
|
-
extract_python/pipeline.py,sha256=qUgGar1rlYQgNz78BcUT1nQRsG3hy5UwpCl0e-0V77I,1098
|
|
8
|
-
extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
|
|
9
|
-
extract_python-0.1.0.dist-info/METADATA,sha256=wyYMrleKk9yUU1UaTYT0EsGpw_e3qbE8LOBanyLv0Qg,1132
|
|
10
|
-
extract_python-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
-
extract_python-0.1.0.dist-info/RECORD,,
|
|
File without changes
|