extract-python 0.5.15__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python/constants.py +1 -1
- extract_python/docling_.py +33 -51
- extract_python/marker_.py +18 -85
- extract_python/miner_u.py +10 -38
- extract_python/utils.py +15 -2
- {extract_python-0.5.15.dist-info → extract_python-0.7.0.dist-info}/METADATA +2 -2
- extract_python-0.7.0.dist-info/RECORD +9 -0
- extract_python-0.5.15.dist-info/RECORD +0 -9
- {extract_python-0.5.15.dist-info → extract_python-0.7.0.dist-info}/WHEEL +0 -0
extract_python/constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
ARTIFACTS = "artifacts"
|
|
2
|
-
DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
|
|
2
|
+
DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
|
extract_python/docling_.py
CHANGED
|
@@ -8,7 +8,6 @@ from functools import partial
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Any, Self
|
|
10
10
|
|
|
11
|
-
from docling.datamodel.base_models import InputFormat
|
|
12
11
|
from docling.datamodel.document import ConversionResult
|
|
13
12
|
from docling.datamodel.pipeline_options import PipelineOptions
|
|
14
13
|
from docling.document_converter import DocumentConverter, FormatOption
|
|
@@ -18,26 +17,23 @@ from docling_core.types.doc import ImageRefMode
|
|
|
18
17
|
from docling_core.types.io import DocumentStream
|
|
19
18
|
from extract_core import (
|
|
20
19
|
BaseModel,
|
|
21
|
-
Device,
|
|
22
20
|
DoclingFormatOption,
|
|
23
21
|
DoclingPipelineConfig,
|
|
24
22
|
Error,
|
|
25
23
|
InputDoc,
|
|
26
24
|
MarkdownDoc,
|
|
27
25
|
OutputFormat,
|
|
28
|
-
PageIndexes,
|
|
29
26
|
Pipeline,
|
|
30
27
|
PipelineType,
|
|
31
28
|
Result,
|
|
32
29
|
Status,
|
|
33
30
|
)
|
|
34
31
|
from icij_common.pydantic_utils import merge_configs
|
|
35
|
-
from icij_common.registrable import FromConfig
|
|
36
32
|
from pydantic import ConfigDict, field_serializer
|
|
37
33
|
from pydantic_core.core_schema import SerializerFunctionWrapHandler
|
|
38
34
|
|
|
39
35
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
40
|
-
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
|
|
36
|
+
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
|
|
41
37
|
|
|
42
38
|
logger = logging.getLogger(__name__)
|
|
43
39
|
|
|
@@ -46,16 +42,12 @@ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "mode
|
|
|
46
42
|
|
|
47
43
|
@Pipeline.register(PipelineType.DOCLING)
|
|
48
44
|
class DoclingPipeline(Pipeline):
|
|
49
|
-
def __init__(
|
|
50
|
-
|
|
51
|
-
format_options
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
super().__init__(device)
|
|
56
|
-
format_options = dict()
|
|
57
|
-
for k, v in format_options.items():
|
|
58
|
-
format_options[k] = v.to_docling(self._device)
|
|
45
|
+
def __init__(self, config: DoclingPipelineConfig):
|
|
46
|
+
super().__init__(config)
|
|
47
|
+
format_options = {
|
|
48
|
+
k: v.to_docling(self._device)
|
|
49
|
+
for k, v in self._config.format_options.items()
|
|
50
|
+
}
|
|
59
51
|
logger.info(
|
|
60
52
|
"resolved format options to: %s",
|
|
61
53
|
lambda: partial(json.dumps, format_options, indent=2),
|
|
@@ -81,15 +73,6 @@ class DoclingPipeline(Pipeline):
|
|
|
81
73
|
doc = next(docs)
|
|
82
74
|
yield _to_result(res, doc, output_format, output_path=output_path)
|
|
83
75
|
|
|
84
|
-
@classmethod
|
|
85
|
-
def _from_config(
|
|
86
|
-
cls,
|
|
87
|
-
config: DoclingPipelineConfig,
|
|
88
|
-
*,
|
|
89
|
-
device: Device = Device.CPU,
|
|
90
|
-
) -> FromConfig:
|
|
91
|
-
return cls(config.format_options, device=device)
|
|
92
|
-
|
|
93
76
|
|
|
94
77
|
def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
|
95
78
|
for d in docs:
|
|
@@ -131,39 +114,38 @@ def _to_markdown_doc(
|
|
|
131
114
|
raise FileExistsError(f"directory {md_dir} already exists")
|
|
132
115
|
# Let's avoid issue of duplicated input file names flattened top level
|
|
133
116
|
md_filename = md_dir_name + OutputFormat.MARKDOWN
|
|
134
|
-
total_length = 0
|
|
135
|
-
n_pages = len(res.pages)
|
|
136
|
-
|
|
137
117
|
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
|
|
138
118
|
tmp_dir = Path(td)
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
artifacts_dir=Path(ARTIFACTS),
|
|
150
|
-
**kwargs,
|
|
151
|
-
)
|
|
152
|
-
content = page_path.read_text()
|
|
153
|
-
if page_i > 0:
|
|
154
|
-
content += "\n"
|
|
155
|
-
if page_i < n_pages - 1:
|
|
156
|
-
content += page_sep
|
|
157
|
-
total_length += len(content)
|
|
158
|
-
end_indices.append(total_length)
|
|
159
|
-
f.write(content)
|
|
160
|
-
f.flush()
|
|
161
|
-
page_path.unlink()
|
|
119
|
+
md_path = tmp_dir / md_filename
|
|
120
|
+
current_page_path = tmp_dir / "page.md"
|
|
121
|
+
with chdir(tmp_dir):
|
|
122
|
+
# We do a chdir to bypass a Docling bug which only allows to maintain
|
|
123
|
+
# relative image ref when saving the markdown to a relative path
|
|
124
|
+
pages = _docling_pages_it(res, current_page_path, **kwargs)
|
|
125
|
+
with md_path.open("wb") as f:
|
|
126
|
+
pages = write_pages(pages, page_sep, f)
|
|
127
|
+
# Clean up the tmp page file before move everything to the end destination
|
|
128
|
+
current_page_path.unlink(missing_ok=True)
|
|
162
129
|
shutil.move(tmp_dir, md_dir)
|
|
163
|
-
pages = PageIndexes.from_page_end_indices(end_indices)
|
|
164
130
|
return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
|
165
131
|
|
|
166
132
|
|
|
133
|
+
def _docling_pages_it(
|
|
134
|
+
res: ConversionResult, output_path: Path, **kwargs
|
|
135
|
+
) -> Iterable[str]:
|
|
136
|
+
n_pages = len(res.pages)
|
|
137
|
+
for page_i in range(n_pages):
|
|
138
|
+
res.document.save_as_markdown(
|
|
139
|
+
output_path,
|
|
140
|
+
page_no=page_i + 1,
|
|
141
|
+
image_mode=ImageRefMode.REFERENCED,
|
|
142
|
+
artifacts_dir=Path(ARTIFACTS),
|
|
143
|
+
**kwargs,
|
|
144
|
+
)
|
|
145
|
+
content = output_path.read_text()
|
|
146
|
+
yield content
|
|
147
|
+
|
|
148
|
+
|
|
167
149
|
class SerializableFormatOptions(DoclingFormatOption):
|
|
168
150
|
# Utility class to serialize Python format options into a JSON which can be
|
|
169
151
|
# correctly deserialized into a docling FormatOption
|
extract_python/marker_.py
CHANGED
|
@@ -2,82 +2,32 @@ import asyncio
|
|
|
2
2
|
import gc
|
|
3
3
|
from collections.abc import AsyncGenerator, Iterable
|
|
4
4
|
from copy import deepcopy
|
|
5
|
-
from functools import cache
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import TYPE_CHECKING
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
8
7
|
|
|
9
8
|
from extract_core import (
|
|
10
|
-
BasePipelineConfig,
|
|
11
|
-
Device,
|
|
12
9
|
InputDoc,
|
|
13
10
|
MarkdownDoc,
|
|
14
11
|
OutputFormat,
|
|
15
|
-
PageIndexes,
|
|
16
12
|
Pipeline,
|
|
17
13
|
PipelineType,
|
|
18
14
|
Result,
|
|
19
15
|
Status,
|
|
20
|
-
SupportedExt,
|
|
21
16
|
)
|
|
22
|
-
from pydantic import Field
|
|
23
17
|
|
|
24
|
-
from .constants import ARTIFACTS
|
|
25
|
-
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
18
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
19
|
+
from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
|
|
26
20
|
|
|
27
21
|
if TYPE_CHECKING:
|
|
28
22
|
from marker.converters.pdf import PdfConverter
|
|
29
23
|
from PIL import Image
|
|
30
24
|
|
|
31
25
|
|
|
32
|
-
class MarkerPipelineConfig(BasePipelineConfig):
|
|
33
|
-
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
|
|
34
|
-
|
|
35
|
-
config: dict[str, Any] = Field(default_factory=dict)
|
|
36
|
-
|
|
37
|
-
@classmethod
|
|
38
|
-
@cache
|
|
39
|
-
def supported_exts(cls) -> set[SupportedExt]:
|
|
40
|
-
# Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
|
|
41
|
-
return {
|
|
42
|
-
SupportedExt.PDF,
|
|
43
|
-
SupportedExt.XLS,
|
|
44
|
-
SupportedExt.XLSX,
|
|
45
|
-
SupportedExt.XLSM,
|
|
46
|
-
SupportedExt.CSV,
|
|
47
|
-
SupportedExt.ODS,
|
|
48
|
-
SupportedExt.DOC,
|
|
49
|
-
SupportedExt.DOCX,
|
|
50
|
-
SupportedExt.ODT,
|
|
51
|
-
SupportedExt.PPT,
|
|
52
|
-
SupportedExt.PPTX,
|
|
53
|
-
SupportedExt.ODP,
|
|
54
|
-
SupportedExt.HTLM,
|
|
55
|
-
SupportedExt.EPUB,
|
|
56
|
-
SupportedExt.PNG,
|
|
57
|
-
SupportedExt.JPG,
|
|
58
|
-
SupportedExt.JPEG,
|
|
59
|
-
SupportedExt.WEBP,
|
|
60
|
-
SupportedExt.GIF,
|
|
61
|
-
SupportedExt.TIFF,
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
|
|
65
26
|
_MARKER_CONVERSION_ERRORS = tuple()
|
|
66
27
|
|
|
67
28
|
|
|
68
29
|
@Pipeline.register(PipelineType.MARKER)
|
|
69
30
|
class MarkerPipeline(Pipeline):
|
|
70
|
-
def __init__(
|
|
71
|
-
self,
|
|
72
|
-
marker_config: dict[str, Any] | None = None,
|
|
73
|
-
*,
|
|
74
|
-
device: Device = Device.CPU,
|
|
75
|
-
):
|
|
76
|
-
super().__init__(device)
|
|
77
|
-
if marker_config is None:
|
|
78
|
-
marker_config = dict()
|
|
79
|
-
self._marker_config = marker_config
|
|
80
|
-
|
|
81
31
|
async def extract_content(
|
|
82
32
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
83
33
|
) -> AsyncGenerator[Result, None]:
|
|
@@ -85,7 +35,7 @@ class MarkerPipeline(Pipeline):
|
|
|
85
35
|
from marker.converters.pdf import PdfConverter # noqa: PLC0415
|
|
86
36
|
from marker.models import create_model_dict # noqa: PLC0415
|
|
87
37
|
|
|
88
|
-
config = deepcopy(self._marker_config)
|
|
38
|
+
config = deepcopy(self._config.config)
|
|
89
39
|
config["output_format"] = output_format.to_marker()
|
|
90
40
|
config_parser = ConfigParser(config)
|
|
91
41
|
renderer = config_parser.get_renderer()
|
|
@@ -98,15 +48,6 @@ class MarkerPipeline(Pipeline):
|
|
|
98
48
|
for doc in docs:
|
|
99
49
|
yield await _process_doc(doc, converter, output_format, output_path)
|
|
100
50
|
|
|
101
|
-
@classmethod
|
|
102
|
-
def _from_config(
|
|
103
|
-
cls,
|
|
104
|
-
config: MarkerPipelineConfig,
|
|
105
|
-
*,
|
|
106
|
-
device: Device = Device.CPU,
|
|
107
|
-
) -> Self:
|
|
108
|
-
return cls(config.config, device=device)
|
|
109
|
-
|
|
110
51
|
|
|
111
52
|
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
|
|
112
53
|
async def _process_doc(
|
|
@@ -121,7 +62,9 @@ async def _process_doc(
|
|
|
121
62
|
content, _, images = text_from_rendered(rendered)
|
|
122
63
|
match output_format:
|
|
123
64
|
case OutputFormat.MARKDOWN:
|
|
124
|
-
output = _to_markdown_doc(
|
|
65
|
+
output = _to_markdown_doc(
|
|
66
|
+
doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
|
|
67
|
+
)
|
|
125
68
|
case _:
|
|
126
69
|
raise NotImplementedError(f"unsupported output format {output_format}")
|
|
127
70
|
input_doc = doc.without_content()
|
|
@@ -129,7 +72,12 @@ async def _process_doc(
|
|
|
129
72
|
|
|
130
73
|
|
|
131
74
|
def _to_markdown_doc(
|
|
132
|
-
input_doc: InputDoc,
|
|
75
|
+
input_doc: InputDoc,
|
|
76
|
+
content: str,
|
|
77
|
+
images: dict[str, "Image"],
|
|
78
|
+
output_path: Path,
|
|
79
|
+
*,
|
|
80
|
+
page_sep: str = DEFAULT_MD_PAGE_SEP,
|
|
133
81
|
) -> MarkdownDoc:
|
|
134
82
|
from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
|
|
135
83
|
|
|
@@ -143,24 +91,9 @@ def _to_markdown_doc(
|
|
|
143
91
|
im.save(artifacts_dir / im_name)
|
|
144
92
|
del images
|
|
145
93
|
gc.collect()
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
md_path
|
|
150
|
-
|
|
151
|
-
)
|
|
152
|
-
total_length = 0
|
|
153
|
-
end_indices = []
|
|
154
|
-
with md_path.open("w", encoding="utf-8") as f:
|
|
155
|
-
for page_i, page_content in enumerate(content):
|
|
156
|
-
content = page_content
|
|
157
|
-
if page_i > 0:
|
|
158
|
-
content += "\n"
|
|
159
|
-
if page_i < n_pages - 1:
|
|
160
|
-
content += page_sep
|
|
161
|
-
total_length += len(content)
|
|
162
|
-
end_indices.append(total_length)
|
|
163
|
-
f.write(content)
|
|
164
|
-
f.flush()
|
|
165
|
-
pages = PageIndexes.from_page_end_indices(end_indices)
|
|
94
|
+
pages = content.split(MarkdownRenderer.page_separator)
|
|
95
|
+
md_path = output_path / md_dir_name / md_dir_name
|
|
96
|
+
md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
|
|
97
|
+
with md_path.open("wb") as f:
|
|
98
|
+
pages = write_pages(pages, page_sep, f)
|
|
166
99
|
return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
extract_python/miner_u.py
CHANGED
|
@@ -5,17 +5,13 @@ from collections.abc import AsyncGenerator, Callable, Iterable
|
|
|
5
5
|
from functools import partial
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from tempfile import TemporaryDirectory
|
|
8
|
-
from typing import Self
|
|
9
8
|
|
|
10
9
|
from extract_core import (
|
|
11
10
|
ConversionOutput,
|
|
12
|
-
Device,
|
|
13
11
|
InputDoc,
|
|
14
12
|
MinerUBackend,
|
|
15
|
-
MinerUConfig,
|
|
16
13
|
MinerUPipelineConfig,
|
|
17
14
|
OutputFormat,
|
|
18
|
-
PageIndexes,
|
|
19
15
|
Pipeline,
|
|
20
16
|
PipelineType,
|
|
21
17
|
Result,
|
|
@@ -23,7 +19,7 @@ from extract_core import (
|
|
|
23
19
|
)
|
|
24
20
|
|
|
25
21
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
26
|
-
from .utils import path_to_artifacts_dirname, reset_env
|
|
22
|
+
from .utils import path_to_artifacts_dirname, reset_env, write_pages
|
|
27
23
|
|
|
28
24
|
_MINER_U_CONVERSION_ERRORS = tuple()
|
|
29
25
|
MDMakeFunction = Callable[[list, str, str], str | None]
|
|
@@ -31,13 +27,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
|
|
|
31
27
|
|
|
32
28
|
@Pipeline.register(PipelineType.MINER_U)
|
|
33
29
|
class MinerUPipeline(Pipeline):
|
|
34
|
-
def __init__(
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
self._config = config
|
|
39
|
-
self._language = language
|
|
40
|
-
self._md_make_fn = _parse_md_make_fn(config.backend)
|
|
30
|
+
def __init__(self, config: MinerUPipelineConfig):
|
|
31
|
+
super().__init__(config)
|
|
32
|
+
self._language = self._config.language
|
|
33
|
+
self._md_make_fn = _parse_md_make_fn(self._config.config.backend)
|
|
41
34
|
|
|
42
35
|
async def extract_content(
|
|
43
36
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
@@ -59,7 +52,7 @@ class MinerUPipeline(Pipeline):
|
|
|
59
52
|
pdf_file_names=pdfs_names,
|
|
60
53
|
pdf_bytes_list=pdfs_bytes,
|
|
61
54
|
p_lang_list=p_lang_list,
|
|
62
|
-
**self._config.as_parse_kwargs(),
|
|
55
|
+
**self._config.config.as_parse_kwargs(),
|
|
63
56
|
)
|
|
64
57
|
res_paths = [
|
|
65
58
|
_revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
|
|
@@ -73,15 +66,6 @@ class MinerUPipeline(Pipeline):
|
|
|
73
66
|
output_path=output_path,
|
|
74
67
|
)
|
|
75
68
|
|
|
76
|
-
@classmethod
|
|
77
|
-
def _from_config(
|
|
78
|
-
cls,
|
|
79
|
-
config: MinerUPipelineConfig,
|
|
80
|
-
*,
|
|
81
|
-
device: Device = Device.CPU,
|
|
82
|
-
) -> Self:
|
|
83
|
-
return cls(config.config, language=config.language, device=device)
|
|
84
|
-
|
|
85
69
|
|
|
86
70
|
def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
|
|
87
71
|
output_path = output_dir / pdf_filename
|
|
@@ -163,21 +147,9 @@ def _dump_md_content(
|
|
|
163
147
|
|
|
164
148
|
if md_make_mode is None:
|
|
165
149
|
md_make_mode = MakeMode.MM_MD
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
n_pages = len(pdf_info)
|
|
170
|
-
for page_i, page in enumerate(pdf_info):
|
|
171
|
-
content = md_make_fn([page], md_make_mode, str(im_dir))
|
|
172
|
-
if page_i > 0:
|
|
173
|
-
content += "\n"
|
|
174
|
-
if page_i < n_pages - 1:
|
|
175
|
-
content += page_sep
|
|
176
|
-
total_length += len(content)
|
|
177
|
-
end_indices.append(total_length)
|
|
178
|
-
f.write(content)
|
|
179
|
-
f.flush()
|
|
180
|
-
end_indices = PageIndexes.from_page_end_indices(end_indices)
|
|
150
|
+
pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
|
|
151
|
+
with md_path.open("wb") as f:
|
|
152
|
+
pages = write_pages(pages, page_sep, f)
|
|
181
153
|
output_path = md_path.parent.relative_to(output_path)
|
|
182
|
-
output = ConversionOutput(path=output_path, pages=end_indices)
|
|
154
|
+
output = ConversionOutput(path=output_path, pages=pages)
|
|
183
155
|
return output
|
extract_python/utils.py
CHANGED
|
@@ -5,9 +5,9 @@ from copy import copy
|
|
|
5
5
|
from functools import wraps
|
|
6
6
|
from itertools import tee
|
|
7
7
|
from pathlib import Path, PurePath
|
|
8
|
-
from typing import Protocol, TypeVar
|
|
8
|
+
from typing import BinaryIO, Protocol, TypeVar
|
|
9
9
|
|
|
10
|
-
from extract_core import Error, InputDoc, Result, Status
|
|
10
|
+
from extract_core import Error, InputDoc, Pages, Result, Status
|
|
11
11
|
|
|
12
12
|
R = TypeVar("R")
|
|
13
13
|
In = TypeVar("In")
|
|
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
|
|
|
73
73
|
finally:
|
|
74
74
|
os.environ.clear()
|
|
75
75
|
os.environ.update(old_env)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
|
|
79
|
+
pages_byte_sizes = []
|
|
80
|
+
pages = iter(pages)
|
|
81
|
+
content = None
|
|
82
|
+
for p in pages:
|
|
83
|
+
if content:
|
|
84
|
+
pages_byte_sizes.append(out.write((content + page_sep).encode()))
|
|
85
|
+
content = p
|
|
86
|
+
if content:
|
|
87
|
+
pages_byte_sizes.append(out.write(content.encode()))
|
|
88
|
+
return Pages.from_pages_bytes_sizes(pages_byte_sizes)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-python
|
|
3
|
-
Version: 0.5.15
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Structured content extraction
|
|
5
5
|
Project-URL: Homepage, https://github.com/ICIJ/extract-python
|
|
6
6
|
Project-URL: Repository, https://github.com/ICIJ/extract-python
|
|
7
7
|
Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
|
|
8
8
|
Author-email: Clément Doumouro <cdoumouro@icij.org>
|
|
9
9
|
Requires-Python: <3.14,>=3.11
|
|
10
|
-
Requires-Dist: extract-core~=0.
|
|
10
|
+
Requires-Dist: extract-core~=0.6.0
|
|
11
11
|
Requires-Dist: icij-common~=0.8.2
|
|
12
12
|
Provides-Extra: benches
|
|
13
13
|
Requires-Dist: html2image~=2.0.7; extra == 'benches'
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
|
|
2
|
+
extract_python/constants.py,sha256=659V40LcTWJhX3IbuJLSSvI5AsGJh9ciMrGCfzJn2zA,98
|
|
3
|
+
extract_python/docling_.py,sha256=j1rVhKG7m1ef43VDsS6XGP0INPRY1Rcovzf1mjZ57tU,7352
|
|
4
|
+
extract_python/marker_.py,sha256=R_SXhqk5GmEWqJrYgg3tRdXKHms7n0FueNr-aOCDvLc,3358
|
|
5
|
+
extract_python/miner_u.py,sha256=MtXmnG-dFIGa3dXVrixfUU32yc88US0dhu7E3x6wQIM,5415
|
|
6
|
+
extract_python/utils.py,sha256=9IWW9_VVdUPHOHhdDgkXx16R1X1FPz8-nTBNYsLCFfA,2443
|
|
7
|
+
extract_python-0.7.0.dist-info/METADATA,sha256=my-lfG6yqNEat77SC6mAfFerRRmTtksQMKYwHsg8aVE,1218
|
|
8
|
+
extract_python-0.7.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
extract_python-0.7.0.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
extract_python/__init__.py,sha256=DA2LUro6vMjfS8fb2MsqO95FbJEZHyZ7kFyn42q02Wk,759
|
|
2
|
-
extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
|
|
3
|
-
extract_python/docling_.py,sha256=1ujMmtD63RaSdR1gvWbQAm396JODj44uBWtz9M4cFyI,7864
|
|
4
|
-
extract_python/marker_.py,sha256=oxN1unJ9x8YW5jds1STCc2wvQ30KzQNy3dXbCIuTuQc,5311
|
|
5
|
-
extract_python/miner_u.py,sha256=Ien3H7vZXLCACVjSMP2NAiog7yvvPq7oGgLGcfLZfpA,6159
|
|
6
|
-
extract_python/utils.py,sha256=HL-84NkjfJEiWp8GPRaJIiBL2Cywp4ABN41EkxYYnPI,2004
|
|
7
|
-
extract_python-0.5.15.dist-info/METADATA,sha256=S3upxGMF81cp6kMaqteJJ5gMBmQ2dQe4Xcil8DGq8s0,1219
|
|
8
|
-
extract_python-0.5.15.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
-
extract_python-0.5.15.dist-info/RECORD,,
|
|
File without changes
|