extract-python 0.5.14__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_python-0.5.14 → extract_python-0.6.0}/PKG-INFO +2 -2
- {extract_python-0.5.14 → extract_python-0.6.0}/extract_python/docling_.py +13 -20
- {extract_python-0.5.14 → extract_python-0.6.0}/extract_python/marker_.py +2 -60
- {extract_python-0.5.14 → extract_python-0.6.0}/extract_python/miner_u.py +5 -20
- {extract_python-0.5.14 → extract_python-0.6.0}/pyproject.toml +15 -1
- extract_python-0.6.0/uv.lock +5525 -0
- extract_python-0.5.14/uv.lock +0 -5379
- {extract_python-0.5.14 → extract_python-0.6.0}/.gitignore +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/.python-version +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/README.md +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/benches/__init__.py +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/benches/compare.ipynb +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/benches/compare.py +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/benches/constants.py +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/data/.gitignore +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/extract_python/__init__.py +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/extract_python/constants.py +0 -0
- {extract_python-0.5.14 → extract_python-0.6.0}/extract_python/utils.py +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-python
|
|
3
|
-
Version: 0.5.14
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Structured content extraction
|
|
5
5
|
Project-URL: Homepage, https://github.com/ICIJ/extract-python
|
|
6
6
|
Project-URL: Repository, https://github.com/ICIJ/extract-python
|
|
7
7
|
Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
|
|
8
8
|
Author-email: Clément Doumouro <cdoumouro@icij.org>
|
|
9
9
|
Requires-Python: <3.14,>=3.11
|
|
10
|
-
Requires-Dist: extract-core~=0.
|
|
10
|
+
Requires-Dist: extract-core~=0.6.0
|
|
11
11
|
Requires-Dist: icij-common~=0.8.2
|
|
12
12
|
Provides-Extra: benches
|
|
13
13
|
Requires-Dist: html2image~=2.0.7; extra == 'benches'
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
2
4
|
import shutil
|
|
3
5
|
import tempfile
|
|
4
6
|
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
7
|
+
from functools import partial
|
|
5
8
|
from pathlib import Path
|
|
6
9
|
from typing import Any, Self
|
|
7
10
|
|
|
8
|
-
from docling.datamodel.base_models import InputFormat
|
|
9
11
|
from docling.datamodel.document import ConversionResult
|
|
10
12
|
from docling.datamodel.pipeline_options import PipelineOptions
|
|
11
13
|
from docling.document_converter import DocumentConverter, FormatOption
|
|
@@ -15,7 +17,6 @@ from docling_core.types.doc import ImageRefMode
|
|
|
15
17
|
from docling_core.types.io import DocumentStream
|
|
16
18
|
from extract_core import (
|
|
17
19
|
BaseModel,
|
|
18
|
-
Device,
|
|
19
20
|
DoclingFormatOption,
|
|
20
21
|
DoclingPipelineConfig,
|
|
21
22
|
Error,
|
|
@@ -29,28 +30,29 @@ from extract_core import (
|
|
|
29
30
|
Status,
|
|
30
31
|
)
|
|
31
32
|
from icij_common.pydantic_utils import merge_configs
|
|
32
|
-
from icij_common.registrable import FromConfig
|
|
33
33
|
from pydantic import ConfigDict, field_serializer
|
|
34
34
|
from pydantic_core.core_schema import SerializerFunctionWrapHandler
|
|
35
35
|
|
|
36
36
|
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
|
|
37
37
|
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
|
|
38
38
|
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
39
41
|
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
40
42
|
|
|
41
43
|
|
|
42
44
|
@Pipeline.register(PipelineType.DOCLING)
|
|
43
45
|
class DoclingPipeline(Pipeline):
|
|
44
|
-
def __init__(
|
|
45
|
-
|
|
46
|
-
format_options: dict["InputFormat", DoclingFormatOption] | None = None,
|
|
47
|
-
*,
|
|
48
|
-
device: Device = Device.CPU,
|
|
49
|
-
):
|
|
50
|
-
super().__init__(device)
|
|
46
|
+
def __init__(self, config: DoclingPipelineConfig):
|
|
47
|
+
super().__init__(config)
|
|
51
48
|
format_options = {
|
|
52
|
-
k: v.to_docling(self._device)
|
|
49
|
+
k: v.to_docling(self._device)
|
|
50
|
+
for k, v in self._config.format_options.items()
|
|
53
51
|
}
|
|
52
|
+
logger.info(
|
|
53
|
+
"resolved format options to: %s",
|
|
54
|
+
lambda: partial(json.dumps, format_options, indent=2),
|
|
55
|
+
)
|
|
54
56
|
allowed_format = [
|
|
55
57
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
56
58
|
]
|
|
@@ -72,15 +74,6 @@ class DoclingPipeline(Pipeline):
|
|
|
72
74
|
doc = next(docs)
|
|
73
75
|
yield _to_result(res, doc, output_format, output_path=output_path)
|
|
74
76
|
|
|
75
|
-
@classmethod
|
|
76
|
-
def _from_config(
|
|
77
|
-
cls,
|
|
78
|
-
config: DoclingPipelineConfig,
|
|
79
|
-
*,
|
|
80
|
-
device: Device = Device.CPU,
|
|
81
|
-
) -> FromConfig:
|
|
82
|
-
return cls(config.format_options, device=device)
|
|
83
|
-
|
|
84
77
|
|
|
85
78
|
def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
|
86
79
|
for d in docs:
|
|
@@ -2,13 +2,10 @@ import asyncio
|
|
|
2
2
|
import gc
|
|
3
3
|
from collections.abc import AsyncGenerator, Iterable
|
|
4
4
|
from copy import deepcopy
|
|
5
|
-
from functools import cache
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import TYPE_CHECKING
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
8
7
|
|
|
9
8
|
from extract_core import (
|
|
10
|
-
BasePipelineConfig,
|
|
11
|
-
Device,
|
|
12
9
|
InputDoc,
|
|
13
10
|
MarkdownDoc,
|
|
14
11
|
OutputFormat,
|
|
@@ -17,9 +14,7 @@ from extract_core import (
|
|
|
17
14
|
PipelineType,
|
|
18
15
|
Result,
|
|
19
16
|
Status,
|
|
20
|
-
SupportedExt,
|
|
21
17
|
)
|
|
22
|
-
from pydantic import Field
|
|
23
18
|
|
|
24
19
|
from .constants import ARTIFACTS
|
|
25
20
|
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
@@ -29,55 +24,11 @@ if TYPE_CHECKING:
|
|
|
29
24
|
from PIL import Image
|
|
30
25
|
|
|
31
26
|
|
|
32
|
-
class MarkerPipelineConfig(BasePipelineConfig):
|
|
33
|
-
pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
|
|
34
|
-
|
|
35
|
-
config: dict[str, Any] = Field(default_factory=dict)
|
|
36
|
-
|
|
37
|
-
@classmethod
|
|
38
|
-
@cache
|
|
39
|
-
def supported_exts(cls) -> set[SupportedExt]:
|
|
40
|
-
# Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
|
|
41
|
-
return {
|
|
42
|
-
SupportedExt.PDF,
|
|
43
|
-
SupportedExt.XLS,
|
|
44
|
-
SupportedExt.XLSX,
|
|
45
|
-
SupportedExt.XLSM,
|
|
46
|
-
SupportedExt.CSV,
|
|
47
|
-
SupportedExt.ODS,
|
|
48
|
-
SupportedExt.DOC,
|
|
49
|
-
SupportedExt.DOCX,
|
|
50
|
-
SupportedExt.ODT,
|
|
51
|
-
SupportedExt.PPT,
|
|
52
|
-
SupportedExt.PPTX,
|
|
53
|
-
SupportedExt.ODP,
|
|
54
|
-
SupportedExt.HTLM,
|
|
55
|
-
SupportedExt.EPUB,
|
|
56
|
-
SupportedExt.PNG,
|
|
57
|
-
SupportedExt.JPG,
|
|
58
|
-
SupportedExt.JPEG,
|
|
59
|
-
SupportedExt.WEBP,
|
|
60
|
-
SupportedExt.GIF,
|
|
61
|
-
SupportedExt.TIFF,
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
|
|
65
27
|
_MARKER_CONVERSION_ERRORS = tuple()
|
|
66
28
|
|
|
67
29
|
|
|
68
30
|
@Pipeline.register(PipelineType.MARKER)
|
|
69
31
|
class MarkerPipeline(Pipeline):
|
|
70
|
-
def __init__(
|
|
71
|
-
self,
|
|
72
|
-
marker_config: dict[str, Any] | None = None,
|
|
73
|
-
*,
|
|
74
|
-
device: Device = Device.CPU,
|
|
75
|
-
):
|
|
76
|
-
super().__init__(device)
|
|
77
|
-
if marker_config is None:
|
|
78
|
-
marker_config = dict()
|
|
79
|
-
self._marker_config = marker_config
|
|
80
|
-
|
|
81
32
|
async def extract_content(
|
|
82
33
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
83
34
|
) -> AsyncGenerator[Result, None]:
|
|
@@ -85,7 +36,7 @@ class MarkerPipeline(Pipeline):
|
|
|
85
36
|
from marker.converters.pdf import PdfConverter # noqa: PLC0415
|
|
86
37
|
from marker.models import create_model_dict # noqa: PLC0415
|
|
87
38
|
|
|
88
|
-
config = deepcopy(self._marker_config)
|
|
39
|
+
config = deepcopy(self._config.config)
|
|
89
40
|
config["output_format"] = output_format.to_marker()
|
|
90
41
|
config_parser = ConfigParser(config)
|
|
91
42
|
renderer = config_parser.get_renderer()
|
|
@@ -98,15 +49,6 @@ class MarkerPipeline(Pipeline):
|
|
|
98
49
|
for doc in docs:
|
|
99
50
|
yield await _process_doc(doc, converter, output_format, output_path)
|
|
100
51
|
|
|
101
|
-
@classmethod
|
|
102
|
-
def _from_config(
|
|
103
|
-
cls,
|
|
104
|
-
config: MarkerPipelineConfig,
|
|
105
|
-
*,
|
|
106
|
-
device: Device = Device.CPU,
|
|
107
|
-
) -> Self:
|
|
108
|
-
return cls(config.config, device=device)
|
|
109
|
-
|
|
110
52
|
|
|
111
53
|
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
|
|
112
54
|
async def _process_doc(
|
|
@@ -5,14 +5,11 @@ from collections.abc import AsyncGenerator, Callable, Iterable
|
|
|
5
5
|
from functools import partial
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from tempfile import TemporaryDirectory
|
|
8
|
-
from typing import Self
|
|
9
8
|
|
|
10
9
|
from extract_core import (
|
|
11
10
|
ConversionOutput,
|
|
12
|
-
Device,
|
|
13
11
|
InputDoc,
|
|
14
12
|
MinerUBackend,
|
|
15
|
-
MinerUConfig,
|
|
16
13
|
MinerUPipelineConfig,
|
|
17
14
|
OutputFormat,
|
|
18
15
|
PageIndexes,
|
|
@@ -31,13 +28,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
|
|
|
31
28
|
|
|
32
29
|
@Pipeline.register(PipelineType.MINER_U)
|
|
33
30
|
class MinerUPipeline(Pipeline):
|
|
34
|
-
def __init__(
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
self._config = config
|
|
39
|
-
self._language = language
|
|
40
|
-
self._md_make_fn = _parse_md_make_fn(config.backend)
|
|
31
|
+
def __init__(self, config: MinerUPipelineConfig):
|
|
32
|
+
super().__init__(config)
|
|
33
|
+
self._language = self._config.language
|
|
34
|
+
self._md_make_fn = _parse_md_make_fn(self._config.config.backend)
|
|
41
35
|
|
|
42
36
|
async def extract_content(
|
|
43
37
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
@@ -59,7 +53,7 @@ class MinerUPipeline(Pipeline):
|
|
|
59
53
|
pdf_file_names=pdfs_names,
|
|
60
54
|
pdf_bytes_list=pdfs_bytes,
|
|
61
55
|
p_lang_list=p_lang_list,
|
|
62
|
-
**self._config.as_parse_kwargs(),
|
|
56
|
+
**self._config.config.as_parse_kwargs(),
|
|
63
57
|
)
|
|
64
58
|
res_paths = [
|
|
65
59
|
_revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
|
|
@@ -73,15 +67,6 @@ class MinerUPipeline(Pipeline):
|
|
|
73
67
|
output_path=output_path,
|
|
74
68
|
)
|
|
75
69
|
|
|
76
|
-
@classmethod
|
|
77
|
-
def _from_config(
|
|
78
|
-
cls,
|
|
79
|
-
config: MinerUPipelineConfig,
|
|
80
|
-
*,
|
|
81
|
-
device: Device = Device.CPU,
|
|
82
|
-
) -> Self:
|
|
83
|
-
return cls(config.config, language=config.language, device=device)
|
|
84
|
-
|
|
85
70
|
|
|
86
71
|
def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
|
|
87
72
|
output_path = output_dir / pdf_filename
|
|
@@ -9,7 +9,7 @@ readme = "README.md"
|
|
|
9
9
|
requires-python = ">=3.11,<3.14"
|
|
10
10
|
dependencies = [
|
|
11
11
|
"icij-common~=0.8.2",
|
|
12
|
-
"extract-core~=0.
|
|
12
|
+
"extract-core~=0.6.0",
|
|
13
13
|
]
|
|
14
14
|
|
|
15
15
|
[project.optional-dependencies]
|
|
@@ -51,14 +51,28 @@ override-dependencies = [
|
|
|
51
51
|
"pillow==11.3.0",
|
|
52
52
|
]
|
|
53
53
|
|
|
54
|
+
[[tool.uv.index]]
|
|
55
|
+
name = "pytorch-cpu"
|
|
56
|
+
url = "https://download.pytorch.org/whl/cpu"
|
|
57
|
+
explicit = true
|
|
58
|
+
|
|
59
|
+
|
|
54
60
|
[tool.uv.sources]
|
|
55
61
|
extract-core = { path = "../extract-core", editable = true }
|
|
62
|
+
torch = [
|
|
63
|
+
{ index = "pytorch-cpu" },
|
|
64
|
+
]
|
|
65
|
+
torchvision = [
|
|
66
|
+
{ index = "pytorch-cpu" },
|
|
67
|
+
]
|
|
56
68
|
|
|
57
69
|
[dependency-groups]
|
|
58
70
|
dev = [
|
|
59
71
|
"pytest~=8.3.5",
|
|
60
72
|
"pytest-asyncio~=0.25.3",
|
|
61
73
|
"ruff==0.15.2",
|
|
74
|
+
"torch==2.12.0",
|
|
75
|
+
"torchvision==0.27.0",
|
|
62
76
|
]
|
|
63
77
|
|
|
64
78
|
[project.urls]
|