extract-python 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_python-0.1.0/.gitignore +19 -0
- extract_python-0.1.0/PKG-INFO +24 -0
- extract_python-0.1.0/README.md +0 -0
- extract_python-0.1.0/extract_python/__init__.py +41 -0
- extract_python-0.1.0/extract_python/constants.py +6 -0
- extract_python-0.1.0/extract_python/docling_.py +233 -0
- extract_python-0.1.0/extract_python/marker_.py +118 -0
- extract_python-0.1.0/extract_python/miner_u.py +212 -0
- extract_python-0.1.0/extract_python/objects.py +254 -0
- extract_python-0.1.0/extract_python/pipeline.py +36 -0
- extract_python-0.1.0/extract_python/utils.py +70 -0
- extract_python-0.1.0/pyproject.toml +78 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: extract-python
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Structured content extraction
|
|
5
|
+
Project-URL: Homepage, https://github.com/ICIJ/extract-python
|
|
6
|
+
Project-URL: Repository, https://github.com/ICIJ/extract-python
|
|
7
|
+
Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
|
|
8
|
+
Author-email: Clément Doumouro <cdoumouro@icij.org>
|
|
9
|
+
Requires-Python: <3.14,>=3.11
|
|
10
|
+
Requires-Dist: icij-common~=0.8.2
|
|
11
|
+
Provides-Extra: benches
|
|
12
|
+
Requires-Dist: html2image~=2.0.7; extra == 'benches'
|
|
13
|
+
Requires-Dist: markdown2>=2.5.4; extra == 'benches'
|
|
14
|
+
Requires-Dist: notebook>=7.4.5; extra == 'benches'
|
|
15
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == 'benches'
|
|
16
|
+
Provides-Extra: docling
|
|
17
|
+
Requires-Dist: docling-slim[feat-ocr-easyocr,feat-ocr-mac,feat-ocr-tesserocr,standard]~=2.96; extra == 'docling'
|
|
18
|
+
Provides-Extra: marker
|
|
19
|
+
Requires-Dist: marker-pdf~=1.10; extra == 'marker'
|
|
20
|
+
Provides-Extra: mineru
|
|
21
|
+
Requires-Dist: mineru[mlx]~=3.2; (sys_platform == 'darwin') and extra == 'mineru'
|
|
22
|
+
Requires-Dist: mineru[pipeline,vlm]~=3.2; extra == 'mineru'
|
|
23
|
+
Requires-Dist: pydantic-extra-types[pycountry]~=2.11; extra == 'mineru'
|
|
24
|
+
Requires-Dist: six~=1.17; extra == 'mineru'
|
|
File without changes
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from .objects import InputDoc, OutputFormat, Status
|
|
2
|
+
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
3
|
+
|
|
4
|
+
# Docling is an optional extra: when it can't be imported, the names this
# package re-exports are still bound (to ``None``) so importing them never fails.
try:
    from .docling_ import (
        DOCLING_DEFAULT_ARTIFACTS_PATH,
        DoclingPipeline,
        DoclingPipelineConfig,
    )
except ImportError:
    # The fallback names must match the imported ones exactly, otherwise the
    # name listed in ``__all__`` is left undefined.
    DOCLING_DEFAULT_ARTIFACTS_PATH = None
    DoclingPipeline = None
    DoclingPipelineConfig = None
|
|
16
|
+
|
|
17
|
+
# Marker is an optional extra; bind ``None`` placeholders when it is missing.
try:
    from .marker_ import MarkerPipeline, MarkerPipelineConfig
except ImportError:
    MarkerPipeline = MarkerPipelineConfig = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# MinerU is an optional extra; bind ``None`` placeholders when it is missing.
try:
    from .miner_u import MinerUPipeline, MinerUPipelineConfig
except ImportError:
    MinerUPipeline = MinerUPipelineConfig = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Names re-exported by the package. The optional pipelines (Docling, Marker,
# MinerU) are imported in the try/except blocks above.
__all__ = [
    "DoclingPipeline",
    "DoclingPipelineConfig",
    "InputDoc",
    "DOCLING_DEFAULT_ARTIFACTS_PATH",
    "MarkerPipeline",
    "MarkerPipelineConfig",
    # MinerU names were imported above but missing from the export list.
    "MinerUPipeline",
    "MinerUPipelineConfig",
    "OutputFormat",
    "Pipeline",
    "PipelineType",
    "PipelineConfig",
    "Status",
]
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
import tempfile
|
|
3
|
+
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
|
+
from functools import cache
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, ClassVar, Literal, TypeVar
|
|
7
|
+
|
|
8
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
9
|
+
from docling.datamodel.base_models import InputFormat
|
|
10
|
+
from docling.datamodel.document import ConversionResult
|
|
11
|
+
from docling.datamodel.pipeline_options import (
|
|
12
|
+
EasyOcrOptions,
|
|
13
|
+
PdfPipelineOptions,
|
|
14
|
+
PipelineOptions,
|
|
15
|
+
VlmPipelineOptions,
|
|
16
|
+
)
|
|
17
|
+
from docling.document_converter import DocumentConverter, FormatOption
|
|
18
|
+
from docling.models.factories import get_ocr_factory
|
|
19
|
+
from docling.pipeline.base_pipeline import BasePipeline
|
|
20
|
+
from docling_core.types.doc import ImageRefMode
|
|
21
|
+
from docling_core.types.io import DocumentStream
|
|
22
|
+
from icij_common.registrable import FromConfig
|
|
23
|
+
from pydantic import Field, model_validator
|
|
24
|
+
|
|
25
|
+
from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
|
|
26
|
+
from .objects import (
|
|
27
|
+
BaseModel,
|
|
28
|
+
Error,
|
|
29
|
+
InputDoc,
|
|
30
|
+
MarkdownDoc,
|
|
31
|
+
OutputFormat,
|
|
32
|
+
PageIndexes,
|
|
33
|
+
Result,
|
|
34
|
+
Status,
|
|
35
|
+
)
|
|
36
|
+
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
37
|
+
from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
|
|
38
|
+
|
|
39
|
+
# ``~/.cache/docling/models`` resolved against the current user's home directory.
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home() / ".cache" / "docling" / "models"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _PdfPipelineOptions(PdfPipelineOptions):
    """``PdfPipelineOptions`` whose picture-image generation defaults to on.

    ``ocr_options`` may also be given as a plain dict carrying a ``kind`` key;
    it is then turned into an options object through Docling's OCR factory
    before the regular validation runs.
    """

    # Frozen: the flag can't be reassigned once the options object exists.
    generate_picture_images: bool = Field(default=True, frozen=True)

    @model_validator(mode="before")
    @classmethod
    def validate_ocr_options(cls, data: Any) -> Any:
        """Resolve dict-shaped ``ocr_options`` into a concrete options object.

        Args:
            data: raw validation input. Anything that is not a dict, or whose
                ``ocr_options`` entry is not a dict, is returned untouched.

        Returns:
            ``data`` itself when nothing has to be resolved, otherwise a new
            dict in which ``ocr_options`` is the object built by the OCR
            factory.

        Raises:
            KeyError: when the ``ocr_options`` dict has no ``kind`` key.
        """
        if not isinstance(data, dict):
            return data
        ocr_options = data.get("ocr_options")
        if not isinstance(ocr_options, dict):
            return data
        # Work on a copy: popping "kind" from the caller's own dict would make
        # the same input fail if it were validated a second time.
        ocr_options = dict(ocr_options)
        allow_external_plugins = ocr_options.get("allow_external_plugins", False)
        ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
        kind = ocr_options.pop("kind")
        # Return a new mapping instead of writing into the caller's dict.
        return {
            **data,
            "ocr_options": ocr_factory.create_options(kind=kind, **ocr_options),
        }
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# List of (pipeline name, options) pairs. The literal name ("pdf" or "vlm")
# tells which options model the second item of the pair holds.
OptionsByPipeline = list[
    tuple[Literal["pdf"], _PdfPipelineOptions]
    | tuple[Literal["vlm"], VlmPipelineOptions]
]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _default_pipeline_options() -> OptionsByPipeline:
    """Return the default options: a PDF pipeline using EasyOCR and a VLM pipeline."""
    pdf_options = _PdfPipelineOptions(ocr_options=EasyOcrOptions())
    vlm_options = VlmPipelineOptions()
    return [("pdf", pdf_options), ("vlm", vlm_options)]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class DoclingFormatOption(BaseModel):
    """Description of a Docling ``FormatOption`` using class names only."""

    # Name of a ``BasePipeline`` subclass.
    pipeline_cls: str
    # Name of an ``AbstractDocumentBackend`` subclass.
    backend_cls: str

    def to_docling(
        self, pipeline_options: dict[Literal["pdf", "vlm"], PipelineOptions]
    ) -> FormatOption:
        """Build the Docling ``FormatOption`` described by this model.

        Args:
            pipeline_options: options keyed by pipeline family. The entry used
                is picked from whether the pipeline class name contains "vlm"
                or "pdf" (case-insensitive, "vlm" checked first).

        Returns:
            A ``FormatOption`` holding the resolved pipeline class, backend
            class and the validated options (``None`` when the family has no
            entry).

        Raises:
            ValueError: when a class name can't be resolved, or when the
                pipeline name is neither a VLM nor a PDF pipeline.
        """
        pipeline_cls = _find_subcls(BasePipeline, self.pipeline_cls)
        backend_cls = _find_subcls(AbstractDocumentBackend, self.backend_cls)
        lowered_name = self.pipeline_cls.lower()
        if "vlm" in lowered_name:
            family, options_model = "vlm", VlmPipelineOptions
        elif "pdf" in lowered_name:
            family, options_model = "pdf", _PdfPipelineOptions
        else:
            raise ValueError(
                f"invalid pipeline_cls: {pipeline_cls}, expected a VLM or PDF pipeline"
            )
        selected_options = pipeline_options.get(family)
        if selected_options is not None:
            selected_options = options_model.model_validate(selected_options)
        return FormatOption(
            pipeline_cls=pipeline_cls,
            pipeline_options=selected_options,
            backend=backend_cls,
        )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@cache
def _default_format_option_names() -> tuple[tuple[InputFormat, str, str], ...]:
    """Return ``(format, pipeline class name, backend class name)`` triples.

    Only supported formats (currently PDF) are kept. The result is immutable,
    so it can be cached and shared safely.
    """
    supported_fmt = {InputFormat.PDF}
    return tuple(
        (fmt, opt.pipeline_cls.__name__, opt.backend.__name__)
        for fmt, opt in DocumentConverter().format_to_options.items()
        if fmt in supported_fmt
    )


def _default_format_options() -> dict[InputFormat, DoclingFormatOption]:
    """Return the default per-format options, read from ``DocumentConverter``.

    A fresh dict with fresh models is built on every call. This function is
    used as a ``default_factory``, so returning one cached dict would make all
    config instances share the same mutable default. Only the lookup through
    ``DocumentConverter`` is cached.
    """
    return {
        fmt: DoclingFormatOption(pipeline_cls=pipeline_cls, backend_cls=backend_cls)
        for fmt, pipeline_cls, backend_cls in _default_format_option_names()
    }
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
T = TypeVar("T")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _find_subcls(cls: type[T], name: str) -> type[T]:
    """Return the first subclass of ``cls`` whose ``__name__`` equals ``name``.

    Raises:
        ValueError: when ``all_subclasses(cls)`` yields no class with that name.
    """
    found = next((sub for sub in all_subclasses(cls) if sub.__name__ == name), None)
    if found is None:
        raise ValueError(f"unknown {cls.__name__} subclass {name}")
    return found
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@PipelineConfig.register()
class DoclingPipelineConfig(PipelineConfig):
    """Configuration of the Docling pipeline."""

    pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
    # NOTE(review): a ``Field(...)`` assigned to a ``ClassVar`` is presumably not
    # processed by pydantic, leaving this attribute bound to the field-info
    # object rather than to ``CPU_GROUP`` — confirm how ``PipelineConfig``
    # reads ``task_group``.
    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)

    pipeline_options: OptionsByPipeline = Field(
        default_factory=_default_pipeline_options
    )
    format_options: dict[InputFormat, DoclingFormatOption] = Field(
        default_factory=_default_format_options
    )

    def to_format_options(self) -> dict[InputFormat, FormatOption]:
        """Convert ``format_options`` into Docling ``FormatOption`` objects.

        The ``pipeline_options`` pairs are turned into a name -> options dict
        and handed to every ``DoclingFormatOption.to_docling`` call.
        """
        options_by_family = dict(self.pipeline_options)
        converted = {}
        for fmt, option in self.format_options.items():
            key = InputFormat(fmt)
            converted[key] = option.to_docling(options_by_family)
        return converted
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Format options of a default ``DoclingPipelineConfig``. Built once when the
# module is imported; ``DoclingPipeline`` uses it when given no format options.
DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@Pipeline.register(PipelineType.DOCLING)
class DoclingPipeline(Pipeline):
    """Pipeline converting documents with a Docling ``DocumentConverter``."""

    def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
        """Create the converter, using ``DEFAULT_FORMAT_OPTIONS`` when none are given."""
        self._converter = DocumentConverter(
            format_options=(
                DEFAULT_FORMAT_OPTIONS if format_options is None else format_options
            )
        )

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Convert ``docs`` and yield one ``Result`` per input document.

        Conversion errors are not raised (``raises_on_error=False``); they are
        reported through the yielded results.
        """
        preserved_docs, sources = map_and_preserve(_to_docling, docs)
        conversions = self._converter.convert_all(sources, raises_on_error=False)
        # strict=True: an input/output count mismatch must fail loudly.
        for input_doc, conversion in zip(preserved_docs, conversions, strict=True):
            yield _to_result(
                conversion, input_doc, output_format, output_path=output_path
            )

    @classmethod
    def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
        """Build the pipeline from the format options described by ``config``."""
        return cls(config.to_format_options())
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
    """Lazily yield ``doc.to_docling()`` for every document in ``docs``."""
    yield from (doc.to_docling() for doc in docs)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _to_result(
    res: ConversionResult,
    input_document: InputDoc,
    output_format: OutputFormat,
    output_path: Path,
    **kwargs,
) -> Result:
    """Turn a Docling ``ConversionResult`` into a ``Result``.

    ``output_path`` is created if needed. An output is only written when the
    converted status allows it; extra ``kwargs`` are forwarded to the writer.

    Raises:
        NotImplementedError: when the status allows conversion but
            ``output_format`` is not Markdown.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    status = Status.from_docling(res.status)
    output = None
    if status.allows_conversion:
        if output_format == OutputFormat.MARKDOWN:
            output = _to_markdown_doc(res, output_path, **kwargs)
        else:
            raise NotImplementedError(f"unsupported output format {output_format}")
    errors = [Error.from_docling(docling_error) for docling_error in res.errors]
    # The result must not carry the input content (see ``without_content``).
    stripped_input = input_document.without_content()
    return Result(input=stripped_input, status=status, errors=errors, output=output)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _to_markdown_doc(
    res: ConversionResult,
    output_path: Path,
    page_sep: str = DEFAULT_MD_PAGE_SEP,
    **kwargs,
) -> MarkdownDoc:
    """Write a converted document as a single Markdown file, page by page.

    Pages are exported one at a time into a temporary directory, concatenated
    into ``<name>/<name>.md``, and the directory is then moved under
    ``output_path``. The end offset (in characters) of each page is recorded.

    Args:
        res: Docling conversion result to export.
        output_path: directory receiving the document directory.
        page_sep: separator written after every page but the last.
        **kwargs: forwarded to ``save_as_markdown``.

    Returns:
        A ``MarkdownDoc`` whose path is the document directory name (relative
        to ``output_path``) together with the page indexes.

    Raises:
        FileExistsError: when the document directory already exists.
    """
    # TODO: Should we add a hash to avoid collision between files with same names
    #  nested in the tree structure
    md_dir_name = path_to_artifacts_dirname(res.input.file)
    md_dir = output_path / md_dir_name
    if md_dir.exists():
        raise FileExistsError(f"directory {md_dir} already exists")
    # Let's avoid issue of duplicated input file names flattened top level
    md_filename = md_dir_name + OutputFormat.MARKDOWN
    total_length = 0
    n_pages = len(res.pages)
    end_indices = []

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
        tmp_dir = Path(td)
        page_path = Path("page.md")
        # We do a chdir to bypass a Docling bug which only allows to maintain relative
        # image ref when saving the markdown to a relative path
        # Explicit UTF-8 so the output doesn't depend on the locale encoding.
        with (tmp_dir / md_filename).open("w", encoding="utf-8") as f, chdir(tmp_dir):
            for page_i in range(n_pages):
                res.document.save_as_markdown(
                    page_path,
                    page_no=page_i + 1,
                    image_mode=ImageRefMode.REFERENCED,
                    artifacts_dir=Path(ARTIFACTS),
                    **kwargs,
                )
                content = page_path.read_text(encoding="utf-8")
                # Pages after the first get a trailing newline; every page but
                # the last is followed by the page separator.
                if page_i > 0:
                    content += "\n"
                if page_i < n_pages - 1:
                    content += page_sep
                total_length += len(content)
                end_indices.append(total_length)
                f.write(content)
            f.flush()
            # With zero pages the scratch file was never created.
            page_path.unlink(missing_ok=True)
        # The whole temporary directory becomes the document directory; the
        # context manager then has nothing left to clean up.
        shutil.move(tmp_dir, md_dir)
    pages = PageIndexes.from_page_end_indices(end_indices)
    return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import gc
|
|
2
|
+
from collections.abc import AsyncGenerator, Iterable
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, ClassVar, Self
|
|
6
|
+
|
|
7
|
+
from marker.config.parser import ConfigParser
|
|
8
|
+
from marker.converters.pdf import PdfConverter
|
|
9
|
+
from marker.models import create_model_dict
|
|
10
|
+
from marker.output import text_from_rendered
|
|
11
|
+
from marker.renderers.markdown import MarkdownRenderer
|
|
12
|
+
from PIL.Image import Image
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from .constants import ARTIFACTS, CPU_GROUP
|
|
16
|
+
from .objects import (
|
|
17
|
+
InputDoc,
|
|
18
|
+
MarkdownDoc,
|
|
19
|
+
OutputFormat,
|
|
20
|
+
PageIndexes,
|
|
21
|
+
Result,
|
|
22
|
+
Status,
|
|
23
|
+
)
|
|
24
|
+
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
25
|
+
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@PipelineConfig.register()
class MarkerPipelineConfig(PipelineConfig):
    """Configuration of the Marker pipeline."""

    pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
    # NOTE(review): a ``Field(...)`` assigned to a ``ClassVar`` is presumably not
    # processed by pydantic, leaving this attribute bound to the field-info
    # object rather than to ``CPU_GROUP`` — confirm how ``PipelineConfig``
    # reads ``task_group``.
    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)

    # Raw Marker configuration, handed to ``MarkerPipeline`` as is.
    config: dict[str, Any] = {}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Exception types given to ``report_recoverable_errors``; none for now.
_MARKER_CONVERSION_ERRORS = ()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@Pipeline.register(PipelineType.MARKER)
class MarkerPipeline(Pipeline):
    """Pipeline converting PDFs with Marker's ``PdfConverter``."""

    def __init__(self, marker_config: dict[str, Any] | None = None):
        """Store the Marker configuration (an empty dict when none is given)."""
        self._marker_config = dict() if marker_config is None else marker_config

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Convert ``docs`` one by one and yield a ``Result`` for each.

        The stored configuration is deep-copied before ``output_format`` is
        written into it, so the instance configuration is left untouched.
        """
        run_config = deepcopy(self._marker_config)
        run_config["output_format"] = output_format.to_marker()
        parser = ConfigParser(run_config)
        renderer = parser.get_renderer()
        converter = PdfConverter(
            config=parser.generate_config_dict(),
            artifact_dict=create_model_dict(),
            processor_list=parser.get_processors(),
            renderer=renderer,
        )
        for input_doc in docs:
            yield _process_doc(input_doc, converter, output_format, output_path)

    @classmethod
    def _from_config(cls, config: MarkerPipelineConfig) -> Self:
        """Build the pipeline from ``config.config``."""
        return cls(config.config)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
def _process_doc(
    doc: InputDoc,
    converter: PdfConverter,
    output_format: OutputFormat,
    output_path: Path,
) -> Result:
    """Convert the file at ``doc.path`` with Marker and write its output.

    Raises:
        NotImplementedError: when ``output_format`` is not Markdown.
    """
    content, _, images = text_from_rendered(converter(str(doc.path)))
    if output_format == OutputFormat.MARKDOWN:
        output = _to_markdown_doc(doc, content, images, output_path)
    else:
        raise NotImplementedError(f"unsupported output format {output_format}")
    return Result(input=doc.without_content(), status=Status.SUCCESS, output=output)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _to_markdown_doc(
    input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
) -> MarkdownDoc:
    """Write Marker output (Markdown text and images) under ``output_path``.

    Images are saved into the artifacts sub-directory of the document
    directory. The text is split on the Marker page separator and written
    back page by page while the end offset of every page is recorded.
    """
    # TODO: Should we add a hash to avoid collision between files with same names
    #  nested in the tree structure
    md_dir_name = path_to_artifacts_dirname(input_doc.path)
    md_dir = output_path / md_dir_name
    artifacts_dir = md_dir / ARTIFACTS
    # No exist_ok: an already existing directory raises.
    artifacts_dir.mkdir(parents=True)
    for image_name, image in images.items():
        image.save(artifacts_dir / image_name)
    del images
    gc.collect()

    page_sep = MarkdownRenderer.page_separator
    page_texts = content.split(page_sep)
    last_page_i = len(page_texts) - 1
    md_path = (md_dir / md_dir_name).with_suffix(OutputFormat.MARKDOWN.value)
    written_length = 0
    end_indices = []
    with md_path.open("w", encoding="utf-8") as f:
        for page_i, page_text in enumerate(page_texts):
            # Pages after the first get a trailing newline; every page but
            # the last is followed by the page separator.
            trailer = "\n" if page_i > 0 else ""
            if page_i < last_page_i:
                trailer += page_sep
            chunk = page_text + trailer
            written_length += len(chunk)
            end_indices.append(written_length)
            f.write(chunk)
        f.flush()
    pages = PageIndexes.from_page_end_indices(end_indices)
    return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shutil
|
|
3
|
+
from collections.abc import AsyncGenerator, Callable, Iterable
|
|
4
|
+
from copy import copy
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
from functools import partial
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from tempfile import TemporaryDirectory
|
|
9
|
+
from typing import Any, ClassVar, Self
|
|
10
|
+
|
|
11
|
+
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
|
|
12
|
+
union_make as pipeline_union_make,
|
|
13
|
+
)
|
|
14
|
+
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
15
|
+
from mineru.cli.common import aio_do_parse
|
|
16
|
+
from mineru.utils.enum_class import MakeMode
|
|
17
|
+
from pydantic import Field
|
|
18
|
+
from pydantic_extra_types.language_code import LanguageAlpha2
|
|
19
|
+
|
|
20
|
+
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP, MINER_U_GROUP
|
|
21
|
+
from .objects import (
|
|
22
|
+
BaseModel,
|
|
23
|
+
ConversionOutput,
|
|
24
|
+
InputDoc,
|
|
25
|
+
OutputFormat,
|
|
26
|
+
PageIndexes,
|
|
27
|
+
Result,
|
|
28
|
+
Status,
|
|
29
|
+
)
|
|
30
|
+
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
31
|
+
from .utils import path_to_artifacts_dirname
|
|
32
|
+
|
|
33
|
+
_MINER_U_CONVERSION_ERRORS = tuple()
|
|
34
|
+
MDMakeFunction = Callable[[list, str, str], str | None]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class MinerUBackend(StrEnum):
    """MinerU backends supported by this module."""

    # Selects ``pipeline_union_make`` in ``_parse_md_make_fn``.
    PIPELINE = "pipeline"
    # Selects ``vlm_union_make`` in ``_parse_md_make_fn``.
    VLM = "vlm"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class MinerUConfig(BaseModel):
    """User-facing MinerU settings, convertible into parse keyword arguments."""

    backend: MinerUBackend = MinerUBackend.PIPELINE
    enable_formula_extraction: bool = True
    enable_table_extraction: bool = True
    # TODO: use enum or literal here
    parse_method: str = "auto"

    # Fixed keyword arguments shared by every parse call.
    default_kwargs: ClassVar[dict] = {
        "server_url": None,
        "parse_method": "auto",
        # The Markdown is not dumped directly: the middle JSON is dumped
        # instead, in order to be able to get page indexes.
        "dump_md": False,
        "dump_middle_json": True,
        "f_draw_layout_bbox": False,
        "f_draw_span_bbox": False,
        "f_dump_model_output": False,  # might be useful for debug though
        "f_dump_orig_pdf": False,
        "f_dump_content_list": False,  # might be useful for debug though
        "start_page_id": 0,
        "f_make_md_mode": MakeMode.MM_MD,
        "image_analysis": True,
        "end_page_id": None,
        "client_side_output_generation": False,
    }

    def as_parse_kwargs(self) -> dict[str, Any]:
        """Return ``default_kwargs`` overlaid with this instance's settings.

        A new dict is returned on every call; ``default_kwargs`` itself is
        never modified.
        """
        return {
            **self.default_kwargs,
            "backend": self.backend,
            "parse_method": self.parse_method,
            "formula_enable": self.enable_formula_extraction,
            "table_enable": self.enable_table_extraction,
        }
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@PipelineConfig.register()  # noqa: F821
class MinerUPipelineConfig(PipelineConfig):  # noqa: F821
    """Configuration of the MinerU pipeline."""

    pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
    # NOTE(review): a ``Field(...)`` assigned to a ``ClassVar`` is presumably not
    # processed by pydantic, leaving this attribute bound to the field-info
    # object rather than to ``MINER_U_GROUP`` — confirm how ``PipelineConfig``
    # reads ``task_group``.
    task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)

    # MinerU settings, passed to ``MinerUPipeline`` by ``_from_config``.
    config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
    # Two-letter language code; the same value is sent for every document.
    language: LanguageAlpha2 = Field(frozen=True, default="en")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@Pipeline.register(PipelineType.MINER_U)
class MinerUPipeline(Pipeline):
    """Pipeline converting PDFs with MinerU's ``aio_do_parse``."""

    def __init__(self, config: MinerUConfig, language: str):
        """Store the settings and pick the Markdown builder for the backend."""
        self._config = config
        self._language = language
        self._md_make_fn = _parse_md_make_fn(config.backend)

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Parse all ``docs`` in one MinerU call, then yield one ``Result`` each.

        MinerU writes into a temporary working directory which only lives for
        the duration of the iteration.
        """
        docs = list(docs)
        # TODO: exclude files which are not pdf and return an error
        pdfs_bytes = [input_doc.path.read_bytes() for input_doc in docs]
        pdfs_names = [input_doc.path.name for input_doc in docs]
        # Same language for every document.
        p_lang_list = [self._language] * len(pdfs_names)
        # TODO: we should only process valid PDFs
        with TemporaryDirectory(prefix="mineru-") as tmp:
            workdir = Path(tmp)
            await aio_do_parse(
                output_dir=workdir,
                pdf_file_names=pdfs_names,
                pdf_bytes_list=pdfs_bytes,
                p_lang_list=p_lang_list,
                **self._config.as_parse_kwargs(),
            )
            # All result directories are resolved before the first yield.
            res_paths = [
                _revert_mineru_output(workdir, pdf_filename=name)
                for name in pdfs_names
            ]
            for input_doc, res_path in zip(docs, res_paths, strict=True):
                yield _process_doc(
                    input_doc,
                    md_make_fn=self._md_make_fn,
                    res_path=res_path,
                    output_format=output_format,
                    output_path=output_path,
                )

    @classmethod
    def _from_config(cls, config: MinerUPipelineConfig) -> Self:
        """Build the pipeline from ``config.config`` and ``config.language``."""
        return cls(config.config, language=config.language)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
    """Return the single result directory found for ``pdf_filename``.

    Args:
        output_dir: directory holding one sub-directory per parsed PDF.
        pdf_filename: name of the PDF whose result directory is wanted.

    Returns:
        The only directory found inside ``output_dir / pdf_filename``.

    Raises:
        FileNotFoundError: when ``output_dir / pdf_filename`` doesn't exist.
        ValueError: when it doesn't contain exactly one directory.
    """
    output_path = output_dir / pdf_filename
    if not output_path.exists():
        msg = f"couldn't find result for {pdf_filename}"
        raise FileNotFoundError(msg)
    dirs = [p for p in output_path.iterdir() if p.is_dir()]
    if len(dirs) != 1:
        msg = f"expected exactly one result directory, found: {dirs}"
        raise ValueError(msg)
    # Entries yielded by iterdir() already start with ``output_path``; joining
    # them to ``output_dir`` again would duplicate the prefix whenever
    # ``output_dir`` is relative.
    return dirs[0]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
    """Return the ``union_make`` function matching ``backend``.

    Raises:
        ValueError: when ``backend`` is neither the pipeline nor the VLM backend.
    """
    if backend == MinerUBackend.PIPELINE:
        return pipeline_union_make
    if backend == MinerUBackend.VLM:
        return vlm_union_make
    raise ValueError(f"Unsupported backend: {backend}")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _process_doc(
|
|
151
|
+
doc: InputDoc,
|
|
152
|
+
*,
|
|
153
|
+
md_make_fn: MDMakeFunction,
|
|
154
|
+
res_path: Path,
|
|
155
|
+
output_format: OutputFormat,
|
|
156
|
+
output_path: Path,
|
|
157
|
+
) -> Result:
|
|
158
|
+
md_dir_name = path_to_artifacts_dirname(doc.path)
|
|
159
|
+
md_dir = Path(output_path) / md_dir_name
|
|
160
|
+
md_dir.mkdir(parents=True, exist_ok=False)
|
|
161
|
+
artifacts_dir = md_dir / ARTIFACTS
|
|
162
|
+
md_path = (md_dir / md_dir_name).with_suffix(OutputFormat.MARKDOWN.value)
|
|
163
|
+
# Fail early
|
|
164
|
+
match output_format:
|
|
165
|
+
case OutputFormat.MARKDOWN:
|
|
166
|
+
im_rel_dir = artifacts_dir.relative_to(md_dir)
|
|
167
|
+
dump_content_fn = partial(
|
|
168
|
+
_dump_md_content,
|
|
169
|
+
md_make_fn=md_make_fn,
|
|
170
|
+
output_path=output_path,
|
|
171
|
+
md_path=md_path,
|
|
172
|
+
im_dir=im_rel_dir,
|
|
173
|
+
)
|
|
174
|
+
case _:
|
|
175
|
+
raise NotImplementedError(f"unsupported output format {output_format}")
|
|
176
|
+
middle_json_path = res_path / f"{doc.path.name}_middle.json"
|
|
177
|
+
middle_json = json.loads(middle_json_path.read_text())
|
|
178
|
+
pdf_info = middle_json["pdf_info"]
|
|
179
|
+
shutil.move(res_path / "images", artifacts_dir)
|
|
180
|
+
output = dump_content_fn(pdf_info)
|
|
181
|
+
input_doc = doc.without_content()
|
|
182
|
+
return Result(input=input_doc, status=Status.SUCCESS, output=output)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _dump_md_content(
    pdf_info: list[dict],
    *,
    md_make_fn: MDMakeFunction,
    page_sep: str = DEFAULT_MD_PAGE_SEP,
    output_path: Path,
    md_path: Path,
    im_dir: Path,
    md_make_mode: str = MakeMode.MM_MD,
) -> ConversionOutput:
    """Write the Markdown of every page into ``md_path`` and index the pages.

    Args:
        pdf_info: one entry per page, as found in the MinerU middle JSON.
        md_make_fn: function rendering a list of pages into Markdown.
        page_sep: separator written after every page but the last.
        output_path: root output directory; the returned path is relative to it.
        md_path: Markdown file to write.
        im_dir: image directory passed to ``md_make_fn`` (as a string).
        md_make_mode: make mode passed to ``md_make_fn``.

    Returns:
        A ``ConversionOutput`` holding the directory of ``md_path`` relative
        to ``output_path`` and the page indexes.
    """
    total_length = 0
    end_indices = []
    n_pages = len(pdf_info)
    # Explicit UTF-8 so the output doesn't depend on the locale encoding.
    with md_path.open("w", encoding="utf-8") as f:
        for page_i, page in enumerate(pdf_info):
            content = md_make_fn([page], md_make_mode, str(im_dir))
            # Pages after the first get a trailing newline; every page but
            # the last is followed by the page separator.
            if page_i > 0:
                content += "\n"
            if page_i < n_pages - 1:
                content += page_sep
            total_length += len(content)
            end_indices.append(total_length)
            f.write(content)
        f.flush()
    pages = PageIndexes.from_page_end_indices(end_indices)
    relative_dir = md_path.parent.relative_to(output_path)
    return ConversionOutput(path=relative_dir, pages=pages)
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import traceback
|
|
6
|
+
import uuid
|
|
7
|
+
from abc import ABC
|
|
8
|
+
from enum import StrEnum
|
|
9
|
+
from functools import cache
|
|
10
|
+
from io import BytesIO
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Annotated, Any, NoReturn, Self
|
|
13
|
+
|
|
14
|
+
from icij_common.pydantic_utils import (
|
|
15
|
+
icij_config,
|
|
16
|
+
merge_configs,
|
|
17
|
+
no_enum_values_config,
|
|
18
|
+
safe_copy,
|
|
19
|
+
)
|
|
20
|
+
from pydantic import AfterValidator, RootModel, TypeAdapter
|
|
21
|
+
from pydantic import BaseModel as _BaseModel
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from docling.datamodel.base_models import ConversionStatus, ErrorItem, InputFormat
|
|
25
|
+
from docling.datamodel.document import InputDocument
|
|
26
|
+
from docling_core.types.io import DocumentStream
|
|
27
|
+
except ImportError:
|
|
28
|
+
ConversionStatus, ErrorItem, InputFormat = None, None, None
|
|
29
|
+
InputDocument = None
|
|
30
|
+
DocumentStream = None
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
# Pydantic configuration used by ``BaseModel`` below: the result of merging
# ``icij_config()`` with ``no_enum_values_config()``.
base_config = merge_configs(icij_config(), no_enum_values_config())
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class BaseModel(_BaseModel):
    """Pydantic base model of the package, configured with ``base_config``."""

    model_config = base_config
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SupportedExt(StrEnum):
|
|
41
|
+
PDF = ".pdf"
|
|
42
|
+
|
|
43
|
+
def to_docling(self) -> InputFormat:
|
|
44
|
+
return InputFormat(self.value[1:])
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class OutputFormat(StrEnum):
|
|
48
|
+
MARKDOWN = ".md"
|
|
49
|
+
|
|
50
|
+
@property
|
|
51
|
+
def suffix(self) -> str:
|
|
52
|
+
return self.value[1:]
|
|
53
|
+
|
|
54
|
+
def to_marker(self) -> str:
|
|
55
|
+
match self:
|
|
56
|
+
case OutputFormat.MARKDOWN:
|
|
57
|
+
return "markdown"
|
|
58
|
+
case _:
|
|
59
|
+
raise ValueError(f"{self} is unsupported by marker")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class Status(StrEnum):
|
|
63
|
+
FAILURE = "failure"
|
|
64
|
+
SUCCESS = "success"
|
|
65
|
+
PARTIAL_SUCCESS = "partial_success"
|
|
66
|
+
|
|
67
|
+
@classmethod
|
|
68
|
+
def from_docling(cls, v: Any) -> Self:
|
|
69
|
+
from docling.datamodel.base_models import ConversionStatus # noqa: PLC0415
|
|
70
|
+
|
|
71
|
+
if v is ConversionStatus.SUCCESS:
|
|
72
|
+
return cls.SUCCESS
|
|
73
|
+
if v is ConversionStatus.PARTIAL_SUCCESS:
|
|
74
|
+
return cls.PARTIAL_SUCCESS
|
|
75
|
+
if isinstance(v, ConversionStatus):
|
|
76
|
+
return cls.FAILURE
|
|
77
|
+
raise TypeError(f"can't convert {v!r} to {cls.__name__!r}")
|
|
78
|
+
|
|
79
|
+
@property
|
|
80
|
+
def allows_conversion(self) -> bool:
|
|
81
|
+
return self is Status.SUCCESS or self is Status.PARTIAL_SUCCESS
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class Error(BaseModel):
    """Serializable description of an error met during a conversion."""

    # Dashed lower-case title followed by a random hex suffix.
    id: str
    title: str
    detail: str

    @classmethod
    def from_exception(cls, exception: BaseException) -> Self:
        """Build an error from an exception.

        The title is the exception class name; the detail is the exception
        message followed by its formatted traceback.
        """
        title = exception.__class__.__name__
        formatted_trace = "".join(
            traceback.format_exception(
                None, value=exception, tb=exception.__traceback__
            )
        )
        detail = f"{exception}\n{formatted_trace}"
        return cls(
            id=f"{_id_title(title)}-{uuid.uuid4().hex}", title=title, detail=detail
        )

    @classmethod
    def from_docling(cls, docling_error: ErrorItem) -> Self:
        """Build an error from a Docling ``ErrorItem``.

        The detail names the failing module and component type, followed by
        the Docling error message.
        """
        title = "DoclingConversionError"
        detail = (
            f"error in module {docling_error.module_name} of"
            f" {docling_error.component_type}:\n{docling_error.error_message}"
        )
        return cls(
            id=f"{_id_title(title)}-{uuid.uuid4().hex}", title=title, detail=detail
        )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _id_title(title: str) -> str:
    """Convert a CamelCase title into a lower-case, dash-separated identifier.

    A dash is inserted before every upper-case letter except the very first
    character, e.g. ``"ValueError"`` becomes ``"value-error"``.
    """
    pieces = (
        f"-{char.lower()}" if position and char.isupper() else char.lower()
        for position, char in enumerate(title)
    )
    return "".join(pieces)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class InputDoc(BaseModel):
    """A document to convert, given by its path and optionally its bytes."""

    ext: SupportedExt
    path: Path
    # Raw bytes of the document; when set they are used instead of the file.
    content: bytes | None = None

    @classmethod
    def from_path(cls, path: str | Path) -> Self:
        """Build an input document from a file path.

        The extension is matched case-insensitively, so ``report.PDF`` is
        accepted just like ``report.pdf``.

        Raises:
            ValueError: when the file extension is not a supported one.
        """
        path = Path(path)
        # Extensions are commonly upper-cased (e.g. by scanners); normalize
        # before the enum lookup.
        ext = SupportedExt(path.suffix.lower())
        return cls(path=path, ext=ext)

    def to_docling(self) -> Path | DocumentStream:
        """Return the Docling input for this document.

        A ``DocumentStream`` is returned when in-memory content is available,
        or when the path has no suffix (the file is then read into memory);
        otherwise the path itself is returned.
        """
        if self.content is not None:
            return DocumentStream(name=str(self.path), stream=BytesIO(self.content))
        if not self.path.suffix:
            return DocumentStream(
                name=str(self.path), stream=BytesIO(self.path.read_bytes())
            )
        return self.path

    def without_content(self) -> Self:
        """Return a copy of this document with ``content`` set to ``None``."""
        return safe_copy(self, update={"content": None})
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class PageIndexes(RootModel[list[tuple[int, int]]]):
    """List of ``(start, end)`` index pairs, one pair per page."""

    @classmethod
    def from_page_end_indices(cls, lengths: list[int]) -> Self:
        """Build the page index pairs from the end index of each page.

        Args:
            lengths: end index of each page, in page order.

        Returns:
            A ``PageIndexes`` where the first page starts at ``0`` and every
            following page starts at the end index of the previous one.
        """
        # Each page starts where the previous one ended
        starts = [0, *lengths[:-1]]
        # Wrap the pairs in the model so the declared ``Self`` return type is
        # honoured instead of leaking a bare list
        return cls(list(zip(starts, lengths)))
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class ConversionOutput(BaseModel):
    """Output of a document conversion."""

    # Location of the converted output
    path: Path
    # Page index pairs; defaults to an empty ``PageIndexes`` rather than a bare
    # list so that the default value has the declared field type
    pages: PageIndexes = PageIndexes([])
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class MarkdownDoc(ConversionOutput):
    """Conversion output specialization (presumably for Markdown documents)."""

    # NOTE(review): stacking ``classmethod`` on top of ``property`` was
    # deprecated in Python 3.11 and removed in 3.13 — confirm this attribute
    # still resolves to a set on every supported interpreter.
    @classmethod
    @property
    @cache
    def _valid_conversion_statuses(cls) -> set[ConversionStatus]:
        """Docling conversion statuses considered as valid."""
        # Imported lazily: docling is only installed with the "docling" extra
        from docling.datamodel.base_models import ConversionStatus  # noqa: PLC0415

        valid_statuses = {ConversionStatus.PARTIAL_SUCCESS, ConversionStatus.SUCCESS}
        return valid_statuses
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _input_should_not_have_content(value: InputDoc) -> InputDoc:
|
|
170
|
+
if value.content is not None:
|
|
171
|
+
raise ValueError(f"response input can't have content, but got {value}")
|
|
172
|
+
return value
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class _BaseResult(BaseModel, ABC):
    """Fields shared by the result models: input, status and errors."""

    # Document the result relates to
    input: InputDoc
    # Processing status of the document
    status: Status
    # Errors collected while processing the document, empty by default
    errors: list[Error] = []
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class Result(_BaseResult):
    """Result of processing a document, holding the conversion output if any."""

    # TODO: we could also use generics here when we add more output formats
    # ``None`` when no output was produced, e.g. for failed conversions
    output: ConversionOutput | None

    def to_response(self) -> ResponseResult:
        """Convert this result into its response counterpart.

        The input is stripped from its content and only the output path is
        kept. When there is no output, ``output_path`` is left to ``None``
        instead of failing on the missing output.
        """
        # Failed results carry no output, hence no path to report
        output_path = None if self.output is None else self.output.path
        return ResponseResult(
            input=self.input.without_content(),
            status=self.status,
            errors=self.errors,
            output_path=output_path,
        )


class ResponseResult(_BaseResult):
    """Result returned to callers: content-less input and output location."""

    # Validated so that the document content is never part of a response
    input: Annotated[InputDoc, AfterValidator(func=_input_should_not_have_content)]
    # Location of the conversion output, ``None`` when no output was produced
    output_path: Path | None = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class ExtractionResponse(BaseModel):
    """Response model wrapping a list of ``ResponseResult``."""

    # One entry per reported result
    results: list[ResponseResult]
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# Validates a list whose items are either ``InputDoc`` objects or paths
_INPUT_DOCS_ADAPTER = TypeAdapter(list[InputDoc | Path])
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def parse_extraction_request(
    docs: str | list[dict | str], *, data_dir: Path
) -> list[InputDoc]:
    """Parse the documents of an extraction request into ``InputDoc`` objects.

    Args:
        docs: either the name of a directory located under ``data_dir``, which
            is explored recursively, or a list of items validated as
            ``InputDoc`` objects or paths.
        data_dir: directory used to resolve ``docs`` when it is a directory
            name, and against which listed paths are made relative.

    Returns:
        The parsed input documents, in request order for list requests.

    Raises:
        ValueError: if some listed paths have no extension.
    """
    if isinstance(docs, str):
        docs_dir = Path(data_dir) / docs
        # Log the directory which is actually explored
        logger.debug("exploring files in %s", docs_dir.absolute())
        # NOTE(review): paths found here are not made relative to ``data_dir``,
        # unlike the paths listed explicitly below — confirm this is intended
        found = _as_input_docs(docs_dir)
        msg = "found %s"
        if len(found) > 10:
            msg += ", and more..."
        # Only the first documents are logged to keep the log readable
        logger.debug(msg, found[:10])
        return found
    parsed = _INPUT_DOCS_ADAPTER.validate_python(docs)
    input_docs = []
    unknown_exts = []
    # Each item is handled according to its own type, so that lists mixing
    # paths and input docs are parsed consistently
    for doc in parsed:
        if not isinstance(doc, Path):
            input_docs.append(doc)
            continue
        _, ext = os.path.splitext(str(doc))
        if not ext:
            unknown_exts.append(doc)
            continue
        input_docs.append(InputDoc.from_path(path=doc.relative_to(data_dir)))
    if unknown_exts:
        raise ValueError(f"found files with unknown extensions {unknown_exts}")
    return input_docs
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _raise(err: OSError) -> NoReturn:
    """Re-raise ``err``; used as ``os.walk`` ``onerror`` callback in this module."""
    raise err
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _as_input_docs(
    docs_dir: Path, *, supported_ext: set[str] | None = None
) -> list[InputDoc]:
    """Recursively collect the supported documents found under ``docs_dir``.

    Files without extension or with an extension missing from
    ``supported_ext`` (all ``SupportedExt`` values by default) are skipped.
    Errors met while walking the directory are raised. Documents are returned
    sorted by path.
    """
    allowed = (
        supported_ext
        if supported_ext is not None
        else {v.value for v in SupportedExt}
    )
    found = [
        InputDoc.from_path(path=Path(dirpath) / filename)
        for dirpath, _, filenames in os.walk(docs_dir, onerror=_raise)
        for filename in filenames
        if (suffix := Path(filename).suffix) and suffix in allowed
    ]
    found.sort(key=lambda doc: doc.path)
    return found
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import AsyncGenerator, Iterable
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import ClassVar
|
|
6
|
+
|
|
7
|
+
from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_values_config
|
|
8
|
+
from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from .objects import InputDoc, OutputFormat, Result
|
|
12
|
+
|
|
13
|
+
# Alias: structured content is represented as a plain string
StructuredContent = str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PipelineType(StrEnum):
    """Identifiers of the available pipeline kinds."""

    DOCLING = "docling"
    MARKER = "marker"
    MINER_U = "miner_u"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PipelineConfig(RegistrableConfig, ABC):
    """Abstract base configuration of pipelines, selected by ``pipeline``."""

    # TODO: move this icij_config() to RegistrableConfig
    model_config = merge_configs(icij_config(), no_enum_values_config())

    # NOTE(review): ``ClassVar`` combined with ``Field(...)`` presumably follows
    # the ``RegistrableConfig`` convention — confirm how it is read
    registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
    # Kind of pipeline described by this configuration
    pipeline: PipelineType

    # No default here: presumably set by concrete configurations — confirm
    task_group: ClassVar[str] = Field(frozen=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Pipeline(RegistrableFromConfig, ABC):
    """Abstract base class of content extraction pipelines."""

    # Extracts the content of ``docs`` in the requested ``output_format``.
    # Annotated as an async generator of ``Result``; ``output_path`` is
    # presumably where outputs get written — confirm against implementations.
    @abstractmethod
    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]: ...
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from collections.abc import Callable, Generator, Iterable, Iterator
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from functools import wraps
|
|
5
|
+
from itertools import tee
|
|
6
|
+
from pathlib import Path, PurePath
|
|
7
|
+
from typing import Protocol, TypeVar
|
|
8
|
+
|
|
9
|
+
from .objects import Error, InputDoc, Result, Status
|
|
10
|
+
|
|
11
|
+
# Generic type variables used in this module's annotations
R = TypeVar("R")
T = TypeVar("T")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def map_and_preserve(
    fn: Callable[[Iterable[T]], Iterator[R]], inputs: Iterable[T]
) -> tuple[Iterable[T], Iterator[R]]:
    """Apply ``fn`` to ``inputs`` while keeping an independent view of the inputs.

    ``inputs`` is duplicated with ``itertools.tee``: one copy is handed to
    ``fn``, the other one is returned next to an iterator over ``fn`` outputs.
    """
    preserved, consumed = tee(inputs, 2)
    return preserved, iter(fn(consumed))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def all_subclasses(cls: type[T]) -> set[type[T]]:
    """Return the direct and indirect subclasses of ``cls`` (``cls`` excluded)."""
    found = set()
    for child in cls.__subclasses__():
        found.add(child)
        # Recurse to collect the descendants of each direct subclass
        found |= all_subclasses(child)
    return found
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def path_to_artifacts_dirname(path: PurePath, sep: str = "_") -> str:
    """Derive an artifacts directory name from a file path.

    The name is made of the file name without its last suffix, followed by
    ``sep`` and the suffix without its leading dot, e.g. ``report.pdf`` gives
    ``report_pdf``. Files without suffix keep their name unchanged.

    Args:
        path: path of the file, only its final component is used.
        sep: separator inserted between the file stem and its extension.

    Returns:
        The directory name.
    """
    # ``stem`` is used rather than slicing the name by the suffix length: with
    # an empty suffix the slice ``name[:-0]`` would yield an empty string
    dirname = path.stem
    ext = path.suffix
    if ext:
        dirname += sep + ext[1:]
    return dirname
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DocProcessingFn(Protocol):
    """Callable taking a document first, plus extra arguments, and returning a ``Result``."""

    def __call__(self, doc: InputDoc, *arg, **kwargs) -> Result: ...
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def report_recoverable_errors(
    recoverable_errors: tuple[type[Exception], ...] = tuple(),
) -> Callable[[DocProcessingFn], DocProcessingFn]:
    """Create a decorator turning recoverable exceptions into failed results.

    When the decorated function raises one of ``recoverable_errors``, a
    ``Result`` with a ``FAILURE`` status, the error built from the exception,
    the content-less input document and no output is returned instead. Other
    exceptions propagate untouched.
    """

    def decorate(func: DocProcessingFn) -> DocProcessingFn:
        @wraps(func)
        def wrapper(doc: InputDoc, *args, **kwargs) -> Result:
            try:
                result = func(doc, *args, **kwargs)
            except recoverable_errors as exc:
                failure = Error.from_exception(exc)
                result = Result(
                    input=doc.without_content(),
                    status=Status.FAILURE,
                    errors=[failure],
                    output=None,
                )
            return result

        return wrapper

    return decorate
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@contextmanager
def chdir(path: Path) -> Generator[None, None, None]:
    """Temporarily make ``path`` the current working directory.

    The working directory captured on entry is restored on exit, even when the
    managed block raises.
    """
    previous_dir = Path.cwd()
    try:
        os.chdir(path)
        yield
    finally:
        # Always go back to where we started from
        os.chdir(previous_dir)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "extract-python"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Structured content extraction"
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "Clément Doumouro", email = "cdoumouro@icij.org" },
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.11,<3.14"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"icij-common~=0.8.2",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[tool.hatch.build.targets.sdist]
|
|
15
|
+
include = ["extract_python"]
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
benches = [
|
|
19
|
+
"html2image~=2.0.7",
|
|
20
|
+
"markdown2>=2.5.4",
|
|
21
|
+
"notebook>=7.4.5",
|
|
22
|
+
"pypdfium2>=4.30.0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
docling = [
|
|
26
|
+
"docling-slim[standard,feat-ocr-easyocr,feat-ocr-tesserocr,feat-ocr-mac]~=2.96",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
marker = [
|
|
30
|
+
"marker-pdf~=1.10",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
mineru = [
|
|
34
|
+
"mineru[pipeline,vlm]~=3.2",
|
|
35
|
+
"mineru[mlx]~=3.2; sys_platform == 'darwin'",
|
|
36
|
+
"pydantic-extra-types[pycountry]~=2.11",
|
|
37
|
+
"six~=1.17",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[tool.uv]
|
|
41
|
+
conflicts = [
|
|
42
|
+
[
|
|
43
|
+
{ extra = "marker" },
|
|
44
|
+
{ extra = "mineru" },
|
|
45
|
+
],
|
|
46
|
+
]
|
|
47
|
+
required-environments = [
|
|
48
|
+
"sys_platform == 'darwin' and platform_machine == 'arm64'",
|
|
49
|
+
"sys_platform == 'linux'",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
[dependency-groups]
|
|
54
|
+
dev = [
|
|
55
|
+
"pytest~=8.3.5",
|
|
56
|
+
"pytest-asyncio~=0.25.3",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
[project.urls]
|
|
60
|
+
Homepage = "https://github.com/ICIJ/extract-python"
|
|
61
|
+
Repository = "https://github.com/ICIJ/extract-python"
|
|
62
|
+
Issues = "https://github.com/ICIJ/extract-python/issues"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
[tool.pytest.ini_options]
|
|
66
|
+
asyncio_mode = "auto"
|
|
67
|
+
asyncio_default_fixture_loop_scope = "session"
|
|
68
|
+
log_cli = true
|
|
69
|
+
log_cli_level = "INFO"
|
|
70
|
+
log_cli_format = "[%(levelname)s][%(asctime)s.%(msecs)03d][%(name)s]: %(message)s"
|
|
71
|
+
markers = [
|
|
72
|
+
"integration: integration test",
|
|
73
|
+
"miner_u: minerU test which require conflicting extras",
|
|
74
|
+
]
|
|
75
|
+
|
|
76
|
+
[build-system]
|
|
77
|
+
requires = ["hatchling"]
|
|
78
|
+
build-backend = "hatchling.build"
|