extract-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ from .objects import InputDoc, OutputFormat, Status
2
+ from .pipeline import Pipeline, PipelineConfig, PipelineType
3
+
4
# Each backend is imported defensively: when its module cannot be imported the
# public names are bound to None so that importing the package still succeeds.
try:
    from .docling_ import (
        DOCLING_DEFAULT_ARTIFACTS_PATH,
        DoclingPipeline,
        DoclingPipelineConfig,
    )
except ImportError:
    # The fallback previously bound a misspelled DOCKING_DEFAULT_ARTIFACTS_PATH,
    # leaving the exported DOCLING_DEFAULT_ARTIFACTS_PATH undefined.
    DOCLING_DEFAULT_ARTIFACTS_PATH = None
    DoclingPipeline = None
    DoclingPipelineConfig = None

try:
    from .marker_ import MarkerPipeline, MarkerPipelineConfig
except ImportError:
    MarkerPipeline, MarkerPipelineConfig = None, None


try:
    from .miner_u import MinerUPipeline, MinerUPipelineConfig
except ImportError:
    MinerUPipeline, MinerUPipelineConfig = None, None
27
+
28
+
29
# Names re-exported by the package. The MinerU names are imported by this module
# alongside the Docling and Marker ones, so they are exported the same way.
__all__ = [
    "DoclingPipeline",
    "DoclingPipelineConfig",
    "InputDoc",
    "DOCLING_DEFAULT_ARTIFACTS_PATH",
    "MarkerPipeline",
    "MarkerPipelineConfig",
    "MinerUPipeline",
    "MinerUPipelineConfig",
    "OutputFormat",
    "Pipeline",
    "PipelineType",
    "PipelineConfig",
    "Status",
]
@@ -0,0 +1,6 @@
1
+ ARTIFACTS = "artifacts"
2
+ CPU_GROUP = "cpu"
3
+ MINER_U_GROUP = "miner-u"
4
+ EXTRACT_CONTENT_TASK = "extract-content"
5
+ EXTRACT_CONTENT_MINER_U_TASK = "extract-content-miner-u"
6
+ DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
@@ -0,0 +1,233 @@
1
+ import shutil
2
+ import tempfile
3
+ from collections.abc import AsyncGenerator, Iterable, Iterator
4
+ from functools import cache
5
+ from pathlib import Path
6
+ from typing import Any, ClassVar, Literal, TypeVar
7
+
8
+ from docling.backend.abstract_backend import AbstractDocumentBackend
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import ConversionResult
11
+ from docling.datamodel.pipeline_options import (
12
+ EasyOcrOptions,
13
+ PdfPipelineOptions,
14
+ PipelineOptions,
15
+ VlmPipelineOptions,
16
+ )
17
+ from docling.document_converter import DocumentConverter, FormatOption
18
+ from docling.models.factories import get_ocr_factory
19
+ from docling.pipeline.base_pipeline import BasePipeline
20
+ from docling_core.types.doc import ImageRefMode
21
+ from docling_core.types.io import DocumentStream
22
+ from icij_common.registrable import FromConfig
23
+ from pydantic import Field, model_validator
24
+
25
+ from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
26
+ from .objects import (
27
+ BaseModel,
28
+ Error,
29
+ InputDoc,
30
+ MarkdownDoc,
31
+ OutputFormat,
32
+ PageIndexes,
33
+ Result,
34
+ Status,
35
+ )
36
+ from .pipeline import Pipeline, PipelineConfig, PipelineType
37
+ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
38
+
39
+ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
40
+
41
+
42
class _PdfPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options with picture generation forced on.

    ``ocr_options`` may be supplied as a plain dict carrying a ``kind`` key; it is
    converted into the matching options object through docling's OCR factory.
    """

    generate_picture_images: bool = Field(default=True, frozen=True)

    @model_validator(mode="before")
    @classmethod
    def validate_ocr_options(cls, data: Any) -> Any:
        """Turn a dict-valued ``ocr_options`` entry into an OCR options object.

        Input that is not a dict, or whose ``ocr_options`` is not a dict, is
        returned untouched.

        Raises:
            KeyError: if the ``ocr_options`` dict has no ``kind`` entry.
        """
        if not isinstance(data, dict):
            return data
        ocr_options = data.get("ocr_options")
        if not isinstance(ocr_options, dict):
            return data
        # Work on copies: a "before" validator receives the caller's raw input,
        # which must not lose its "kind" key or be rewritten as a side effect.
        ocr_options = dict(ocr_options)
        allow_external_plugins = ocr_options.get("allow_external_plugins", False)
        ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
        kind = ocr_options.pop("kind")
        converted = ocr_factory.create_options(kind=kind, **ocr_options)
        return {**data, "ocr_options": converted}
57
+
58
+
59
+ OptionsByPipeline = list[
60
+ tuple[Literal["pdf"], _PdfPipelineOptions]
61
+ | tuple[Literal["vlm"], VlmPipelineOptions]
62
+ ]
63
+
64
+
65
def _default_pipeline_options() -> OptionsByPipeline:
    """Return the default ``(pipeline kind, options)`` pairs.

    The PDF entry uses EasyOCR options; the VLM entry uses the stock VLM options.
    """
    pdf_options = _PdfPipelineOptions(ocr_options=EasyOcrOptions())
    vlm_options = VlmPipelineOptions()
    return [("pdf", pdf_options), ("vlm", vlm_options)]
68
+
69
+
70
class DoclingFormatOption(BaseModel):
    """Format option described by the class names of its pipeline and backend."""

    pipeline_cls: str
    backend_cls: str

    def to_docling(
        self, pipeline_options: dict[Literal["pdf", "vlm"], PipelineOptions]
    ) -> FormatOption:
        """Resolve the class names and build the docling ``FormatOption``.

        The options entry is selected by looking for "vlm", then "pdf", in the
        lower-cased pipeline class name; a missing entry is passed on as ``None``.

        Raises:
            ValueError: if a class name is unknown, or if the pipeline class name
                contains neither "vlm" nor "pdf".
        """
        resolved_pipeline = _find_subcls(BasePipeline, self.pipeline_cls)
        resolved_backend = _find_subcls(AbstractDocumentBackend, self.backend_cls)
        lowered_name = self.pipeline_cls.lower()
        if "vlm" in lowered_name:
            options_key, options_model = "vlm", VlmPipelineOptions
        elif "pdf" in lowered_name:
            options_key, options_model = "pdf", _PdfPipelineOptions
        else:
            raise ValueError(
                f"invalid pipeline_cls: {resolved_pipeline}, expected a VLM or PDF pipeline"
            )
        selected = pipeline_options.get(options_key)
        if selected is not None:
            selected = options_model.model_validate(selected)
        return FormatOption(
            pipeline_cls=resolved_pipeline,
            pipeline_options=selected,
            backend=resolved_backend,
        )
96
+
97
+
98
@cache
def _cached_default_format_options() -> dict[InputFormat, DoclingFormatOption]:
    """Compute once the format options of a stock ``DocumentConverter`` (PDF only)."""
    supported_fmt = {InputFormat.PDF}
    return {
        fmt: DoclingFormatOption(
            pipeline_cls=opt.pipeline_cls.__name__, backend_cls=opt.backend.__name__
        )
        for fmt, opt in DocumentConverter().format_to_options.items()
        if fmt in supported_fmt
    }


def _default_format_options() -> dict[InputFormat, DoclingFormatOption]:
    """Return the default format options as a fresh dict.

    The computation stays cached, but each caller gets its own dict: this function
    is used as a field ``default_factory``, and handing out the cached dict itself
    would let one config's edits leak into every later default.
    """
    return dict(_cached_default_format_options())
108
+
109
+
110
+ T = TypeVar("T")
111
+
112
+
113
def _find_subcls(cls: type[T], name: str) -> type[T]:
    """Return the direct or indirect subclass of ``cls`` whose ``__name__`` is ``name``.

    Raises:
        ValueError: if no subclass carries that name.
    """
    found = next(
        (candidate for candidate in all_subclasses(cls) if candidate.__name__ == name),
        None,
    )
    if found is None:
        raise ValueError(f"unknown {cls.__name__} subclass {name}")
    return found
118
+
119
+
120
@PipelineConfig.register()
class DoclingPipelineConfig(PipelineConfig):
    """Configuration of the Docling pipeline."""

    pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
    # NOTE(review): a ClassVar bound to a pydantic Field, same pattern as the base
    # class; presumably consumed by the registry helpers - confirm before changing.
    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)

    pipeline_options: OptionsByPipeline = Field(
        default_factory=_default_pipeline_options
    )
    format_options: dict[InputFormat, DoclingFormatOption] = Field(
        default_factory=_default_format_options
    )

    def to_format_options(self) -> dict[InputFormat, FormatOption]:
        """Convert every configured format option into a docling ``FormatOption``."""
        options_by_kind = dict(self.pipeline_options)
        converted = {}
        for raw_format, option in self.format_options.items():
            input_format = InputFormat(raw_format)
            converted[input_format] = option.to_docling(options_by_kind)
        return converted
138
+
139
+
140
+ DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
141
+
142
+
143
@Pipeline.register(PipelineType.DOCLING)
class DoclingPipeline(Pipeline):
    """Extraction pipeline backed by a docling ``DocumentConverter``."""

    def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
        """Create the converter, using ``DEFAULT_FORMAT_OPTIONS`` when none are given."""
        effective_options = (
            DEFAULT_FORMAT_OPTIONS if format_options is None else format_options
        )
        self._converter = DocumentConverter(format_options=effective_options)

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Convert ``docs`` and yield one ``Result`` per input document, in order."""
        # The inputs are teed so each conversion can be paired with its source doc
        docs, docling_inputs = map_and_preserve(_to_docling, docs)
        conversions = self._converter.convert_all(
            docling_inputs, raises_on_error=False
        )
        for source_doc, conversion in zip(docs, conversions, strict=True):
            yield _to_result(
                conversion, source_doc, output_format, output_path=output_path
            )

    @classmethod
    def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
        """Build the pipeline from the format options of ``config``."""
        return cls(config.to_format_options())
161
+
162
+
163
def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
    """Lazily yield the docling representation of each input document."""
    yield from (doc.to_docling() for doc in docs)
166
+
167
+
168
def _to_result(
    res: ConversionResult,
    input_document: InputDoc,
    output_format: OutputFormat,
    output_path: Path,
    **kwargs,
) -> Result:
    """Turn a docling conversion result into a ``Result``.

    ``output_path`` is created if needed. An output is only produced when the
    converted status allows conversion; otherwise ``output`` stays ``None``.
    Extra keyword arguments are forwarded to the markdown writer.

    Raises:
        NotImplementedError: if conversion is allowed but the format is not markdown.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    status = Status.from_docling(res.status)
    output = None
    if status.allows_conversion:
        if output_format != OutputFormat.MARKDOWN:
            raise NotImplementedError(f"unsupported output format {output_format}")
        output = _to_markdown_doc(res, output_path, **kwargs)
    conversion_errors = [Error.from_docling(err) for err in res.errors]
    stripped_input = input_document.without_content()
    return Result(
        input=stripped_input, status=status, errors=conversion_errors, output=output
    )
187
+
188
+
189
def _to_markdown_doc(
    res: ConversionResult,
    output_path: Path,
    page_sep: str = DEFAULT_MD_PAGE_SEP,
    **kwargs,
) -> MarkdownDoc:
    """Write the converted document as one markdown file plus its artifacts.

    Pages are exported one at a time so the end offset of each page can be
    recorded. Everything is assembled in a temporary directory which is then
    moved to ``output_path / <artifacts dirname>``.

    Args:
        res: docling conversion result to export.
        output_path: directory under which the document directory is created.
        page_sep: separator appended after every page except the last one.
        **kwargs: forwarded to ``save_as_markdown``.

    Returns:
        A ``MarkdownDoc`` whose path is relative to ``output_path``.

    Raises:
        FileExistsError: if the destination directory already exists.
    """
    # TODO: Should we add a hash to avoid collision between files with same names
    #  nested in the tree structured
    md_dir_name = path_to_artifacts_dirname(res.input.file)
    md_dir = output_path / md_dir_name
    if md_dir.exists():
        raise FileExistsError(f"directory {md_dir} already exists")
    # Let's avoid issue of duplicated input file names flattened top level
    md_filename = md_dir_name + OutputFormat.MARKDOWN
    total_length = 0
    n_pages = len(res.pages)
    end_indices = []

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
        tmp_dir = Path(td)
        page_path = Path("page.md")
        # We do a chdir to bypass a Docling bug which only allows to maintain relative
        # image ref when saving the markdown to a relative path.
        # Text is read and written as UTF-8 explicitly, like the marker backend,
        # instead of relying on the platform's locale encoding.
        with (
            (tmp_dir / md_filename).open("w", encoding="utf-8") as f,
            chdir(tmp_dir),
        ):
            for page_i in range(n_pages):
                res.document.save_as_markdown(
                    page_path,
                    page_no=page_i + 1,
                    image_mode=ImageRefMode.REFERENCED,
                    artifacts_dir=Path(ARTIFACTS),
                    **kwargs,
                )
                # NOTE(review): assumes docling writes the page file as UTF-8 - confirm
                content = page_path.read_text(encoding="utf-8")
                if page_i > 0:
                    content += "\n"
                if page_i < n_pages - 1:
                    content += page_sep
                total_length += len(content)
                end_indices.append(total_length)
                f.write(content)
                f.flush()
                page_path.unlink()
        # The temp dir itself becomes the output dir; its later cleanup then fails
        # harmlessly thanks to ignore_cleanup_errors.
        shutil.move(tmp_dir, md_dir)
    pages = PageIndexes.from_page_end_indices(end_indices)
    return MarkdownDoc(path=Path(md_dir_name), pages=pages)
@@ -0,0 +1,118 @@
1
+ import gc
2
+ from collections.abc import AsyncGenerator, Iterable
3
+ from copy import deepcopy
4
+ from pathlib import Path
5
+ from typing import Any, ClassVar, Self
6
+
7
+ from marker.config.parser import ConfigParser
8
+ from marker.converters.pdf import PdfConverter
9
+ from marker.models import create_model_dict
10
+ from marker.output import text_from_rendered
11
+ from marker.renderers.markdown import MarkdownRenderer
12
+ from PIL.Image import Image
13
+ from pydantic import Field
14
+
15
+ from .constants import ARTIFACTS, CPU_GROUP
16
+ from .objects import (
17
+ InputDoc,
18
+ MarkdownDoc,
19
+ OutputFormat,
20
+ PageIndexes,
21
+ Result,
22
+ Status,
23
+ )
24
+ from .pipeline import Pipeline, PipelineConfig, PipelineType
25
+ from .utils import path_to_artifacts_dirname, report_recoverable_errors
26
+
27
+
28
+ @PipelineConfig.register()
29
+ class MarkerPipelineConfig(PipelineConfig):
30
+ pipeline: PipelineType = Field(frozen=True, default=PipelineType.MARKER)
31
+ task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
32
+
33
+ config: dict[str, Any] = dict()
34
+
35
+
36
+ _MARKER_CONVERSION_ERRORS = tuple()
37
+
38
+
39
@Pipeline.register(PipelineType.MARKER)
class MarkerPipeline(Pipeline):
    """Extraction pipeline backed by marker's ``PdfConverter``."""

    def __init__(self, marker_config: dict[str, Any] | None = None):
        """Store the marker configuration, defaulting to an empty dict."""
        self._marker_config = dict() if marker_config is None else marker_config

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Build a converter for ``output_format`` and yield one ``Result`` per doc."""
        # Deep-copied so that setting the output format never alters the stored config
        run_config = deepcopy(self._marker_config)
        run_config["output_format"] = output_format.to_marker()
        parser = ConfigParser(run_config)
        renderer = parser.get_renderer()
        config_dict = parser.generate_config_dict()
        models = create_model_dict()
        processors = parser.get_processors()
        converter = PdfConverter(
            config=config_dict,
            artifact_dict=models,
            processor_list=processors,
            renderer=renderer,
        )
        for input_doc in docs:
            yield _process_doc(input_doc, converter, output_format, output_path)

    @classmethod
    def _from_config(cls, config: MarkerPipelineConfig) -> Self:
        """Build the pipeline from the ``config`` dict of ``config``."""
        return cls(config.config)
65
+
66
+
67
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
def _process_doc(
    doc: InputDoc,
    converter: PdfConverter,
    output_format: OutputFormat,
    output_path: Path,
) -> Result:
    """Convert one document with marker and write its output under ``output_path``.

    Raises:
        NotImplementedError: if ``output_format`` is not markdown.
    """
    rendered = converter(str(doc.path))
    text, _, images = text_from_rendered(rendered)
    if output_format == OutputFormat.MARKDOWN:
        output = _to_markdown_doc(doc, text, images, output_path)
    else:
        raise NotImplementedError(f"unsupported output format {output_format}")
    return Result(input=doc.without_content(), status=Status.SUCCESS, output=output)
83
+
84
+
85
def _to_markdown_doc(
    input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
) -> MarkdownDoc:
    """Write marker's markdown and images for one document.

    Images are saved to ``<output_path>/<dirname>/<ARTIFACTS>``. The markdown is
    split on marker's page separator and written page by page so the end offset
    of each page can be recorded.

    Returns:
        A ``MarkdownDoc`` whose path is relative to ``output_path``.

    Raises:
        FileExistsError: if the artifacts directory already exists.
    """
    # TODO: Should we add a hash to avoid collision between files with same names
    #  nested in the tree structured
    md_dir_name = path_to_artifacts_dirname(input_doc.path)
    md_dir = output_path / md_dir_name
    artifacts_dir = md_dir / ARTIFACTS
    artifacts_dir.mkdir(parents=True)
    for im_name, im in images.items():
        im.save(artifacts_dir / im_name)
    # Drop the images as soon as they are on disk to release their memory
    del images
    gc.collect()
    page_sep = MarkdownRenderer.page_separator
    page_contents = content.split(page_sep)
    n_pages = len(page_contents)
    # The extension is appended rather than set through with_suffix(), which would
    # truncate a directory name containing a dot ("report.v2_pdf" -> "report.md").
    md_path = md_dir / f"{md_dir_name}{OutputFormat.MARKDOWN.value}"
    total_length = 0
    end_indices = []
    with md_path.open("w", encoding="utf-8") as f:
        for page_i, page_content in enumerate(page_contents):
            chunk = page_content
            if page_i > 0:
                chunk += "\n"
            if page_i < n_pages - 1:
                chunk += page_sep
            total_length += len(chunk)
            end_indices.append(total_length)
            f.write(chunk)
            f.flush()
    pages = PageIndexes.from_page_end_indices(end_indices)
    return MarkdownDoc(path=Path(md_dir_name), pages=pages)
@@ -0,0 +1,212 @@
1
+ import json
2
+ import shutil
3
+ from collections.abc import AsyncGenerator, Callable, Iterable
4
+ from copy import copy
5
+ from enum import StrEnum
6
+ from functools import partial
7
+ from pathlib import Path
8
+ from tempfile import TemporaryDirectory
9
+ from typing import Any, ClassVar, Self
10
+
11
+ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
12
+ union_make as pipeline_union_make,
13
+ )
14
+ from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
15
+ from mineru.cli.common import aio_do_parse
16
+ from mineru.utils.enum_class import MakeMode
17
+ from pydantic import Field
18
+ from pydantic_extra_types.language_code import LanguageAlpha2
19
+
20
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP, MINER_U_GROUP
21
+ from .objects import (
22
+ BaseModel,
23
+ ConversionOutput,
24
+ InputDoc,
25
+ OutputFormat,
26
+ PageIndexes,
27
+ Result,
28
+ Status,
29
+ )
30
+ from .pipeline import Pipeline, PipelineConfig, PipelineType
31
+ from .utils import path_to_artifacts_dirname
32
+
33
+ _MINER_U_CONVERSION_ERRORS = tuple()
34
+ MDMakeFunction = Callable[[list, str, str], str | None]
35
+
36
+
37
+ class MinerUBackend(StrEnum):
38
+ PIPELINE = "pipeline"
39
+ VLM = "vlm"
40
+
41
+
42
class MinerUConfig(BaseModel):
    """MinerU settings, turned into parser keyword arguments by ``as_parse_kwargs``."""

    backend: MinerUBackend = MinerUBackend.PIPELINE
    enable_formula_extraction: bool = True
    enable_table_extraction: bool = True
    # TODO: use enum or literal here
    parse_method: str = "auto"

    # Baseline keyword arguments; the instance settings are layered on top of them
    default_kwargs: ClassVar[dict] = {
        "server_url": None,
        "parse_method": "auto",
        # Markdown is not dumped directly: the middle json is dumped instead so
        # that page indexes can be computed
        "dump_md": False,
        "dump_middle_json": True,
        "f_draw_layout_bbox": False,
        "f_draw_span_bbox": False,
        "f_dump_model_output": False,  # might be useful for debug though
        "f_dump_orig_pdf": False,
        "f_dump_content_list": False,  # might be useful for debug though
        "start_page_id": 0,
        "f_make_md_mode": MakeMode.MM_MD,
        "image_analysis": True,
        "end_page_id": None,
        "client_side_output_generation": False,
    }

    def as_parse_kwargs(self) -> dict[str, Any]:
        """Return a new dict: ``default_kwargs`` overridden by this instance's settings."""
        overrides = {
            "backend": self.backend,
            "parse_method": self.parse_method,
            "formula_enable": self.enable_formula_extraction,
            "table_enable": self.enable_table_extraction,
        }
        return {**self.default_kwargs, **overrides}
75
+
76
+
77
+ @PipelineConfig.register() # noqa: F821
78
+ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
79
+ pipeline: PipelineType = Field(frozen=True, default=PipelineType.MINER_U)
80
+ task_group: ClassVar[str] = Field(frozen=True, default=MINER_U_GROUP)
81
+
82
+ config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
83
+ language: LanguageAlpha2 = Field(frozen=True, default="en")
84
+
85
+
86
@Pipeline.register(PipelineType.MINER_U)
class MinerUPipeline(Pipeline):
    """Extraction pipeline backed by MinerU's ``aio_do_parse``."""

    def __init__(self, config: MinerUConfig, language: str):
        """Keep the settings and pick the markdown builder matching the backend."""
        self._config = config
        self._language = language
        self._md_make_fn = _parse_md_make_fn(config.backend)

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Parse all ``docs`` in one MinerU call, then yield one ``Result`` per doc."""
        documents = list(docs)
        # TODO: exclude files which are not pdf and return an error
        raw_pdfs = [d.path.read_bytes() for d in documents]
        names = [d.path.name for d in documents]
        languages = [self._language] * len(names)
        # TODO: we should only process valid PDFs
        with TemporaryDirectory(prefix="mineru-") as tmp:
            workdir = Path(tmp)
            await aio_do_parse(
                output_dir=workdir,
                pdf_file_names=names,
                pdf_bytes_list=raw_pdfs,
                p_lang_list=languages,
                **self._config.as_parse_kwargs(),
            )
            result_dirs = [
                _revert_mineru_output(workdir, pdf_filename=name) for name in names
            ]
            # Results are consumed inside the block, while the work dir still exists
            for document, result_dir in zip(documents, result_dirs, strict=True):
                yield _process_doc(
                    document,
                    md_make_fn=self._md_make_fn,
                    res_path=result_dir,
                    output_format=output_format,
                    output_path=output_path,
                )

    @classmethod
    def _from_config(cls, config: MinerUPipelineConfig) -> Self:
        """Build the pipeline from the MinerU settings and language of ``config``."""
        return cls(config.config, language=config.language)
126
+
127
+
128
def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
    """Locate the single result directory written for ``pdf_filename``.

    Args:
        output_dir: directory the parser wrote into.
        pdf_filename: name of the sub-directory holding that document's results.

    Returns:
        The only directory found inside ``output_dir / pdf_filename``.

    Raises:
        FileNotFoundError: if ``output_dir / pdf_filename`` does not exist.
        ValueError: if it does not contain exactly one directory.
    """
    output_path = output_dir / pdf_filename
    if not output_path.exists():
        msg = f"couldn't find result for {pdf_filename}"
        raise FileNotFoundError(msg)
    dirs = [p for p in output_path.iterdir() if p.is_dir()]
    if len(dirs) != 1:
        msg = f"expected exactly one result directory, found: {dirs}"
        raise ValueError(msg)
    # iterdir() entries already include output_path; joining them onto output_dir
    # again duplicated the prefix whenever output_dir was a relative path.
    return dirs[0]
138
+
139
+
140
def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
    """Return the markdown builder function matching ``backend``.

    Raises:
        ValueError: if ``backend`` is neither the pipeline nor the VLM backend.
    """
    if backend == MinerUBackend.PIPELINE:
        return pipeline_union_make
    if backend == MinerUBackend.VLM:
        return vlm_union_make
    raise ValueError(f"Unsupported backend: {backend}")
148
+
149
+
150
def _process_doc(
    doc: InputDoc,
    *,
    md_make_fn: MDMakeFunction,
    res_path: Path,
    output_format: OutputFormat,
    output_path: Path,
) -> Result:
    """Build the final output of one document from MinerU's result directory.

    The middle json found in ``res_path`` is rendered page by page, and the
    ``images`` directory of ``res_path`` is moved next to the rendered file.

    Raises:
        NotImplementedError: if ``output_format`` is not markdown.
        FileExistsError: if the document's output directory already exists.
    """
    # Fail early, before anything is created on disk
    if output_format != OutputFormat.MARKDOWN:
        raise NotImplementedError(f"unsupported output format {output_format}")
    md_dir_name = path_to_artifacts_dirname(doc.path)
    md_dir = Path(output_path) / md_dir_name
    md_dir.mkdir(parents=True, exist_ok=False)
    artifacts_dir = md_dir / ARTIFACTS
    # The extension is appended rather than set through with_suffix(), which would
    # truncate a directory name containing a dot ("report.v2_pdf" -> "report.md").
    md_path = md_dir / f"{md_dir_name}{OutputFormat.MARKDOWN.value}"
    im_rel_dir = artifacts_dir.relative_to(md_dir)
    dump_content_fn = partial(
        _dump_md_content,
        md_make_fn=md_make_fn,
        output_path=output_path,
        md_path=md_path,
        im_dir=im_rel_dir,
    )
    middle_json_path = res_path / f"{doc.path.name}_middle.json"
    # JSON is read as UTF-8 explicitly rather than with the locale encoding
    middle_json = json.loads(middle_json_path.read_text(encoding="utf-8"))
    pdf_info = middle_json["pdf_info"]
    shutil.move(res_path / "images", artifacts_dir)
    output = dump_content_fn(pdf_info)
    input_doc = doc.without_content()
    return Result(input=input_doc, status=Status.SUCCESS, output=output)
183
+
184
+
185
def _dump_md_content(
    pdf_info: list[dict],
    *,
    md_make_fn: MDMakeFunction,
    page_sep: str = DEFAULT_MD_PAGE_SEP,
    output_path: Path,
    md_path: Path,
    im_dir: Path,
    md_make_mode: str = MakeMode.MM_MD,
) -> ConversionOutput:
    """Render ``pdf_info`` page by page into ``md_path``.

    Args:
        pdf_info: per-page entries, each rendered on its own by ``md_make_fn``.
        md_make_fn: builds the markdown of a list of pages.
        page_sep: separator appended after every page except the last one.
        output_path: root the returned path is made relative to.
        md_path: markdown file to write.
        im_dir: image directory passed to ``md_make_fn``.
        md_make_mode: mode passed to ``md_make_fn``.

    Returns:
        A ``ConversionOutput`` pointing at the directory of ``md_path`` (relative
        to ``output_path``) together with the page indexes.
    """
    total_length = 0
    end_indices = []
    n_pages = len(pdf_info)
    # Written as UTF-8 explicitly, like the marker backend
    with md_path.open("w", encoding="utf-8") as f:
        for page_i, page in enumerate(pdf_info):
            # MDMakeFunction may return None; treat that as an empty page
            content = md_make_fn([page], md_make_mode, str(im_dir)) or ""
            if page_i > 0:
                content += "\n"
            if page_i < n_pages - 1:
                content += page_sep
            total_length += len(content)
            end_indices.append(total_length)
            f.write(content)
            f.flush()
    pages = PageIndexes.from_page_end_indices(end_indices)
    relative_dir = md_path.parent.relative_to(output_path)
    return ConversionOutput(path=relative_dir, pages=pages)
@@ -0,0 +1,254 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import traceback
6
+ import uuid
7
+ from abc import ABC
8
+ from enum import StrEnum
9
+ from functools import cache
10
+ from io import BytesIO
11
+ from pathlib import Path
12
+ from typing import Annotated, Any, NoReturn, Self
13
+
14
+ from icij_common.pydantic_utils import (
15
+ icij_config,
16
+ merge_configs,
17
+ no_enum_values_config,
18
+ safe_copy,
19
+ )
20
+ from pydantic import AfterValidator, RootModel, TypeAdapter
21
+ from pydantic import BaseModel as _BaseModel
22
+
23
+ try:
24
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem, InputFormat
25
+ from docling.datamodel.document import InputDocument
26
+ from docling_core.types.io import DocumentStream
27
+ except ImportError:
28
+ ConversionStatus, ErrorItem, InputFormat = None, None, None
29
+ InputDocument = None
30
+ DocumentStream = None
31
+
32
+ logger = logging.getLogger(__name__)
33
+ base_config = merge_configs(icij_config(), no_enum_values_config())
34
+
35
+
36
+ class BaseModel(_BaseModel):
37
+ model_config = base_config
38
+
39
+
40
+ class SupportedExt(StrEnum):
41
+ PDF = ".pdf"
42
+
43
+ def to_docling(self) -> InputFormat:
44
+ return InputFormat(self.value[1:])
45
+
46
+
47
+ class OutputFormat(StrEnum):
48
+ MARKDOWN = ".md"
49
+
50
+ @property
51
+ def suffix(self) -> str:
52
+ return self.value[1:]
53
+
54
+ def to_marker(self) -> str:
55
+ match self:
56
+ case OutputFormat.MARKDOWN:
57
+ return "markdown"
58
+ case _:
59
+ raise ValueError(f"{self} is unsupported by marker")
60
+
61
+
62
+ class Status(StrEnum):
63
+ FAILURE = "failure"
64
+ SUCCESS = "success"
65
+ PARTIAL_SUCCESS = "partial_success"
66
+
67
+ @classmethod
68
+ def from_docling(cls, v: Any) -> Self:
69
+ from docling.datamodel.base_models import ConversionStatus # noqa: PLC0415
70
+
71
+ if v is ConversionStatus.SUCCESS:
72
+ return cls.SUCCESS
73
+ if v is ConversionStatus.PARTIAL_SUCCESS:
74
+ return cls.PARTIAL_SUCCESS
75
+ if isinstance(v, ConversionStatus):
76
+ return cls.FAILURE
77
+ raise TypeError(f"can't convert {v!r} to {cls.__name__!r}")
78
+
79
+ @property
80
+ def allows_conversion(self) -> bool:
81
+ return self is Status.SUCCESS or self is Status.PARTIAL_SUCCESS
82
+
83
+
84
class Error(BaseModel):
    """Serializable description of an error met while processing a document."""

    id: str
    title: str
    detail: str

    @classmethod
    def from_exception(cls, exception: BaseException) -> Self:
        """Build an error from an exception.

        The title is the exception class name, the detail is the exception message
        followed by its formatted traceback, and the id is the dashed lower-case
        title suffixed with a random UUID.
        """
        title = exception.__class__.__name__
        formatted_trace = "".join(
            traceback.format_exception(
                None, value=exception, tb=exception.__traceback__
            )
        )
        detail = f"{exception}\n{formatted_trace}"
        error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
        return cls(id=error_id, title=title, detail=detail)

    @classmethod
    def from_docling(cls, docling_error: ErrorItem) -> Self:
        """Build an error from a docling error item (module, component and message)."""
        title = "DoclingConversionError"
        error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
        location = (
            f"error in module {docling_error.module_name} of"
            f" {docling_error.component_type}"
        )
        detail = f"{location}:\n{docling_error.error_message}"
        return cls(id=error_id, title=title, detail=detail)
109
+
110
+
111
def _id_title(title: str) -> str:
    """Lower-case ``title``, inserting a dash before every upper-case letter but the first.

    For instance ``"ValueError"`` becomes ``"value-error"``.
    """
    return "".join(
        f"-{letter.lower()}" if position and letter.isupper() else letter.lower()
        for position, letter in enumerate(title)
    )
118
+
119
+
120
class InputDoc(BaseModel):
    """Document to process: a path, its extension and optionally its raw bytes."""

    ext: SupportedExt
    path: Path
    content: bytes | None = None

    @classmethod
    def from_path(cls, path: str | Path) -> Self:
        """Build an input document from a path, deriving ``ext`` from its suffix."""
        path = Path(path) if isinstance(path, str) else path
        return cls(path=path, ext=SupportedExt(path.suffix))

    def to_docling(self) -> Path | DocumentStream:
        """Return what docling should read for this document.

        A stream is returned when the content is held in memory, or when the path
        has no suffix (the file is then read from disk); otherwise the path itself.
        """
        if self.content is not None:
            payload = self.content
        elif not self.path.suffix:
            payload = self.path.read_bytes()
        else:
            return self.path
        return DocumentStream(name=str(self.path), stream=BytesIO(payload))

    def without_content(self) -> Self:
        """Return a copy of this document whose ``content`` is ``None``."""
        return safe_copy(self, update={"content": None})
143
+
144
+
145
class PageIndexes(RootModel[list[tuple[int, int]]]):
    """List of ``(start, end)`` index pairs, one per page."""

    @classmethod
    def from_page_end_indices(cls, lengths: list[int]) -> Self:
        """Pair every page end index with the previous one (0 for the first page).

        NOTE(review): this returns a plain list of tuples rather than an instance
        of the class, despite the ``Self`` annotation.
        """
        starts = [0, *lengths[:-1]]
        return list(zip(starts, lengths))
152
+
153
+
154
+ class ConversionOutput(BaseModel):
155
+ path: Path
156
+ pages: PageIndexes = []
157
+
158
+
159
class MarkdownDoc(ConversionOutput):
    """Conversion output for markdown; declares no field of its own."""

    # NOTE(review): wrapping a property in @classmethod was deprecated in Python
    # 3.11 and removed in 3.13 - confirm this attribute still behaves as a class
    # level property on every supported interpreter.
    @classmethod
    @property
    @cache
    def _valid_conversion_statuses(cls) -> set[ConversionStatus]:
        # Imported inside the function rather than at module level
        from docling.datamodel.base_models import ConversionStatus  # noqa: PLC0415

        # The two docling statuses listed here: success and partial success
        return {ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS}
167
+
168
+
169
def _input_should_not_have_content(value: InputDoc) -> InputDoc:
    """Validator returning ``value`` unchanged when it carries no content.

    Raises:
        ValueError: if ``value.content`` is set.
    """
    if value.content is None:
        return value
    raise ValueError(f"response input can't have content, but got {value}")
173
+
174
+
175
+ class _BaseResult(BaseModel, ABC):
176
+ input: InputDoc
177
+ status: Status
178
+ errors: list[Error] = []
179
+
180
+
181
class Result(_BaseResult):
    """Processing result of one document, with the produced output if any."""

    # TODO: we could also use generics here when we add more output formats
    output: ConversionOutput | None

    def to_response(self) -> ResponseResult:
        """Convert this result into its response counterpart.

        The input content is dropped. ``output_path`` is ``None`` when no output
        was produced, which is the case of failed results.
        """
        # Failed results carry no output; reading ``.path`` on them used to raise
        output_path = None if self.output is None else self.output.path
        return ResponseResult(
            input=self.input.without_content(),
            status=self.status,
            errors=self.errors,
            output_path=output_path,
        )


class ResponseResult(_BaseResult):
    """Result as returned to clients: content-free input and the output location."""

    input: Annotated[InputDoc, AfterValidator(func=_input_should_not_have_content)]
    # None when the document produced no output
    output_path: Path | None = None
197
+
198
+
199
+ class ExtractionResponse(BaseModel):
200
+ results: list[ResponseResult]
201
+
202
+
203
+ _INPUT_DOCS_ADAPTER = TypeAdapter(list[InputDoc | Path])
204
+
205
+
206
def parse_extraction_request(
    docs: str | list[dict | str], *, data_dir: Path
) -> list[InputDoc]:
    """Turn an extraction request into a list of input documents.

    Args:
        docs: either a directory (relative to ``data_dir``) explored recursively
            for supported files, or a list of document descriptions and/or paths.
        data_dir: root directory; listed paths are made relative to it.

    Returns:
        The input documents, in request order for lists.

    Raises:
        ValueError: if a listed path has no extension or is not under ``data_dir``.
    """
    if isinstance(docs, str):
        logger.debug("exploring files in %s", data_dir.absolute())
        docs_dir = Path(data_dir) / docs
        found = _as_input_docs(docs_dir)
        msg = "found %s"
        if len(found) > 10:
            msg += ", and more..."
        # The computed message is now the one logged; it was built but never used
        logger.debug(msg, found[:10])
        return found
    parsed = _INPUT_DOCS_ADAPTER.validate_python(docs)
    input_docs = []
    unknown_exts = []
    # Every entry is handled on its own type, so lists mixing paths and document
    # descriptions work too instead of being dispatched on the first element.
    for doc in parsed:
        if not isinstance(doc, Path):
            input_docs.append(doc)
            continue
        _, ext = os.path.splitext(str(doc))
        if not ext:
            unknown_exts.append(doc)
        else:
            input_docs.append(InputDoc.from_path(path=doc.relative_to(data_dir)))
    if unknown_exts:
        raise ValueError(f"found files with unknown extensions {unknown_exts}")
    return input_docs
234
+
235
+
236
def _raise(err: OSError) -> NoReturn:
    """Raise ``err``; passed to ``os.walk`` as its ``onerror`` callback."""
    raise err


def _as_input_docs(
    docs_dir: Path, *, supported_ext: set[str] | None = None
) -> list[InputDoc]:
    """Recursively collect the supported files of ``docs_dir``, sorted by path.

    Files without an extension, or whose extension is not in ``supported_ext``
    (every ``SupportedExt`` value by default), are skipped. Errors met while
    walking the tree are raised.
    """
    allowed = (
        {member.value for member in SupportedExt}
        if supported_ext is None
        else supported_ext
    )
    collected = [
        InputDoc.from_path(path=Path(root) / filename)
        for root, _, filenames in os.walk(docs_dir, onerror=_raise)
        for filename in filenames
        if Path(filename).suffix and Path(filename).suffix in allowed
    ]
    collected.sort(key=lambda doc: doc.path)
    return collected
@@ -0,0 +1,36 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections.abc import AsyncGenerator, Iterable
3
+ from enum import StrEnum
4
+ from pathlib import Path
5
+ from typing import ClassVar
6
+
7
+ from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_values_config
8
+ from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
9
+ from pydantic import Field
10
+
11
+ from .objects import InputDoc, OutputFormat, Result
12
+
13
+ StructuredContent = str
14
+
15
+
16
+ class PipelineType(StrEnum):
17
+ DOCLING = "docling"
18
+ MARKER = "marker"
19
+ MINER_U = "miner_u"
20
+
21
+
22
+ class PipelineConfig(RegistrableConfig, ABC):
23
+ # TODO: move this icij_config() to RegistrableConfig
24
+ model_config = merge_configs(icij_config(), no_enum_values_config())
25
+
26
+ registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
27
+ pipeline: PipelineType
28
+
29
+ task_group: ClassVar[str] = Field(frozen=True)
30
+
31
+
32
+ class Pipeline(RegistrableFromConfig, ABC):
33
+ @abstractmethod
34
+ async def extract_content(
35
+ self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
36
+ ) -> AsyncGenerator[Result, None]: ...
@@ -0,0 +1,70 @@
1
+ import os
2
+ from collections.abc import Callable, Generator, Iterable, Iterator
3
+ from contextlib import contextmanager
4
+ from functools import wraps
5
+ from itertools import tee
6
+ from pathlib import Path, PurePath
7
+ from typing import Protocol, TypeVar
8
+
9
+ from .objects import Error, InputDoc, Result, Status
10
+
11
+ R = TypeVar("R")
12
+ T = TypeVar("T")
13
+
14
+
15
def map_and_preserve(
    fn: Callable[[Iterable[T]], Iterator[R]], inputs: Iterable[T]
) -> tuple[Iterable[T], Iterator[R]]:
    """Apply ``fn`` to ``inputs`` while keeping an independent iterator over them.

    Returns:
        ``(inputs copy, outputs)``: both are lazy, so the inputs can later be
        iterated alongside the outputs of ``fn``.
    """
    preserved, consumed = tee(inputs)
    return preserved, iter(fn(consumed))
21
+
22
+
23
def all_subclasses(cls: type[T]) -> set[type[T]]:
    """Return every direct and indirect subclass of ``cls`` (``cls`` excluded)."""
    found = set()
    pending = list(cls.__subclasses__())
    while pending:
        subclass = pending.pop()
        if subclass in found:
            continue
        found.add(subclass)
        pending.extend(subclass.__subclasses__())
    return found
27
+
28
+
29
def path_to_artifacts_dirname(path: PurePath, sep: str = "_") -> str:
    """Derive a directory name from a file name, keeping its extension visible.

    The last suffix is folded into the name using ``sep``: ``report.pdf`` gives
    ``report_pdf``. A name without suffix is returned unchanged.

    Args:
        path: path whose final component is used.
        sep: separator placed between the stem and the extension.
    """
    ext = path.suffix
    if not ext:
        # Slicing with ``-len("")`` used to yield an empty name here
        return path.name
    return f"{path.stem}{sep}{ext[1:]}"
35
+
36
+
37
+ class DocProcessingFn(Protocol):
38
+ def __call__(self, doc: InputDoc, *arg, **kwargs) -> Result: ...
39
+
40
+
41
def report_recoverable_errors(
    recoverable_errors: tuple[type[Exception], ...] = tuple(),
) -> Callable[[DocProcessingFn], DocProcessingFn]:
    """Decorator factory turning selected exceptions into failure results.

    When the decorated function raises one of ``recoverable_errors``, a ``Result``
    with a failure status, the content-free input doc and the error is returned
    instead. Any other exception propagates.
    """

    def decorate(processing_fn: DocProcessingFn) -> DocProcessingFn:
        @wraps(processing_fn)
        def guarded(doc: InputDoc, *args, **kwargs) -> Result:
            try:
                result = processing_fn(doc, *args, **kwargs)
            except recoverable_errors as exc:
                return Result(
                    input=doc.without_content(),
                    status=Status.FAILURE,
                    errors=[Error.from_exception(exc)],
                    output=None,
                )
            return result

        return guarded

    return decorate
61
+
62
+
63
@contextmanager
def chdir(path: Path) -> Generator[None, None, None]:
    """Run the ``with`` body from ``path``, then return to the previous directory.

    The previous working directory is restored even when the body, or the
    directory change itself, raises.
    """
    previous_dir = Path.cwd()
    try:
        os.chdir(path)
        yield
    finally:
        os.chdir(previous_dir)
+ os.chdir(cwd)
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.4
2
+ Name: extract-python
3
+ Version: 0.1.0
4
+ Summary: Structured content extraction
5
+ Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
+ Project-URL: Repository, https://github.com/ICIJ/extract-python
7
+ Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
+ Author-email: Clément Doumouro <cdoumouro@icij.org>
9
+ Requires-Python: <3.14,>=3.11
10
+ Requires-Dist: icij-common~=0.8.2
11
+ Provides-Extra: benches
12
+ Requires-Dist: html2image~=2.0.7; extra == 'benches'
13
+ Requires-Dist: markdown2>=2.5.4; extra == 'benches'
14
+ Requires-Dist: notebook>=7.4.5; extra == 'benches'
15
+ Requires-Dist: pypdfium2>=4.30.0; extra == 'benches'
16
+ Provides-Extra: docling
17
+ Requires-Dist: docling-slim[feat-ocr-easyocr,feat-ocr-mac,feat-ocr-tesserocr,standard]~=2.96; extra == 'docling'
18
+ Provides-Extra: marker
19
+ Requires-Dist: marker-pdf~=1.10; extra == 'marker'
20
+ Provides-Extra: mineru
21
+ Requires-Dist: mineru[mlx]~=3.2; (sys_platform == 'darwin') and extra == 'mineru'
22
+ Requires-Dist: mineru[pipeline,vlm]~=3.2; extra == 'mineru'
23
+ Requires-Dist: pydantic-extra-types[pycountry]~=2.11; extra == 'mineru'
24
+ Requires-Dist: six~=1.17; extra == 'mineru'
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=FMDsPVz05sGMPvIOX90lOLygWp6nC5DEjRfgx_ESPJ4,8530
4
+ extract_python/marker_.py,sha256=z3PkUUStC-E78HhqByMwJ7re6-I7YUQzSxWToegHrUQ,4060
5
+ extract_python/miner_u.py,sha256=f5pvLvay1ThBXNOI1R276aWSWsk5mhIPzWVjCy2u_lw,7493
6
+ extract_python/objects.py,sha256=gTyGA5gaMAmW5P_PbAO2LNMqtP69CxlknebBFTojiwQ,7322
7
+ extract_python/pipeline.py,sha256=qUgGar1rlYQgNz78BcUT1nQRsG3hy5UwpCl0e-0V77I,1098
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.1.0.dist-info/METADATA,sha256=wyYMrleKk9yUU1UaTYT0EsGpw_e3qbE8LOBanyLv0Qg,1132
10
+ extract_python-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any