extract-python 0.2.1__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_python-0.2.1 → extract_python-0.3.1}/PKG-INFO +1 -1
- extract_python-0.3.1/extract_python/docling_.py +203 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/marker_.py +17 -11
- {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/miner_u.py +43 -30
- {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/objects.py +43 -2
- {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/pipeline.py +1 -1
- {extract_python-0.2.1 → extract_python-0.3.1}/pyproject.toml +1 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/uv.lock +28 -2
- extract_python-0.2.1/extract_python/docling_.py +0 -258
- {extract_python-0.2.1 → extract_python-0.3.1}/.dockerignore +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/.github/workflows/publish.yml +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/.github/workflows/tests.yml +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/.gitignore +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/.python-version +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/Dockerfile +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/README.md +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/benches/__init__.py +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/benches/compare.ipynb +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/benches/compare.py +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/benches/constants.py +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/data/.gitignore +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/docker-compose.yml +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/extract +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/__init__.py +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/constants.py +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/utils.py +0 -0
- {extract_python-0.2.1 → extract_python-0.3.1}/qa/ruff.toml +0 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
import tempfile
|
|
3
|
+
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
|
+
from functools import cache
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
|
|
7
|
+
|
|
8
|
+
from icij_common.registrable import FromConfig
|
|
9
|
+
from pydantic import AfterValidator, Field
|
|
10
|
+
|
|
11
|
+
from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
|
|
12
|
+
from .objects import (
|
|
13
|
+
Error,
|
|
14
|
+
InputDoc,
|
|
15
|
+
MarkdownDoc,
|
|
16
|
+
OutputFormat,
|
|
17
|
+
PageIndexes,
|
|
18
|
+
Result,
|
|
19
|
+
Status,
|
|
20
|
+
SupportedExt,
|
|
21
|
+
)
|
|
22
|
+
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
23
|
+
from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
|
|
24
|
+
|
|
25
|
+
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from docling.datamodel.base_models import InputFormat
|
|
29
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
|
30
|
+
from docling.document_converter import ConversionResult, FormatOption
|
|
31
|
+
from docling_core.types.io import DocumentStream
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
    """Reject PDF pipeline options that do not generate picture images.

    Options that are not ``PdfPipelineOptions`` instances are accepted as-is.

    Raises:
        ValueError: if ``opts`` is a ``PdfPipelineOptions`` whose
            ``generate_picture_images`` attribute is falsy.
    """
    # docling is imported inside the function rather than at module level.
    from docling.datamodel.pipeline_options import PdfPipelineOptions

    if not isinstance(opts, PdfPipelineOptions):
        return
    if opts.generate_picture_images:
        return
    raise ValueError("generate_picture_images should be set to true")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _validate_options(
|
|
43
|
+
data: dict["InputFormat", "FormatOption"],
|
|
44
|
+
) -> dict["InputFormat", "FormatOption"]:
|
|
45
|
+
for opts in data.values():
|
|
46
|
+
_validate_pipeline_opts(opts.pipeline_options)
|
|
47
|
+
return data
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@cache
def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
    """Build the default docling format options.

    Only PDF gets an explicit option: a ``PdfFormatOption`` whose pipeline
    options use EasyOCR and have ``generate_picture_images`` enabled.

    Returns:
        A mapping from ``InputFormat`` to ``FormatOption``. The function is
        cached, so every call returns the same dict object.
    """
    # InputFormat is needed at runtime here, so it is imported locally like the
    # other docling names (supported_exts does the same).
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        EasyOcrOptions,
        PdfPipelineOptions,
    )
    from docling.document_converter import PdfFormatOption

    return {
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=PdfPipelineOptions(
                ocr_options=EasyOcrOptions(), generate_picture_images=True
            )
        ),
    }
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
T = TypeVar("T")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _find_subcls(cls: type[T], name: str) -> type[T]:
    """Return the subclass of ``cls`` whose ``__name__`` equals ``name``.

    Candidates come from ``all_subclasses(cls)``; the first match wins.

    Raises:
        ValueError: if no candidate is named ``name``.
    """
    found = next(
        (sub for sub in all_subclasses(cls) if sub.__name__ == name), None
    )
    if found is None:
        raise ValueError(f"unknown {cls.__name__} subclass {name}")
    return found
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Configuration of the docling pipeline: which pipeline type it selects, the
# docling format options handed to the converter, and the file extensions the
# pipeline accepts.
@PipelineConfig.register()
class DoclingPipelineConfig(PipelineConfig):
    pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
    # NOTE(review): this is a ClassVar assigned a pydantic ``Field(...)``;
    # confirm readers of ``task_group`` get the plain CPU_GROUP string they
    # presumably expect.
    task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)

    # Validated after parsing by _validate_options; defaults to the options
    # built by _default_format_opts.
    format_options: Annotated[
        dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
    ] = Field(default_factory=_default_format_opts)

    # Docling input formats excluded from supported_exts().
    # NOTE(review): InputFormat is evaluated when the class body runs; confirm
    # it is importable at runtime and not only under TYPE_CHECKING.
    _unsupported_input_formats: ClassVar[set[InputFormat]] = {
        InputFormat.AUDIO,
        InputFormat.METS_GBS,
        InputFormat.VTT,
    }

    @classmethod
    @cache
    def supported_exts(cls) -> set[SupportedExt]:
        """Return the extensions of every docling format this pipeline accepts.

        Extensions are taken from docling's ``FormatToExtensions`` for each
        ``InputFormat`` not listed in ``_unsupported_input_formats``, lowercased
        and prefixed with a dot. The result is cached per class.
        """
        from docling.datamodel.base_models import FormatToExtensions, InputFormat

        supported = set()
        for f in InputFormat:
            if f in cls._unsupported_input_formats:
                continue
            for ext in FormatToExtensions[f]:
                try:
                    supported.add(SupportedExt(f".{ext.lower()}"))
                except ValueError:
                    # Extension with no SupportedExt member: skip it instead
                    # of failing, as _ext_to_docling_input_format does in
                    # objects.py.
                    continue
        return supported
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@Pipeline.register(PipelineType.DOCLING)
class DoclingPipeline(Pipeline):
    """Pipeline converting input documents with a docling ``DocumentConverter``."""

    def __init__(
        self, format_options: dict["InputFormat", "FormatOption"] | None = None
    ):
        """Create the underlying docling converter.

        Args:
            format_options: optional docling format options, passed to
                ``DocumentConverter`` unchanged.
        """
        # Annotations are quoted like the module-level helpers: the docling
        # type names are only needed by type checkers here.
        from docling.document_converter import DocumentConverter

        # Different extensions may resolve to the same docling format;
        # dict.fromkeys drops the duplicates and keeps first-seen order.
        allowed_format = list(
            dict.fromkeys(
                f.to_docling() for f in DoclingPipelineConfig.supported_exts()
            )
        )
        self._converter = DocumentConverter(
            allowed_formats=allowed_format, format_options=format_options
        )

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Convert ``docs`` and yield one ``Result`` per input document.

        Args:
            docs: documents to convert.
            output_format: format of the generated output.
            output_path: directory receiving the generated output.
        """
        # presumably map_and_preserve returns the original docs alongside
        # their docling counterparts so both can be zipped below — verify
        # against utils.
        docs, path_or_streams = map_and_preserve(_to_docling, docs)
        # raises_on_error=False: failures are reported through each result
        # rather than raised by convert_all.
        outputs = self._converter.convert_all(path_or_streams, raises_on_error=False)
        for doc, res in zip(docs, outputs, strict=True):
            yield _to_result(res, doc, output_format, output_path=output_path)

    @classmethod
    def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
        """Build the pipeline from the ``format_options`` of ``config``."""
        return cls(config.format_options)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | "DocumentStream"]:
    """Lazily yield ``doc.to_docling()`` for each document, in input order."""
    yield from (doc.to_docling() for doc in docs)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _to_result(
    res: "ConversionResult",
    input_document: InputDoc,
    output_format: OutputFormat,
    output_path: Path,
    **kwargs,
) -> Result:
    """Turn a docling conversion result into a ``Result``.

    ``output_path`` is created if needed. An output is only generated when the
    status derived from ``res.status`` allows conversion; otherwise the result
    carries ``output=None``. Extra keyword arguments are forwarded to the
    markdown writer.

    Raises:
        NotImplementedError: if conversion is allowed and ``output_format`` is
            not markdown.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    status = Status.from_docling(res.status)
    converted = None
    if status.allows_conversion:
        if output_format != OutputFormat.MARKDOWN:
            raise NotImplementedError(f"unsupported output format {output_format}")
        converted = _to_markdown_doc(res, output_path, **kwargs)
    conversion_errors = [Error.from_docling(err) for err in res.errors]
    stripped_input = input_document.without_content()
    return Result(
        input=stripped_input,
        status=status,
        errors=conversion_errors,
        output=converted,
    )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _to_markdown_doc(
    res: "ConversionResult",
    output_path: Path,
    page_sep: str = DEFAULT_MD_PAGE_SEP,
    **kwargs,
) -> MarkdownDoc:
    """Write the converted document as a single markdown file plus artifacts.

    Each page is exported separately by docling, then appended to one markdown
    file while the end offset of each page is recorded. Everything is produced
    in a temporary directory which is finally moved to
    ``output_path / <artifacts dirname>``.

    Args:
        res: docling conversion result to export.
        output_path: directory under which the markdown directory is created.
        page_sep: separator appended after every page except the last one.
        **kwargs: forwarded to ``res.document.save_as_markdown``.

    Returns:
        A ``MarkdownDoc`` whose path is the markdown directory name and whose
        pages are built from the recorded page end offsets.

    Raises:
        FileExistsError: if the destination directory already exists.
    """
    from docling_core.types.doc import ImageRefMode

    # TODO: Should we add a hash to avoid collisions between files with the same
    #  name nested in the tree structure
    md_dir_name = path_to_artifacts_dirname(res.input.file)
    md_dir = output_path / md_dir_name
    if md_dir.exists():
        raise FileExistsError(f"directory {md_dir} already exists")
    # Let's avoid issue of duplicated input file names flattened top level
    md_filename = md_dir_name + OutputFormat.MARKDOWN
    total_length = 0
    # NOTE(review): nothing is written when res.pages is empty — confirm that
    # every supported input format populates it.
    n_pages = len(res.pages)
    end_indices = []

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
        tmp_dir = Path(td)
        page_path = Path("page.md")
        # We do a chdir to bypass a Docling bug which only allows to maintain relative
        # image ref when saving the markdown to a relative path.
        # The encoding is pinned to UTF-8 so that reading the page files and
        # writing the aggregated markdown do not depend on the platform locale.
        with (
            (tmp_dir / md_filename).open("w", encoding="utf-8") as f,
            chdir(tmp_dir),
        ):
            for page_i in range(n_pages):
                res.document.save_as_markdown(
                    page_path,
                    page_no=page_i + 1,
                    image_mode=ImageRefMode.REFERENCED,
                    artifacts_dir=Path(ARTIFACTS),
                    **kwargs,
                )
                content = page_path.read_text(encoding="utf-8")
                if page_i > 0:
                    content += "\n"
                if page_i < n_pages - 1:
                    content += page_sep
                # Offsets are counted in characters of the aggregated markdown.
                total_length += len(content)
                end_indices.append(total_length)
                f.write(content)
                page_path.unlink()
        # The file is closed and the working directory restored before the
        # temporary directory is moved to its final location.
        shutil.move(tmp_dir, md_dir)
    pages = PageIndexes.from_page_end_indices(end_indices)
    return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
|
@@ -3,14 +3,8 @@ from collections.abc import AsyncGenerator, Iterable
|
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from functools import cache
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, ClassVar, Self
|
|
7
|
-
|
|
8
|
-
from marker.config.parser import ConfigParser
|
|
9
|
-
from marker.converters.pdf import PdfConverter
|
|
10
|
-
from marker.models import create_model_dict
|
|
11
|
-
from marker.output import text_from_rendered
|
|
12
|
-
from marker.renderers.markdown import MarkdownRenderer
|
|
13
|
-
from PIL.Image import Image
|
|
6
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Self
|
|
7
|
+
|
|
14
8
|
from pydantic import Field
|
|
15
9
|
|
|
16
10
|
from .constants import ARTIFACTS, CPU_GROUP
|
|
@@ -26,6 +20,10 @@ from .objects import (
|
|
|
26
20
|
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
27
21
|
from .utils import path_to_artifacts_dirname, report_recoverable_errors
|
|
28
22
|
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from marker.converters.pdf import PdfConverter
|
|
25
|
+
from PIL import Image
|
|
26
|
+
|
|
29
27
|
|
|
30
28
|
@PipelineConfig.register()
|
|
31
29
|
class MarkerPipelineConfig(PipelineConfig):
|
|
@@ -36,7 +34,7 @@ class MarkerPipelineConfig(PipelineConfig):
|
|
|
36
34
|
|
|
37
35
|
@classmethod
|
|
38
36
|
@cache
|
|
39
|
-
def
|
|
37
|
+
def supported_exts(cls) -> set[SupportedExt]:
|
|
40
38
|
# Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
|
|
41
39
|
return {
|
|
42
40
|
SupportedExt.PDF,
|
|
@@ -75,6 +73,10 @@ class MarkerPipeline(Pipeline):
|
|
|
75
73
|
async def extract_content(
|
|
76
74
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
77
75
|
) -> AsyncGenerator[Result, None]:
|
|
76
|
+
from marker.config.parser import ConfigParser
|
|
77
|
+
from marker.converters.pdf import PdfConverter
|
|
78
|
+
from marker.models import create_model_dict
|
|
79
|
+
|
|
78
80
|
config = deepcopy(self._marker_config)
|
|
79
81
|
config["output_format"] = output_format.to_marker()
|
|
80
82
|
config_parser = ConfigParser(config)
|
|
@@ -96,10 +98,12 @@ class MarkerPipeline(Pipeline):
|
|
|
96
98
|
@report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
|
|
97
99
|
def _process_doc(
|
|
98
100
|
doc: InputDoc,
|
|
99
|
-
converter: PdfConverter,
|
|
101
|
+
converter: "PdfConverter",
|
|
100
102
|
output_format: OutputFormat,
|
|
101
103
|
output_path: Path,
|
|
102
104
|
) -> Result:
|
|
105
|
+
from marker.output import text_from_rendered
|
|
106
|
+
|
|
103
107
|
rendered = converter(str(doc.path))
|
|
104
108
|
content, _, images = text_from_rendered(rendered)
|
|
105
109
|
match output_format:
|
|
@@ -112,8 +116,10 @@ def _process_doc(
|
|
|
112
116
|
|
|
113
117
|
|
|
114
118
|
def _to_markdown_doc(
|
|
115
|
-
input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
|
|
119
|
+
input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
|
|
116
120
|
) -> MarkdownDoc:
|
|
121
|
+
from marker.renderers.markdown import MarkdownRenderer
|
|
122
|
+
|
|
117
123
|
# TODO: Should we add a hash to avoid collision between files with same names
|
|
118
124
|
# nested in the tree structured
|
|
119
125
|
md_dir_name = path_to_artifacts_dirname(input_doc.path)
|
|
@@ -8,12 +8,6 @@ from pathlib import Path
|
|
|
8
8
|
from tempfile import TemporaryDirectory
|
|
9
9
|
from typing import Any, ClassVar, Self
|
|
10
10
|
|
|
11
|
-
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
|
|
12
|
-
union_make as pipeline_union_make,
|
|
13
|
-
)
|
|
14
|
-
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
15
|
-
from mineru.cli.common import aio_do_parse
|
|
16
|
-
from mineru.utils.enum_class import MakeMode
|
|
17
11
|
from pydantic import Field
|
|
18
12
|
from pydantic_extra_types.language_code import LanguageAlpha2
|
|
19
13
|
|
|
@@ -47,33 +41,39 @@ class MinerUConfig(BaseModel):
|
|
|
47
41
|
# TODO: use enum or literal here
|
|
48
42
|
parse_method: str = "auto"
|
|
49
43
|
|
|
50
|
-
default_kwargs: ClassVar[dict] = {
|
|
51
|
-
"server_url": None,
|
|
52
|
-
# We don't dump md directly we process, we dump the middle json in order to be
|
|
53
|
-
# able to get page indexes
|
|
54
|
-
"parse_method": "auto",
|
|
55
|
-
"dump_md": False,
|
|
56
|
-
"dump_middle_json": True,
|
|
57
|
-
"f_draw_layout_bbox": False,
|
|
58
|
-
"f_draw_span_bbox": False,
|
|
59
|
-
"f_dump_model_output": False, # might be useful for debug though
|
|
60
|
-
"f_dump_orig_pdf": False,
|
|
61
|
-
"f_dump_content_list": False, # might be useful for debug though
|
|
62
|
-
"start_page_id": 0,
|
|
63
|
-
"f_make_md_mode": MakeMode.MM_MD,
|
|
64
|
-
"image_analysis": True,
|
|
65
|
-
"end_page_id": None,
|
|
66
|
-
"client_side_output_generation": False,
|
|
67
|
-
}
|
|
68
|
-
|
|
69
44
|
def as_parse_kwargs(self) -> dict[str, Any]:
    """Return the parse keyword arguments for this configuration.

    Starts from a copy of the class defaults, then overrides the backend,
    parse method, formula and table switches with this instance's values.
    """
    overrides = {
        "backend": self.backend,
        "parse_method": self.parse_method,
        "formula_enable": self.enable_formula_extraction,
        "table_enable": self.enable_table_extraction,
    }
    # Copy first: the defaults dict is cached and shared between calls.
    parse_kwargs = dict(self._get_default_kwargs())
    parse_kwargs.update(overrides)
    return parse_kwargs
|
|
76
51
|
|
|
52
|
+
@classmethod
@cache
def _get_default_kwargs(cls) -> dict[str, Any]:
    """Return the default parse keyword arguments.

    The result is cached, so the same dict object is returned on every call;
    callers should copy it before modifying it.
    """
    # MakeMode is imported inside the function rather than at module level.
    from mineru.utils.enum_class import MakeMode

    defaults: dict[str, Any] = dict(
        server_url=None,
        # We don't dump md directly we process, we dump the middle json in order to be
        # able to get page indexes
        parse_method="auto",
        dump_md=False,
        dump_middle_json=True,
        f_draw_layout_bbox=False,
        f_draw_span_bbox=False,
        f_dump_model_output=False,  # might be useful for debug though
        f_dump_orig_pdf=False,
        f_dump_content_list=False,  # might be useful for debug though
        start_page_id=0,
        f_make_md_mode=MakeMode.MM_MD,
        image_analysis=True,
        end_page_id=None,
        client_side_output_generation=False,
    )
    return defaults
|
|
76
|
+
|
|
77
77
|
|
|
78
78
|
@PipelineConfig.register() # noqa: F821
|
|
79
79
|
class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
@@ -85,7 +85,7 @@ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
|
|
|
85
85
|
|
|
86
86
|
@classmethod
|
|
87
87
|
@cache
|
|
88
|
-
def
|
|
88
|
+
def supported_exts(cls) -> set[SupportedExt]:
|
|
89
89
|
return {
|
|
90
90
|
SupportedExt.PDF,
|
|
91
91
|
SupportedExt.DOCX,
|
|
@@ -104,6 +104,8 @@ class MinerUPipeline(Pipeline):
|
|
|
104
104
|
async def extract_content(
|
|
105
105
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
106
106
|
) -> AsyncGenerator[Result, None]:
|
|
107
|
+
from mineru.cli.common import aio_do_parse
|
|
108
|
+
|
|
107
109
|
docs = list(docs)
|
|
108
110
|
# TODO: exclude files which are not pdf and return an error
|
|
109
111
|
pdfs_bytes = [d.path.read_bytes() for d in docs]
|
|
@@ -149,11 +151,18 @@ def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
|
|
|
149
151
|
|
|
150
152
|
|
|
151
153
|
def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
    """Return the ``union_make`` function matching ``backend``.

    The mineru module of the selected backend is imported only when needed.

    Raises:
        ValueError: if ``backend`` is neither the pipeline nor the VLM backend.
    """
    if backend == MinerUBackend.PIPELINE:
        from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
            union_make as make_md,
        )

        return make_md
    if backend == MinerUBackend.VLM:
        from mineru.backend.vlm.vlm_middle_json_mkcontent import (
            union_make as make_md,
        )

        return make_md
    raise ValueError(f"Unsupported backend: {backend}")
|
|
159
168
|
|
|
@@ -201,8 +210,12 @@ def _dump_md_content(
|
|
|
201
210
|
output_path: Path,
|
|
202
211
|
md_path: Path,
|
|
203
212
|
im_dir: Path,
|
|
204
|
-
md_make_mode: str =
|
|
213
|
+
md_make_mode: str | None = None,
|
|
205
214
|
) -> ConversionOutput:
|
|
215
|
+
from mineru.utils.enum_class import MakeMode
|
|
216
|
+
|
|
217
|
+
if md_make_mode is None:
|
|
218
|
+
md_make_mode = MakeMode.MM_MD
|
|
206
219
|
total_length = 0
|
|
207
220
|
end_indices = []
|
|
208
221
|
with md_path.open("w") as f:
|
|
@@ -21,7 +21,12 @@ from pydantic import AfterValidator, RootModel, TypeAdapter
|
|
|
21
21
|
from pydantic import BaseModel as _BaseModel
|
|
22
22
|
|
|
23
23
|
try:
|
|
24
|
-
from docling.datamodel.base_models import
|
|
24
|
+
from docling.datamodel.base_models import (
|
|
25
|
+
ConversionStatus,
|
|
26
|
+
ErrorItem,
|
|
27
|
+
FormatToExtensions,
|
|
28
|
+
InputFormat,
|
|
29
|
+
)
|
|
25
30
|
from docling.datamodel.document import InputDocument
|
|
26
31
|
from docling_core.types.io import DocumentStream
|
|
27
32
|
except ImportError:
|
|
@@ -33,42 +38,78 @@ logger = logging.getLogger(__name__)
|
|
|
33
38
|
base_config = merge_configs(icij_config(), no_enum_values_config())
|
|
34
39
|
|
|
35
40
|
|
|
41
|
+
@cache
def _ext_to_docling_input_format() -> dict:
    """Map each docling-supported ``SupportedExt`` to its docling input format.

    Extensions listed in ``FormatToExtensions`` that have no ``SupportedExt``
    member, or that the docling pipeline config does not support, are left out.
    When an extension is listed under several formats, the last one wins. The
    mapping is computed once and cached.
    """
    # Imported here rather than at module level (PLC0415 is silenced for it).
    from .docling_ import DoclingPipelineConfig  # noqa: PLC0415

    docling_exts = DoclingPipelineConfig.supported_exts()
    ext_to_format = {}
    for input_format, raw_exts in FormatToExtensions.items():
        for raw_ext in raw_exts:
            try:
                known_ext = SupportedExt(f".{raw_ext.lower()}")
            except ValueError:
                # No SupportedExt member for this extension.
                continue
            if known_ext in docling_exts:
                ext_to_format[known_ext] = input_format
    return ext_to_format
|
|
56
|
+
|
|
57
|
+
|
|
36
58
|
class BaseModel(_BaseModel):
|
|
37
59
|
model_config = base_config
|
|
38
60
|
|
|
39
61
|
|
|
40
62
|
# File extensions (lowercase, with the leading dot) known to the package.
class SupportedExt(StrEnum):
    # Text and markup
    ADOC = ".adoc"
    ASC = ".asc"
    ASCIIDOC = ".asciidoc"
    BMP = ".bmp"
    CSV = ".csv"
    # Word processing
    DOC = ".doc"
    DOCX = ".docx"
    DOTX = ".dotx"
    DOTM = ".dotm"
    DOCM = ".docm"
    EPUB = ".epub"
    GIF = ".gif"
    # NOTE(review): member name looks like a typo for HTML; renaming it would
    # break existing references, so it is kept as-is.
    HTLM = ".html"
    HTM = ".htm"
    JPEG = ".jpeg"
    JPG = ".jpg"
    JSON = ".json"
    LATEX = ".latex"
    MD = ".md"
    NXML = ".nxml"
    ODP = ".odp"
    ODS = ".ods"
    ODT = ".odt"
    PDF = ".pdf"
    PNG = ".png"
    # Presentations
    PPSX = ".ppsx"
    PPT = ".ppt"
    PPTM = ".pptm"
    PPSM = ".ppsm"
    POTX = ".potx"
    POTM = ".potm"
    PPTX = ".pptx"
    QMD = ".qmd"
    RMD = ".rmd"
    TEX = ".tex"
    TIF = ".tif"
    TIFF = ".tiff"
    TXT = ".txt"
    TEXT = ".text"
    WEBP = ".webp"
    XBRL = ".xbrl"
    XHTML = ".xhtml"
    # Spreadsheets
    XLS = ".xls"
    XLSM = ".xlsm"
    XLSX = ".xlsx"
    XLTX = ".xltx"
    XML = ".xml"

    def to_docling(self) -> InputFormat:
        # Looks the extension up in the cached extension -> docling format
        # mapping; an extension absent from that mapping raises KeyError.
        ext_to_format = _ext_to_docling_input_format()
        return ext_to_format[self]
|
|
72
113
|
|
|
73
114
|
|
|
74
115
|
class OutputFormat(StrEnum):
|
|
@@ -868,7 +868,6 @@ wheels = [
|
|
|
868
868
|
|
|
869
869
|
[[package]]
|
|
870
870
|
name = "extract-python"
|
|
871
|
-
version = "0.1.0"
|
|
872
871
|
source = { editable = "." }
|
|
873
872
|
dependencies = [
|
|
874
873
|
{ name = "icij-common" },
|
|
@@ -898,11 +897,12 @@ mineru = [
|
|
|
898
897
|
dev = [
|
|
899
898
|
{ name = "pytest" },
|
|
900
899
|
{ name = "pytest-asyncio" },
|
|
900
|
+
{ name = "ruff" },
|
|
901
901
|
]
|
|
902
902
|
|
|
903
903
|
[package.metadata]
|
|
904
904
|
requires-dist = [
|
|
905
|
-
{ name = "docling-slim", extras = ["
|
|
905
|
+
{ name = "docling-slim", extras = ["feat-ocr-easyocr", "feat-ocr-mac", "feat-ocr-tesserocr", "standard"], marker = "extra == 'docling'", specifier = "~=2.96" },
|
|
906
906
|
{ name = "html2image", marker = "extra == 'benches'", specifier = "~=2.0.7" },
|
|
907
907
|
{ name = "icij-common", specifier = "~=0.8.2" },
|
|
908
908
|
{ name = "markdown2", marker = "extra == 'benches'", specifier = ">=2.5.4" },
|
|
@@ -920,6 +920,7 @@ provides-extras = ["benches", "docling", "marker", "mineru"]
|
|
|
920
920
|
dev = [
|
|
921
921
|
{ name = "pytest", specifier = "~=8.3.5" },
|
|
922
922
|
{ name = "pytest-asyncio", specifier = "~=0.25.3" },
|
|
923
|
+
{ name = "ruff", specifier = "==0.15.2" },
|
|
923
924
|
]
|
|
924
925
|
|
|
925
926
|
[[package]]
|
|
@@ -4334,6 +4335,31 @@ wheels = [
|
|
|
4334
4335
|
{ url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" },
|
|
4335
4336
|
]
|
|
4336
4337
|
|
|
4338
|
+
[[package]]
|
|
4339
|
+
name = "ruff"
|
|
4340
|
+
version = "0.15.2"
|
|
4341
|
+
source = { registry = "https://pypi.org/simple" }
|
|
4342
|
+
sdist = { url = "https://files.pythonhosted.org/packages/06/04/eab13a954e763b0606f460443fcbf6bb5a0faf06890ea3754ff16523dce5/ruff-0.15.2.tar.gz", hash = "sha256:14b965afee0969e68bb871eba625343b8673375f457af4abe98553e8bbb98342", size = 4558148, upload-time = "2026-02-19T22:32:20.271Z" }
|
|
4343
|
+
wheels = [
|
|
4344
|
+
{ url = "https://files.pythonhosted.org/packages/2f/70/3a4dc6d09b13cb3e695f28307e5d889b2e1a66b7af9c5e257e796695b0e6/ruff-0.15.2-py3-none-linux_armv6l.whl", hash = "sha256:120691a6fdae2f16d65435648160f5b81a9625288f75544dc40637436b5d3c0d", size = 10430565, upload-time = "2026-02-19T22:32:41.824Z" },
|
|
4345
|
+
{ url = "https://files.pythonhosted.org/packages/71/0b/bb8457b56185ece1305c666dc895832946d24055be90692381c31d57466d/ruff-0.15.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a89056d831256099658b6bba4037ac6dd06f49d194199215befe2bb10457ea5e", size = 10820354, upload-time = "2026-02-19T22:32:07.366Z" },
|
|
4346
|
+
{ url = "https://files.pythonhosted.org/packages/2d/c1/e0532d7f9c9e0b14c46f61b14afd563298b8b83f337b6789ddd987e46121/ruff-0.15.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e36dee3a64be0ebd23c86ffa3aa3fd3ac9a712ff295e192243f814a830b6bd87", size = 10170767, upload-time = "2026-02-19T22:32:13.188Z" },
|
|
4347
|
+
{ url = "https://files.pythonhosted.org/packages/47/e8/da1aa341d3af017a21c7a62fb5ec31d4e7ad0a93ab80e3a508316efbcb23/ruff-0.15.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9fb47b6d9764677f8c0a193c0943ce9a05d6763523f132325af8a858eadc2b9", size = 10529591, upload-time = "2026-02-19T22:32:02.547Z" },
|
|
4348
|
+
{ url = "https://files.pythonhosted.org/packages/93/74/184fbf38e9f3510231fbc5e437e808f0b48c42d1df9434b208821efcd8d6/ruff-0.15.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f376990f9d0d6442ea9014b19621d8f2aaf2b8e39fdbfc79220b7f0c596c9b80", size = 10260771, upload-time = "2026-02-19T22:32:36.938Z" },
|
|
4349
|
+
{ url = "https://files.pythonhosted.org/packages/05/ac/605c20b8e059a0bc4b42360414baa4892ff278cec1c91fff4be0dceedefd/ruff-0.15.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dcc987551952d73cbf5c88d9fdee815618d497e4df86cd4c4824cc59d5dd75f", size = 11045791, upload-time = "2026-02-19T22:32:31.642Z" },
|
|
4350
|
+
{ url = "https://files.pythonhosted.org/packages/fd/52/db6e419908f45a894924d410ac77d64bdd98ff86901d833364251bd08e22/ruff-0.15.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:42a47fd785cbe8c01b9ff45031af875d101b040ad8f4de7bbb716487c74c9a77", size = 11879271, upload-time = "2026-02-19T22:32:29.305Z" },
|
|
4351
|
+
{ url = "https://files.pythonhosted.org/packages/3e/d8/7992b18f2008bdc9231d0f10b16df7dda964dbf639e2b8b4c1b4e91b83af/ruff-0.15.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbe9f49354866e575b4c6943856989f966421870e85cd2ac94dccb0a9dcb2fea", size = 11303707, upload-time = "2026-02-19T22:32:22.492Z" },
|
|
4352
|
+
{ url = "https://files.pythonhosted.org/packages/d7/02/849b46184bcfdd4b64cde61752cc9a146c54759ed036edd11857e9b8443b/ruff-0.15.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7a672c82b5f9887576087d97be5ce439f04bbaf548ee987b92d3a7dede41d3a", size = 11149151, upload-time = "2026-02-19T22:32:44.234Z" },
|
|
4353
|
+
{ url = "https://files.pythonhosted.org/packages/70/04/f5284e388bab60d1d3b99614a5a9aeb03e0f333847e2429bebd2aaa1feec/ruff-0.15.2-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ecc64f46f7019e2bcc3cdc05d4a7da958b629a5ab7033195e11a438403d956", size = 11091132, upload-time = "2026-02-19T22:32:24.691Z" },
|
|
4354
|
+
{ url = "https://files.pythonhosted.org/packages/fa/ae/88d844a21110e14d92cf73d57363fab59b727ebeabe78009b9ccb23500af/ruff-0.15.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:8dcf243b15b561c655c1ef2f2b0050e5d50db37fe90115507f6ff37d865dc8b4", size = 10504717, upload-time = "2026-02-19T22:32:26.75Z" },
|
|
4355
|
+
{ url = "https://files.pythonhosted.org/packages/64/27/867076a6ada7f2b9c8292884ab44d08fd2ba71bd2b5364d4136f3cd537e1/ruff-0.15.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dab6941c862c05739774677c6273166d2510d254dac0695c0e3f5efa1b5585de", size = 10263122, upload-time = "2026-02-19T22:32:10.036Z" },
|
|
4356
|
+
{ url = "https://files.pythonhosted.org/packages/e7/ef/faf9321d550f8ebf0c6373696e70d1758e20ccdc3951ad7af00c0956be7c/ruff-0.15.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1b9164f57fc36058e9a6806eb92af185b0697c9fe4c7c52caa431c6554521e5c", size = 10735295, upload-time = "2026-02-19T22:32:39.227Z" },
|
|
4357
|
+
{ url = "https://files.pythonhosted.org/packages/2f/55/e8089fec62e050ba84d71b70e7834b97709ca9b7aba10c1a0b196e493f97/ruff-0.15.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:80d24fcae24d42659db7e335b9e1531697a7102c19185b8dc4a028b952865fd8", size = 11241641, upload-time = "2026-02-19T22:32:34.617Z" },
|
|
4358
|
+
{ url = "https://files.pythonhosted.org/packages/23/01/1c30526460f4d23222d0fabd5888868262fd0e2b71a00570ca26483cd993/ruff-0.15.2-py3-none-win32.whl", hash = "sha256:fd5ff9e5f519a7e1bd99cbe8daa324010a74f5e2ebc97c6242c08f26f3714f6f", size = 10507885, upload-time = "2026-02-19T22:32:15.635Z" },
|
|
4359
|
+
{ url = "https://files.pythonhosted.org/packages/5c/10/3d18e3bbdf8fc50bbb4ac3cc45970aa5a9753c5cb51bf9ed9a3cd8b79fa3/ruff-0.15.2-py3-none-win_amd64.whl", hash = "sha256:d20014e3dfa400f3ff84830dfb5755ece2de45ab62ecea4af6b7262d0fb4f7c5", size = 11623725, upload-time = "2026-02-19T22:32:04.947Z" },
|
|
4360
|
+
{ url = "https://files.pythonhosted.org/packages/6d/78/097c0798b1dab9f8affe73da9642bb4500e098cb27fd8dc9724816ac747b/ruff-0.15.2-py3-none-win_arm64.whl", hash = "sha256:cabddc5822acdc8f7b5527b36ceac55cc51eec7b1946e60181de8fe83ca8876e", size = 10941649, upload-time = "2026-02-19T22:32:18.108Z" },
|
|
4361
|
+
]
|
|
4362
|
+
|
|
4337
4363
|
[[package]]
|
|
4338
4364
|
name = "safetensors"
|
|
4339
4365
|
version = "0.6.2"
|
|
@@ -1,258 +0,0 @@
|
|
|
1
|
-
import shutil
|
|
2
|
-
import tempfile
|
|
3
|
-
from collections.abc import AsyncGenerator, Iterable, Iterator
|
|
4
|
-
from functools import cache
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Any, ClassVar, Literal, TypeVar
|
|
7
|
-
|
|
8
|
-
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
9
|
-
from docling.datamodel.base_models import InputFormat
|
|
10
|
-
from docling.datamodel.document import ConversionResult
|
|
11
|
-
from docling.datamodel.pipeline_options import (
|
|
12
|
-
EasyOcrOptions,
|
|
13
|
-
PdfPipelineOptions,
|
|
14
|
-
PipelineOptions,
|
|
15
|
-
VlmPipelineOptions,
|
|
16
|
-
)
|
|
17
|
-
from docling.document_converter import DocumentConverter, FormatOption
|
|
18
|
-
from docling.models.factories import get_ocr_factory
|
|
19
|
-
from docling.pipeline.base_pipeline import BasePipeline
|
|
20
|
-
from docling_core.types.doc import ImageRefMode
|
|
21
|
-
from docling_core.types.io import DocumentStream
|
|
22
|
-
from icij_common.registrable import FromConfig
|
|
23
|
-
from pydantic import Field, model_validator
|
|
24
|
-
|
|
25
|
-
from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
|
|
26
|
-
from .objects import (
|
|
27
|
-
BaseModel,
|
|
28
|
-
Error,
|
|
29
|
-
InputDoc,
|
|
30
|
-
MarkdownDoc,
|
|
31
|
-
OutputFormat,
|
|
32
|
-
PageIndexes,
|
|
33
|
-
Result,
|
|
34
|
-
Status,
|
|
35
|
-
SupportedExt,
|
|
36
|
-
)
|
|
37
|
-
from .pipeline import Pipeline, PipelineConfig, PipelineType
|
|
38
|
-
from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
|
|
39
|
-
|
|
40
|
-
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
class _PdfPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options that accept ``ocr_options`` as a plain dict.

    When ``ocr_options`` is given as a dict, it is turned into a concrete
    OCR options object through docling's OCR factory, using the dict's
    ``"kind"`` entry to select which options class to build.
    """

    # Declared frozen with a default of True (see the Field arguments).
    generate_picture_images: bool = Field(default=True, frozen=True)

    @model_validator(mode="before")
    @classmethod
    def validate_ocr_options(cls, data: Any) -> Any:
        """Replace a dict-valued ``ocr_options`` with a factory-built object.

        Args:
            data: Raw validation input. Anything that is not a dict, or whose
                ``"ocr_options"`` entry is not a dict, is returned untouched.

        Returns:
            ``data`` itself when no conversion is needed, otherwise a shallow
            copy of ``data`` whose ``"ocr_options"`` entry is the object
            created by the OCR factory.

        Raises:
            KeyError: If the ``ocr_options`` dict has no ``"kind"`` entry.
        """
        if not isinstance(data, dict):
            return data
        ocr_options = data.get("ocr_options")
        if not isinstance(ocr_options, dict):
            return data
        # Work on a copy: popping "kind" from the caller's dict would make a
        # second validation of the same payload fail.
        ocr_options = dict(ocr_options)
        allow_external_plugins = ocr_options.get("allow_external_plugins", False)
        ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
        kind = ocr_options.pop("kind")
        built_options = ocr_factory.create_options(kind=kind, **ocr_options)
        # Return a new mapping instead of overwriting the caller's entry.
        return {**data, "ocr_options": built_options}
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
# Each entry pairs a pipeline key ("pdf" or "vlm") with the options object
# used for that kind of pipeline.
_PdfOptionsEntry = tuple[Literal["pdf"], _PdfPipelineOptions]
_VlmOptionsEntry = tuple[Literal["vlm"], VlmPipelineOptions]
OptionsByPipeline = list[_PdfOptionsEntry | _VlmOptionsEntry]
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def _default_pipeline_options() -> OptionsByPipeline:
    """Return the default ``(key, options)`` pairs for the PDF and VLM pipelines.

    The PDF entry uses EasyOCR options; the VLM entry uses the stock
    ``VlmPipelineOptions`` defaults.
    """
    pdf_defaults = _PdfPipelineOptions(ocr_options=EasyOcrOptions())
    vlm_defaults = VlmPipelineOptions()
    return [("pdf", pdf_defaults), ("vlm", vlm_defaults)]
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
class DoclingFormatOption(BaseModel):
    """Description of a docling format option that refers to classes by name.

    Both attributes hold class *names*; they are resolved to actual classes
    when ``to_docling`` is called.
    """

    # Name of a ``BasePipeline`` subclass.
    pipeline_cls: str
    # Name of an ``AbstractDocumentBackend`` subclass.
    backend_cls: str

    def to_docling(
        self, pipeline_options: dict[Literal["pdf", "vlm"], PipelineOptions]
    ) -> FormatOption:
        """Build the docling ``FormatOption`` described by this object.

        Args:
            pipeline_options: Options keyed by pipeline kind. The entry is
                chosen by looking for "vlm", then "pdf", in the lower-cased
                pipeline class name; a missing entry yields ``None`` options.

        Returns:
            A ``FormatOption`` wired with the resolved pipeline class, the
            validated options (or ``None``) and the resolved backend class.

        Raises:
            ValueError: If either class name cannot be resolved, or if the
                pipeline class name contains neither "vlm" nor "pdf".
        """
        pipeline_cls = _find_subcls(BasePipeline, self.pipeline_cls)
        backend_cls = _find_subcls(AbstractDocumentBackend, self.backend_cls)

        lowered_name = self.pipeline_cls.lower()
        if "vlm" in lowered_name:
            options_key, options_model = "vlm", VlmPipelineOptions
        elif "pdf" in lowered_name:
            options_key, options_model = "pdf", _PdfPipelineOptions
        else:
            raise ValueError(
                f"invalid pipeline_cls: {pipeline_cls}, expected a VLM or PDF pipeline"
            )

        selected_options = pipeline_options.get(options_key)
        if selected_options is not None:
            selected_options = options_model.model_validate(selected_options)

        return FormatOption(
            pipeline_cls=pipeline_cls,
            pipeline_options=selected_options,
            backend=backend_cls,
        )
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
@cache
def _default_format_options() -> dict[InputFormat, DoclingFormatOption]:
    """Return docling's own default format options, restricted to PDF.

    The defaults are read from a freshly built ``DocumentConverter`` and
    stored as class names. The result is cached, so every call returns the
    same dict object.
    """
    supported_fmt = {InputFormat.PDF}
    docling_defaults = DocumentConverter().format_to_options
    defaults = {}
    for fmt, opt in docling_defaults.items():
        if fmt not in supported_fmt:
            continue
        defaults[fmt] = DoclingFormatOption(
            pipeline_cls=opt.pipeline_cls.__name__,
            backend_cls=opt.backend.__name__,
        )
    return defaults
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
# Generic type variable used by `_find_subcls` to tie the returned class to the
# base class it was asked to search.
T = TypeVar("T")
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def _find_subcls(cls: type[T], name: str) -> type[T]:
    """Return the subclass of ``cls`` whose ``__name__`` equals ``name``.

    Candidates come from ``all_subclasses(cls)``; the first match wins.

    Raises:
        ValueError: If no candidate carries that name.
    """
    found = next(
        (candidate for candidate in all_subclasses(cls) if candidate.__name__ == name),
        None,
    )
    if found is None:
        raise ValueError(f"unknown {cls.__name__} subclass {name}")
    return found
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
@PipelineConfig.register()
class DoclingPipelineConfig(PipelineConfig):
    """Configuration of the docling extraction pipeline."""

    pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
    # A ClassVar is not processed by pydantic as a field, so wrapping the value
    # in Field(...) would leave a FieldInfo object on the class instead of the
    # group name. Assign the constant directly.
    task_group: ClassVar[str] = CPU_GROUP

    # (pipeline key, options) pairs; see `_default_pipeline_options`.
    pipeline_options: OptionsByPipeline = Field(
        default_factory=_default_pipeline_options
    )
    # Pipeline/backend class names to use for each docling input format.
    format_options: dict[InputFormat, DoclingFormatOption] = Field(
        default_factory=_default_format_options
    )

    def to_format_options(self) -> dict[InputFormat, FormatOption]:
        """Convert this config into docling ``FormatOption`` objects.

        Returns:
            One ``FormatOption`` per configured input format, each built with
            the pipeline options registered under the matching key.
        """
        # The list of (key, options) pairs becomes a lookup table; if a key is
        # repeated, the last pair wins.
        pipeline_options = dict(self.pipeline_options)
        return {
            InputFormat(f): opt.to_docling(pipeline_options)
            for f, opt in self.format_options.items()
        }

    @classmethod
    @cache
    def supported_formats(cls) -> set[SupportedExt]:
        """Return the file extensions this pipeline accepts.

        The result is cached, so the same set object is returned on every
        call; callers should treat it as read-only.
        """
        # Subset of https://docling-project.github.io/docling/usage/supported_formats/
        return {
            SupportedExt.ADOC,
            SupportedExt.ASCIIDOC,
            SupportedExt.BMP,
            SupportedExt.CSV,
            SupportedExt.DOCX,
            # NOTE(review): "HTLM" looks like a misspelling of HTML, but the
            # member is declared outside this file, so it is kept as is.
            SupportedExt.HTLM,
            SupportedExt.JPG,
            SupportedExt.MD,
            SupportedExt.PDF,
            SupportedExt.PNG,
            SupportedExt.PPTX,
            SupportedExt.TEX,
            SupportedExt.TIFF,
            SupportedExt.TXT,
            SupportedExt.WEBP,
            SupportedExt.XHTML,
            SupportedExt.XLSX,
        }
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
# Format options of a default-constructed config, computed once at import time.
# `DoclingPipeline.__init__` falls back to them when it receives None.
DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
@Pipeline.register(PipelineType.DOCLING)
class DoclingPipeline(Pipeline):
    """Extraction pipeline backed by a docling ``DocumentConverter``."""

    def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
        """Create the underlying converter.

        Args:
            format_options: Docling format options; ``None`` selects
                ``DEFAULT_FORMAT_OPTIONS``.
        """
        effective_options = (
            DEFAULT_FORMAT_OPTIONS if format_options is None else format_options
        )
        self._converter = DocumentConverter(format_options=effective_options)

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Convert ``docs`` and yield one ``Result`` per input document.

        Args:
            docs: Documents to convert.
            output_format: Format the converted content is written in.
            output_path: Directory receiving the converted output.

        Yields:
            The result built by ``_to_result`` for each input, in input order.
        """
        # NOTE(review): map_and_preserve presumably hands back the inputs
        # alongside their docling counterparts - confirm against .utils.
        input_docs, sources = map_and_preserve(_to_docling, docs)
        # Conversion failures are reported through each result rather than raised.
        conversions = self._converter.convert_all(sources, raises_on_error=False)
        for input_doc, conversion in zip(input_docs, conversions, strict=True):
            yield _to_result(
                conversion, input_doc, output_format, output_path=output_path
            )

    @classmethod
    def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
        """Instantiate the pipeline from its configuration object."""
        format_options = config.to_format_options()
        return cls(format_options)
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
    """Lazily yield the docling representation of each input document."""
    yield from (doc.to_docling() for doc in docs)
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
def _to_result(
    res: ConversionResult,
    input_document: InputDoc,
    output_format: OutputFormat,
    output_path: Path,
    **kwargs,
) -> Result:
    """Turn a docling conversion result into a pipeline ``Result``.

    Args:
        res: Conversion result produced by docling.
        input_document: The document the conversion was run on.
        output_format: Requested output format; only Markdown is handled.
        output_path: Output directory, created if it does not exist.
        **kwargs: Forwarded to the output writer.

    Returns:
        A ``Result`` carrying the input (without its content), the status,
        the conversion errors and, when the status allows it, the output.

    Raises:
        NotImplementedError: If the status allows conversion but
            ``output_format`` is not Markdown.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    status = Status.from_docling(res.status)

    output = None
    if status.allows_conversion:
        if output_format == OutputFormat.MARKDOWN:
            output = _to_markdown_doc(res, output_path, **kwargs)
        else:
            raise NotImplementedError(f"unsupported output format {output_format}")

    errors = [Error.from_docling(docling_error) for docling_error in res.errors]
    stripped_input = input_document.without_content()
    return Result(input=stripped_input, status=status, errors=errors, output=output)
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
def _to_markdown_doc(
    res: ConversionResult,
    output_path: Path,
    page_sep: str = DEFAULT_MD_PAGE_SEP,
    **kwargs,
) -> MarkdownDoc:
    """Write a conversion result as a single Markdown file plus its artifacts.

    Pages are exported one at a time and concatenated, with ``page_sep``
    between consecutive pages, so that the end offset of every page in the
    final text is known. Everything is produced in a temporary directory
    which is then moved to ``output_path / <artifacts dirname>``.

    Args:
        res: Docling conversion result to export.
        output_path: Directory under which the Markdown directory is created.
        page_sep: Text inserted after every page except the last one.
        **kwargs: Forwarded to ``save_as_markdown``.

    Returns:
        A ``MarkdownDoc`` whose path is the (relative) Markdown directory name
        and whose pages are derived from the cumulative page end offsets.

    Raises:
        FileExistsError: If the destination directory already exists.
    """
    # TODO: Should we add a hash to avoid collisions between files with the
    #  same name nested in the tree structure?
    md_dir_name = path_to_artifacts_dirname(res.input.file)
    md_dir = output_path / md_dir_name
    if md_dir.exists():
        raise FileExistsError(f"directory {md_dir} already exists")
    # Name the file after the directory to avoid clashes between input files
    # sharing a name once flattened at the top level.
    md_filename = md_dir_name + OutputFormat.MARKDOWN

    n_pages = len(res.pages)
    # When the result reports no pages, export the whole document as a single
    # chunk (page_no=None) instead of failing on a per-page file that was
    # never written.
    page_numbers = list(range(1, n_pages + 1)) or [None]
    last_i = len(page_numbers) - 1
    total_length = 0
    end_indices = []

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
        tmp_dir = Path(td)
        page_path = Path("page.md")
        # We do a chdir to bypass a Docling bug which only allows to maintain
        # relative image refs when saving the markdown to a relative path.
        # Explicit UTF-8 keeps the offsets and content independent of the
        # platform's default encoding.
        with (tmp_dir / md_filename).open("w", encoding="utf-8") as f, chdir(tmp_dir):
            for page_i, page_no in enumerate(page_numbers):
                res.document.save_as_markdown(
                    page_path,
                    page_no=page_no,
                    image_mode=ImageRefMode.REFERENCED,
                    artifacts_dir=Path(ARTIFACTS),
                    **kwargs,
                )
                content = page_path.read_text(encoding="utf-8")
                if page_i > 0:
                    content += "\n"
                if page_i < last_i:
                    content += page_sep
                total_length += len(content)
                end_indices.append(total_length)
                f.write(content)
            # The scratch per-page file must not end up in the output.
            page_path.unlink()
        # The directory is moved away before the context manager cleans up,
        # hence ignore_cleanup_errors=True above.
        shutil.move(tmp_dir, md_dir)
    pages = PageIndexes.from_page_end_indices(end_indices)
    return MarkdownDoc(path=Path(md_dir_name), pages=pages)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|