docling 2.1.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +1 -0
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +4 -4
- docling/backend/docling_parse_v2_backend.py +12 -4
- docling/backend/html_backend.py +61 -57
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +62 -39
- docling/backend/msword_backend.py +12 -25
- docling/backend/pypdfium2_backend.py +1 -1
- docling/cli/main.py +38 -8
- docling/datamodel/base_models.py +16 -10
- docling/datamodel/document.py +36 -6
- docling/datamodel/pipeline_options.py +3 -3
- docling/datamodel/settings.py +15 -1
- docling/document_converter.py +38 -12
- docling/models/base_model.py +4 -1
- docling/models/base_ocr_model.py +21 -4
- docling/models/ds_glm_model.py +27 -11
- docling/models/easyocr_model.py +49 -39
- docling/models/layout_model.py +87 -61
- docling/models/page_assemble_model.py +102 -100
- docling/models/page_preprocessing_model.py +25 -7
- docling/models/table_structure_model.py +125 -90
- docling/models/tesseract_ocr_cli_model.py +62 -52
- docling/models/tesseract_ocr_model.py +76 -52
- docling/pipeline/base_pipeline.py +68 -69
- docling/pipeline/simple_pipeline.py +8 -11
- docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling/utils/profiling.py +62 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/METADATA +27 -22
- docling-2.4.1.dist-info/RECORD +45 -0
- docling-2.1.0.dist-info/RECORD +0 -42
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -5,12 +5,15 @@ import time
|
|
5
5
|
import warnings
|
6
6
|
from enum import Enum
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Annotated, Dict, Iterable, List, Optional
|
8
|
+
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
9
9
|
|
10
10
|
import typer
|
11
11
|
from docling_core.utils.file import resolve_file_source
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
|
+
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
15
|
+
from docling.backend.pdf_backend import PdfDocumentBackend
|
16
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
14
17
|
from docling.datamodel.base_models import (
|
15
18
|
ConversionStatus,
|
16
19
|
FormatToExtensions,
|
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
|
|
22
25
|
EasyOcrOptions,
|
23
26
|
OcrOptions,
|
24
27
|
PdfPipelineOptions,
|
28
|
+
TableFormerMode,
|
25
29
|
TesseractCliOcrOptions,
|
26
30
|
TesseractOcrOptions,
|
27
31
|
)
|
@@ -58,9 +62,10 @@ def version_callback(value: bool):
|
|
58
62
|
|
59
63
|
|
60
64
|
# Define an enum for the backend options
|
61
|
-
class
|
65
|
+
class PdfBackend(str, Enum):
|
62
66
|
PYPDFIUM2 = "pypdfium2"
|
63
|
-
|
67
|
+
DLPARSE_V1 = "dlparse_v1"
|
68
|
+
DLPARSE_V2 = "dlparse_v2"
|
64
69
|
|
65
70
|
|
66
71
|
# Define an enum for the ocr engines
|
@@ -90,28 +95,28 @@ def export_documents(
|
|
90
95
|
# Export Deep Search document JSON format:
|
91
96
|
if export_json:
|
92
97
|
fname = output_dir / f"{doc_filename}.json"
|
93
|
-
with fname.open("w") as fp:
|
98
|
+
with fname.open("w", encoding="utf8") as fp:
|
94
99
|
_log.info(f"writing JSON output to {fname}")
|
95
100
|
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
96
101
|
|
97
102
|
# Export Text format:
|
98
103
|
if export_txt:
|
99
104
|
fname = output_dir / f"{doc_filename}.txt"
|
100
|
-
with fname.open("w") as fp:
|
105
|
+
with fname.open("w", encoding="utf8") as fp:
|
101
106
|
_log.info(f"writing Text output to {fname}")
|
102
107
|
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
103
108
|
|
104
109
|
# Export Markdown format:
|
105
110
|
if export_md:
|
106
111
|
fname = output_dir / f"{doc_filename}.md"
|
107
|
-
with fname.open("w") as fp:
|
112
|
+
with fname.open("w", encoding="utf8") as fp:
|
108
113
|
_log.info(f"writing Markdown output to {fname}")
|
109
114
|
fp.write(conv_res.document.export_to_markdown())
|
110
115
|
|
111
116
|
# Export Document Tags format:
|
112
117
|
if export_doctags:
|
113
118
|
fname = output_dir / f"{doc_filename}.doctags"
|
114
|
-
with fname.open("w") as fp:
|
119
|
+
with fname.open("w", encoding="utf8") as fp:
|
115
120
|
_log.info(f"writing Doc Tags output to {fname}")
|
116
121
|
fp.write(conv_res.document.export_to_document_tokens())
|
117
122
|
|
@@ -151,6 +156,17 @@ def convert(
|
|
151
156
|
ocr_engine: Annotated[
|
152
157
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
153
158
|
] = OcrEngine.EASYOCR,
|
159
|
+
pdf_backend: Annotated[
|
160
|
+
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
161
|
+
] = PdfBackend.DLPARSE_V1,
|
162
|
+
table_mode: Annotated[
|
163
|
+
TableFormerMode,
|
164
|
+
typer.Option(..., help="The mode to use in the table structure model."),
|
165
|
+
] = TableFormerMode.FAST,
|
166
|
+
artifacts_path: Annotated[
|
167
|
+
Optional[Path],
|
168
|
+
typer.Option(..., help="If provided, the location of the model artifacts."),
|
169
|
+
] = None,
|
154
170
|
abort_on_error: Annotated[
|
155
171
|
bool,
|
156
172
|
typer.Option(
|
@@ -217,11 +233,25 @@ def convert(
|
|
217
233
|
do_table_structure=True,
|
218
234
|
)
|
219
235
|
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
236
|
+
pipeline_options.table_structure_options.mode = table_mode
|
237
|
+
|
238
|
+
if artifacts_path is not None:
|
239
|
+
pipeline_options.artifacts_path = artifacts_path
|
240
|
+
|
241
|
+
match pdf_backend:
|
242
|
+
case PdfBackend.DLPARSE_V1:
|
243
|
+
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
244
|
+
case PdfBackend.DLPARSE_V2:
|
245
|
+
backend = DoclingParseV2DocumentBackend
|
246
|
+
case PdfBackend.PYPDFIUM2:
|
247
|
+
backend = PyPdfiumDocumentBackend
|
248
|
+
case _:
|
249
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
220
250
|
|
221
251
|
format_options: Dict[InputFormat, FormatOption] = {
|
222
252
|
InputFormat.PDF: PdfFormatOption(
|
223
253
|
pipeline_options=pipeline_options,
|
224
|
-
backend=
|
254
|
+
backend=backend, # pdf_backend
|
225
255
|
)
|
226
256
|
}
|
227
257
|
doc_converter = DocumentConverter(
|
docling/datamodel/base_models.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from enum import Enum, auto
|
2
2
|
from io import BytesIO
|
3
|
-
from typing import TYPE_CHECKING, Dict, List, Optional,
|
3
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
4
|
|
5
5
|
from docling_core.types.doc import (
|
6
6
|
BoundingBox,
|
@@ -30,6 +30,8 @@ class InputFormat(str, Enum):
|
|
30
30
|
HTML = "html"
|
31
31
|
IMAGE = "image"
|
32
32
|
PDF = "pdf"
|
33
|
+
ASCIIDOC = "asciidoc"
|
34
|
+
MD = "md"
|
33
35
|
|
34
36
|
|
35
37
|
class OutputFormat(str, Enum):
|
@@ -43,29 +45,33 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
43
45
|
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
44
46
|
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
45
47
|
InputFormat.PDF: ["pdf"],
|
48
|
+
InputFormat.MD: ["md"],
|
46
49
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
47
50
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
51
|
+
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
48
52
|
}
|
49
53
|
|
50
|
-
FormatToMimeType: Dict[InputFormat,
|
51
|
-
InputFormat.DOCX:
|
54
|
+
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
55
|
+
InputFormat.DOCX: [
|
52
56
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
53
57
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
54
|
-
|
55
|
-
InputFormat.PPTX:
|
58
|
+
],
|
59
|
+
InputFormat.PPTX: [
|
56
60
|
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
57
61
|
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
58
62
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
59
|
-
|
60
|
-
InputFormat.HTML:
|
61
|
-
InputFormat.IMAGE:
|
63
|
+
],
|
64
|
+
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
65
|
+
InputFormat.IMAGE: [
|
62
66
|
"image/png",
|
63
67
|
"image/jpeg",
|
64
68
|
"image/tiff",
|
65
69
|
"image/gif",
|
66
70
|
"image/bmp",
|
67
|
-
|
68
|
-
InputFormat.PDF:
|
71
|
+
],
|
72
|
+
InputFormat.PDF: ["application/pdf"],
|
73
|
+
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
74
|
+
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
69
75
|
}
|
70
76
|
MimeTypeToFormat = {
|
71
77
|
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
docling/datamodel/document.py
CHANGED
@@ -3,7 +3,7 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional,
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
7
7
|
|
8
8
|
import filetype
|
9
9
|
from docling_core.types.doc import (
|
@@ -45,11 +45,14 @@ from docling.datamodel.base_models import (
|
|
45
45
|
ConversionStatus,
|
46
46
|
DocumentStream,
|
47
47
|
ErrorItem,
|
48
|
+
FormatToExtensions,
|
49
|
+
FormatToMimeType,
|
48
50
|
InputFormat,
|
49
51
|
MimeTypeToFormat,
|
50
52
|
Page,
|
51
53
|
)
|
52
54
|
from docling.datamodel.settings import DocumentLimits
|
55
|
+
from docling.utils.profiling import ProfilingItem
|
53
56
|
from docling.utils.utils import create_file_hash, create_hash
|
54
57
|
|
55
58
|
if TYPE_CHECKING:
|
@@ -143,11 +146,13 @@ class InputDocument(BaseModel):
|
|
143
146
|
self.valid = False
|
144
147
|
|
145
148
|
except (FileNotFoundError, OSError) as e:
|
149
|
+
self.valid = False
|
146
150
|
_log.exception(
|
147
151
|
f"File {self.file.name} not found or cannot be opened.", exc_info=e
|
148
152
|
)
|
149
153
|
# raise
|
150
154
|
except RuntimeError as e:
|
155
|
+
self.valid = False
|
151
156
|
_log.exception(
|
152
157
|
f"An unexpected error occurred while opening the document {self.file.name}",
|
153
158
|
exc_info=e,
|
@@ -166,6 +171,8 @@ class InputDocument(BaseModel):
|
|
166
171
|
)
|
167
172
|
|
168
173
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
174
|
+
if not self._backend.is_valid():
|
175
|
+
self.valid = False
|
169
176
|
|
170
177
|
|
171
178
|
class DocumentFormat(str, Enum):
|
@@ -181,6 +188,7 @@ class ConversionResult(BaseModel):
|
|
181
188
|
|
182
189
|
pages: List[Page] = []
|
183
190
|
assembled: AssembledUnit = AssembledUnit()
|
191
|
+
timings: Dict[str, ProfilingItem] = {}
|
184
192
|
|
185
193
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
186
194
|
|
@@ -480,26 +488,48 @@ class _DocumentConversionInput(BaseModel):
|
|
480
488
|
else:
|
481
489
|
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
482
490
|
|
483
|
-
def _guess_format(self, obj):
|
484
|
-
content =
|
491
|
+
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
492
|
+
content = b"" # empty binary blob
|
493
|
+
format = None
|
494
|
+
|
485
495
|
if isinstance(obj, Path):
|
486
496
|
mime = filetype.guess_mime(str(obj))
|
487
497
|
if mime is None:
|
498
|
+
ext = obj.suffix[1:]
|
499
|
+
mime = self._mime_from_extension(ext)
|
500
|
+
if mime is None: # must guess from
|
488
501
|
with obj.open("rb") as f:
|
489
502
|
content = f.read(1024) # Read first 1KB
|
490
503
|
|
491
504
|
elif isinstance(obj, DocumentStream):
|
492
|
-
obj.stream.seek(0)
|
493
505
|
content = obj.stream.read(8192)
|
494
506
|
obj.stream.seek(0)
|
495
507
|
mime = filetype.guess_mime(content)
|
508
|
+
if mime is None:
|
509
|
+
ext = (
|
510
|
+
obj.name.rsplit(".", 1)[-1]
|
511
|
+
if ("." in obj.name and not obj.name.startswith("."))
|
512
|
+
else ""
|
513
|
+
)
|
514
|
+
mime = self._mime_from_extension(ext)
|
496
515
|
|
497
|
-
|
498
|
-
|
516
|
+
mime = mime or self._detect_html_xhtml(content)
|
517
|
+
mime = mime or "text/plain"
|
499
518
|
|
500
519
|
format = MimeTypeToFormat.get(mime)
|
501
520
|
return format
|
502
521
|
|
522
|
+
def _mime_from_extension(self, ext):
|
523
|
+
mime = None
|
524
|
+
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
525
|
+
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
526
|
+
elif ext in FormatToExtensions[InputFormat.HTML]:
|
527
|
+
mime = FormatToMimeType[InputFormat.HTML][0]
|
528
|
+
elif ext in FormatToExtensions[InputFormat.MD]:
|
529
|
+
mime = FormatToMimeType[InputFormat.MD][0]
|
530
|
+
|
531
|
+
return mime
|
532
|
+
|
503
533
|
def _detect_html_xhtml(self, content):
|
504
534
|
content_str = content.decode("ascii", errors="ignore").lower()
|
505
535
|
# Remove XML comments
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from enum import Enum
|
1
|
+
from enum import Enum
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import List, Literal, Optional, Union
|
4
4
|
|
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
6
6
|
|
7
7
|
|
8
8
|
class TableFormerMode(str, Enum):
|
9
|
-
FAST =
|
10
|
-
ACCURATE =
|
9
|
+
FAST = "fast"
|
10
|
+
ACCURATE = "accurate"
|
11
11
|
|
12
12
|
|
13
13
|
class TableStructureOptions(BaseModel):
|
docling/datamodel/settings.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import sys
|
2
|
+
from pathlib import Path
|
2
3
|
|
3
4
|
from pydantic import BaseModel
|
4
5
|
from pydantic_settings import BaseSettings
|
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
|
|
26
27
|
# To force models into single core: export OMP_NUM_THREADS=1
|
27
28
|
|
28
29
|
|
30
|
+
class DebugSettings(BaseModel):
|
31
|
+
visualize_cells: bool = False
|
32
|
+
visualize_ocr: bool = False
|
33
|
+
visualize_layout: bool = False
|
34
|
+
visualize_tables: bool = False
|
35
|
+
|
36
|
+
profile_pipeline_timings: bool = False
|
37
|
+
|
38
|
+
# Path used to output debug information.
|
39
|
+
debug_output_path: str = str(Path.cwd() / "debug")
|
40
|
+
|
41
|
+
|
29
42
|
class AppSettings(BaseSettings):
|
30
43
|
perf: BatchConcurrencySettings
|
44
|
+
debug: DebugSettings
|
31
45
|
|
32
46
|
|
33
|
-
settings = AppSettings(perf=BatchConcurrencySettings())
|
47
|
+
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
docling/document_converter.py
CHANGED
@@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
|
|
8
8
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
9
9
|
|
10
10
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
11
|
+
from docling.backend.asciidoc_backend import AsciiDocBackend
|
11
12
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
12
13
|
from docling.backend.html_backend import HTMLDocumentBackend
|
14
|
+
from docling.backend.md_backend import MarkdownDocumentBackend
|
13
15
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
14
16
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
15
17
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
@@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
|
|
52
54
|
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
53
55
|
|
54
56
|
|
57
|
+
class MarkdownFormatOption(FormatOption):
|
58
|
+
pipeline_cls: Type = SimplePipeline
|
59
|
+
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
60
|
+
|
61
|
+
|
62
|
+
class AsciiDocFormatOption(FormatOption):
|
63
|
+
pipeline_cls: Type = SimplePipeline
|
64
|
+
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
65
|
+
|
66
|
+
|
55
67
|
class HTMLFormatOption(FormatOption):
|
56
68
|
pipeline_cls: Type = SimplePipeline
|
57
69
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
@@ -74,6 +86,12 @@ _format_to_default_options = {
|
|
74
86
|
InputFormat.PPTX: FormatOption(
|
75
87
|
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
76
88
|
),
|
89
|
+
InputFormat.MD: FormatOption(
|
90
|
+
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
91
|
+
),
|
92
|
+
InputFormat.ASCIIDOC: FormatOption(
|
93
|
+
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
94
|
+
),
|
77
95
|
InputFormat.HTML: FormatOption(
|
78
96
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
79
97
|
),
|
@@ -121,6 +139,10 @@ class DocumentConverter:
|
|
121
139
|
|
122
140
|
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
123
141
|
|
142
|
+
def initialize_pipeline(self, format: InputFormat):
|
143
|
+
"""Initialize the conversion pipeline for the selected format."""
|
144
|
+
self._get_pipeline(doc_format=format)
|
145
|
+
|
124
146
|
@validate_call(config=ConfigDict(strict=True))
|
125
147
|
def convert(
|
126
148
|
self,
|
@@ -171,32 +193,43 @@ class DocumentConverter:
|
|
171
193
|
) -> Iterator[ConversionResult]:
|
172
194
|
assert self.format_to_options is not None
|
173
195
|
|
196
|
+
start_time = time.monotonic()
|
197
|
+
|
174
198
|
for input_batch in chunkify(
|
175
199
|
conv_input.docs(self.format_to_options),
|
176
200
|
settings.perf.doc_batch_size, # pass format_options
|
177
201
|
):
|
178
202
|
_log.info(f"Going to convert document batch...")
|
203
|
+
|
179
204
|
# parallel processing only within input_batch
|
180
205
|
# with ThreadPoolExecutor(
|
181
206
|
# max_workers=settings.perf.doc_batch_concurrency
|
182
207
|
# ) as pool:
|
183
208
|
# yield from pool.map(self.process_document, input_batch)
|
184
|
-
|
185
209
|
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
210
|
+
|
186
211
|
for item in map(
|
187
212
|
partial(self._process_document, raises_on_error=raises_on_error),
|
188
213
|
input_batch,
|
189
214
|
):
|
215
|
+
elapsed = time.monotonic() - start_time
|
216
|
+
start_time = time.monotonic()
|
217
|
+
|
190
218
|
if item is not None:
|
219
|
+
_log.info(
|
220
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
221
|
+
)
|
191
222
|
yield item
|
223
|
+
else:
|
224
|
+
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
192
225
|
|
193
|
-
def _get_pipeline(self,
|
226
|
+
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
194
227
|
assert self.format_to_options is not None
|
195
228
|
|
196
|
-
fopt = self.format_to_options.get(
|
229
|
+
fopt = self.format_to_options.get(doc_format)
|
197
230
|
|
198
231
|
if fopt is None:
|
199
|
-
raise RuntimeError(f"Could not get pipeline for
|
232
|
+
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
200
233
|
else:
|
201
234
|
pipeline_class = fopt.pipeline_cls
|
202
235
|
pipeline_options = fopt.pipeline_options
|
@@ -219,22 +252,15 @@ class DocumentConverter:
|
|
219
252
|
assert self.allowed_formats is not None
|
220
253
|
assert in_doc.format in self.allowed_formats
|
221
254
|
|
222
|
-
start_doc_time = time.time()
|
223
|
-
|
224
255
|
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
225
256
|
|
226
|
-
end_doc_time = time.time() - start_doc_time
|
227
|
-
_log.info(
|
228
|
-
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
229
|
-
)
|
230
|
-
|
231
257
|
return conv_res
|
232
258
|
|
233
259
|
def _execute_pipeline(
|
234
260
|
self, in_doc: InputDocument, raises_on_error: bool
|
235
261
|
) -> ConversionResult:
|
236
262
|
if in_doc.valid:
|
237
|
-
pipeline = self._get_pipeline(in_doc)
|
263
|
+
pipeline = self._get_pipeline(in_doc.format)
|
238
264
|
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
239
265
|
if raises_on_error:
|
240
266
|
raise RuntimeError(
|
docling/models/base_model.py
CHANGED
@@ -4,11 +4,14 @@ from typing import Any, Iterable
|
|
4
4
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import Page
|
7
|
+
from docling.datamodel.document import ConversionResult
|
7
8
|
|
8
9
|
|
9
10
|
class BasePageModel(ABC):
|
10
11
|
@abstractmethod
|
11
|
-
def __call__(
|
12
|
+
def __call__(
|
13
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
14
|
+
) -> Iterable[Page]:
|
12
15
|
pass
|
13
16
|
|
14
17
|
|
docling/models/base_ocr_model.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
3
|
from abc import abstractmethod
|
4
|
+
from pathlib import Path
|
4
5
|
from typing import Iterable, List
|
5
6
|
|
6
7
|
import numpy as np
|
@@ -10,12 +11,15 @@ from rtree import index
|
|
10
11
|
from scipy.ndimage import find_objects, label
|
11
12
|
|
12
13
|
from docling.datamodel.base_models import OcrCell, Page
|
14
|
+
from docling.datamodel.document import ConversionResult
|
13
15
|
from docling.datamodel.pipeline_options import OcrOptions
|
16
|
+
from docling.datamodel.settings import settings
|
17
|
+
from docling.models.base_model import BasePageModel
|
14
18
|
|
15
19
|
_log = logging.getLogger(__name__)
|
16
20
|
|
17
21
|
|
18
|
-
class BaseOcrModel:
|
22
|
+
class BaseOcrModel(BasePageModel):
|
19
23
|
def __init__(self, enabled: bool, options: OcrOptions):
|
20
24
|
self.enabled = enabled
|
21
25
|
self.options = options
|
@@ -113,7 +117,7 @@ class BaseOcrModel:
|
|
113
117
|
]
|
114
118
|
return filtered_ocr_cells
|
115
119
|
|
116
|
-
def draw_ocr_rects_and_cells(self, page, ocr_rects):
|
120
|
+
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
117
121
|
image = copy.deepcopy(page.image)
|
118
122
|
draw = ImageDraw.Draw(image, "RGBA")
|
119
123
|
|
@@ -130,8 +134,21 @@ class BaseOcrModel:
|
|
130
134
|
if isinstance(tc, OcrCell):
|
131
135
|
color = "magenta"
|
132
136
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
133
|
-
|
137
|
+
|
138
|
+
if show:
|
139
|
+
image.show()
|
140
|
+
else:
|
141
|
+
out_path: Path = (
|
142
|
+
Path(settings.debug.debug_output_path)
|
143
|
+
/ f"debug_{conv_res.input.file.stem}"
|
144
|
+
)
|
145
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
146
|
+
|
147
|
+
out_file = out_path / f"ocr_page_{page.page_no:05}.png"
|
148
|
+
image.save(str(out_file), format="png")
|
134
149
|
|
135
150
|
@abstractmethod
|
136
|
-
def __call__(
|
151
|
+
def __call__(
|
152
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
153
|
+
) -> Iterable[Page]:
|
137
154
|
pass
|
docling/models/ds_glm_model.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import copy
|
2
2
|
import random
|
3
|
+
from pathlib import Path
|
3
4
|
from typing import List, Union
|
4
5
|
|
5
6
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
|
|
27
28
|
|
28
29
|
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
29
30
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
31
|
+
from docling.datamodel.settings import settings
|
32
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
30
33
|
from docling.utils.utils import create_hash
|
31
34
|
|
32
35
|
|
@@ -226,23 +229,24 @@ class GlmModel:
|
|
226
229
|
return ds_doc
|
227
230
|
|
228
231
|
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
229
|
-
|
230
|
-
|
232
|
+
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
233
|
+
ds_doc = self._to_legacy_document(conv_res)
|
234
|
+
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
231
235
|
|
232
|
-
|
236
|
+
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
233
237
|
|
234
|
-
|
238
|
+
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
235
239
|
|
236
240
|
# DEBUG code:
|
237
|
-
def draw_clusters_and_cells(ds_document, page_no):
|
241
|
+
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
238
242
|
clusters_to_draw = []
|
239
243
|
image = copy.deepcopy(conv_res.pages[page_no].image)
|
240
244
|
for ix, elem in enumerate(ds_document.main_text):
|
241
245
|
if isinstance(elem, BaseText):
|
242
|
-
prov = elem.prov[0]
|
246
|
+
prov = elem.prov[0] # type: ignore
|
243
247
|
elif isinstance(elem, Ref):
|
244
248
|
_, arr, index = elem.ref.split("/")
|
245
|
-
index = int(index)
|
249
|
+
index = int(index) # type: ignore
|
246
250
|
if arr == "tables":
|
247
251
|
prov = ds_document.tables[index].prov[0]
|
248
252
|
elif arr == "figures":
|
@@ -256,7 +260,7 @@ class GlmModel:
|
|
256
260
|
id=ix,
|
257
261
|
label=elem.name,
|
258
262
|
bbox=BoundingBox.from_tuple(
|
259
|
-
coord=prov.bbox,
|
263
|
+
coord=prov.bbox, # type: ignore
|
260
264
|
origin=CoordOrigin.BOTTOMLEFT,
|
261
265
|
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
262
266
|
)
|
@@ -276,9 +280,21 @@ class GlmModel:
|
|
276
280
|
for tc in c.cells: # [:1]:
|
277
281
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
278
282
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
279
|
-
image.show()
|
280
283
|
|
281
|
-
|
282
|
-
|
284
|
+
if show:
|
285
|
+
image.show()
|
286
|
+
else:
|
287
|
+
out_path: Path = (
|
288
|
+
Path(settings.debug.debug_output_path)
|
289
|
+
/ f"debug_{conv_res.input.file.stem}"
|
290
|
+
)
|
291
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
292
|
+
|
293
|
+
out_file = out_path / f"doc_page_{page_no:05}.png"
|
294
|
+
image.save(str(out_file), format="png")
|
295
|
+
|
296
|
+
# for item in ds_doc.page_dimensions:
|
297
|
+
# page_no = item.page
|
298
|
+
# draw_clusters_and_cells(ds_doc, page_no)
|
283
299
|
|
284
300
|
return docling_doc
|