docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +17 -8
- docling/backend/docling_parse_backend.py +42 -26
- docling/backend/pypdfium2_backend.py +33 -11
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +253 -0
- docling/datamodel/base_models.py +39 -27
- docling/datamodel/document.py +115 -17
- docling/datamodel/pipeline_options.py +67 -0
- docling/document_converter.py +65 -44
- docling/models/base_ocr_model.py +4 -4
- docling/models/ds_glm_model.py +11 -7
- docling/models/easyocr_model.py +19 -4
- docling/models/layout_model.py +3 -3
- docling/models/table_structure_model.py +18 -2
- docling/models/tesseract_ocr_cli_model.py +167 -0
- docling/models/tesseract_ocr_model.py +122 -0
- docling/pipeline/base_model_pipeline.py +4 -3
- docling/pipeline/standard_model_pipeline.py +36 -8
- docling/utils/export.py +145 -0
- {docling-1.6.2.dist-info → docling-1.19.0.dist-info}/LICENSE +1 -1
- docling-1.19.0.dist-info/METADATA +380 -0
- docling-1.19.0.dist-info/RECORD +34 -0
- docling-1.19.0.dist-info/entry_points.txt +3 -0
- docling-1.6.2.dist-info/METADATA +0 -192
- docling-1.6.2.dist-info/RECORD +0 -27
- {docling-1.6.2.dist-info → docling-1.19.0.dist-info}/WHEEL +0 -0
docling/datamodel/document.py
CHANGED
@@ -4,14 +4,16 @@ from pathlib import Path, PurePath
|
|
4
4
|
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types import BaseCell, BaseText
|
7
|
-
from docling_core.types import BoundingBox as DsBoundingBox
|
8
7
|
from docling_core.types import Document as DsDocument
|
9
8
|
from docling_core.types import DocumentDescription as DsDocumentDescription
|
10
9
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
11
10
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
12
11
|
from docling_core.types import Table as DsSchemaTable
|
13
12
|
from docling_core.types import TableCell
|
13
|
+
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
|
14
|
+
from docling_core.types.doc.base import Figure
|
14
15
|
from pydantic import BaseModel
|
16
|
+
from typing_extensions import deprecated
|
15
17
|
|
16
18
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
19
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
@@ -19,6 +21,7 @@ from docling.datamodel.base_models import (
|
|
19
21
|
AssembledUnit,
|
20
22
|
ConversionStatus,
|
21
23
|
DocumentStream,
|
24
|
+
ErrorItem,
|
22
25
|
FigureElement,
|
23
26
|
Page,
|
24
27
|
PageElement,
|
@@ -48,6 +51,15 @@ layout_label_to_ds_type = {
|
|
48
51
|
"Text": "paragraph",
|
49
52
|
}
|
50
53
|
|
54
|
+
_EMPTY_DOC = DsDocument(
|
55
|
+
_name="",
|
56
|
+
description=DsDocumentDescription(logs=[]),
|
57
|
+
file_info=DsFileInfoObject(
|
58
|
+
filename="",
|
59
|
+
document_hash="",
|
60
|
+
),
|
61
|
+
)
|
62
|
+
|
51
63
|
|
52
64
|
class InputDocument(BaseModel):
|
53
65
|
file: PurePath = None
|
@@ -79,7 +91,9 @@ class InputDocument(BaseModel):
|
|
79
91
|
self.valid = False
|
80
92
|
else:
|
81
93
|
self.document_hash = create_file_hash(path_or_stream)
|
82
|
-
self._backend = pdf_backend(
|
94
|
+
self._backend = pdf_backend(
|
95
|
+
path_or_stream=path_or_stream, document_hash=self.document_hash
|
96
|
+
)
|
83
97
|
|
84
98
|
elif isinstance(path_or_stream, BytesIO):
|
85
99
|
self.file = PurePath(filename)
|
@@ -89,7 +103,9 @@ class InputDocument(BaseModel):
|
|
89
103
|
self.valid = False
|
90
104
|
else:
|
91
105
|
self.document_hash = create_file_hash(path_or_stream)
|
92
|
-
self._backend = pdf_backend(
|
106
|
+
self._backend = pdf_backend(
|
107
|
+
path_or_stream=path_or_stream, document_hash=self.document_hash
|
108
|
+
)
|
93
109
|
|
94
110
|
if self.document_hash and self._backend.page_count() > 0:
|
95
111
|
self.page_count = self._backend.page_count()
|
@@ -110,18 +126,19 @@ class InputDocument(BaseModel):
|
|
110
126
|
# raise
|
111
127
|
|
112
128
|
|
129
|
+
@deprecated("Use `ConversionResult` instead.")
|
113
130
|
class ConvertedDocument(BaseModel):
|
114
131
|
input: InputDocument
|
115
132
|
|
116
133
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
117
|
-
errors: List[
|
134
|
+
errors: List[ErrorItem] = [] # structure to keep errors
|
118
135
|
|
119
136
|
pages: List[Page] = []
|
120
|
-
assembled:
|
137
|
+
assembled: AssembledUnit = AssembledUnit()
|
121
138
|
|
122
|
-
output:
|
139
|
+
output: DsDocument = _EMPTY_DOC
|
123
140
|
|
124
|
-
def
|
141
|
+
def _to_ds_document(self) -> DsDocument:
|
125
142
|
title = ""
|
126
143
|
desc = DsDocumentDescription(logs=[])
|
127
144
|
|
@@ -206,6 +223,8 @@ class ConvertedDocument(BaseModel):
|
|
206
223
|
celltype = "col_header"
|
207
224
|
elif cell.row_header:
|
208
225
|
celltype = "row_header"
|
226
|
+
elif cell.row_section:
|
227
|
+
celltype = "row_section"
|
209
228
|
|
210
229
|
def make_spans(cell):
|
211
230
|
for rspan in range(
|
@@ -261,7 +280,7 @@ class ConvertedDocument(BaseModel):
|
|
261
280
|
),
|
262
281
|
)
|
263
282
|
figures.append(
|
264
|
-
|
283
|
+
Figure(
|
265
284
|
prov=[
|
266
285
|
Prov(
|
267
286
|
bbox=target_bbox,
|
@@ -292,16 +311,91 @@ class ConvertedDocument(BaseModel):
|
|
292
311
|
return ds_doc
|
293
312
|
|
294
313
|
def render_as_dict(self):
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
314
|
+
return self.output.model_dump(by_alias=True, exclude_none=True)
|
315
|
+
|
316
|
+
def render_as_markdown(
|
317
|
+
self,
|
318
|
+
delim: str = "\n\n",
|
319
|
+
main_text_start: int = 0,
|
320
|
+
main_text_stop: Optional[int] = None,
|
321
|
+
main_text_labels: list[str] = [
|
322
|
+
"title",
|
323
|
+
"subtitle-level-1",
|
324
|
+
"paragraph",
|
325
|
+
"caption",
|
326
|
+
"table",
|
327
|
+
"figure",
|
328
|
+
],
|
329
|
+
strict_text: bool = False,
|
330
|
+
image_placeholder: str = "<!-- image -->",
|
331
|
+
):
|
332
|
+
return self.output.export_to_markdown(
|
333
|
+
delim=delim,
|
334
|
+
main_text_start=main_text_start,
|
335
|
+
main_text_stop=main_text_stop,
|
336
|
+
main_text_labels=main_text_labels,
|
337
|
+
strict_text=strict_text,
|
338
|
+
image_placeholder=image_placeholder,
|
339
|
+
)
|
299
340
|
|
300
|
-
def
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
341
|
+
def render_as_text(
|
342
|
+
self,
|
343
|
+
delim: str = "\n\n",
|
344
|
+
main_text_start: int = 0,
|
345
|
+
main_text_stop: Optional[int] = None,
|
346
|
+
main_text_labels: list[str] = [
|
347
|
+
"title",
|
348
|
+
"subtitle-level-1",
|
349
|
+
"paragraph",
|
350
|
+
"caption",
|
351
|
+
],
|
352
|
+
):
|
353
|
+
return self.output.export_to_markdown(
|
354
|
+
delim=delim,
|
355
|
+
main_text_start=main_text_start,
|
356
|
+
main_text_stop=main_text_stop,
|
357
|
+
main_text_labels=main_text_labels,
|
358
|
+
strict_text=True,
|
359
|
+
)
|
360
|
+
|
361
|
+
def render_as_doctags(
|
362
|
+
self,
|
363
|
+
delim: str = "\n\n",
|
364
|
+
main_text_start: int = 0,
|
365
|
+
main_text_stop: Optional[int] = None,
|
366
|
+
main_text_labels: list[str] = [
|
367
|
+
"title",
|
368
|
+
"subtitle-level-1",
|
369
|
+
"paragraph",
|
370
|
+
"caption",
|
371
|
+
"table",
|
372
|
+
"figure",
|
373
|
+
],
|
374
|
+
xsize: int = 100,
|
375
|
+
ysize: int = 100,
|
376
|
+
add_location: bool = True,
|
377
|
+
add_content: bool = True,
|
378
|
+
add_page_index: bool = True,
|
379
|
+
# table specific flags
|
380
|
+
add_table_cell_location: bool = False,
|
381
|
+
add_table_cell_label: bool = True,
|
382
|
+
add_table_cell_text: bool = True,
|
383
|
+
) -> str:
|
384
|
+
return self.output.export_to_document_tokens(
|
385
|
+
delim=delim,
|
386
|
+
main_text_start=main_text_start,
|
387
|
+
main_text_stop=main_text_stop,
|
388
|
+
main_text_labels=main_text_labels,
|
389
|
+
xsize=xsize,
|
390
|
+
ysize=ysize,
|
391
|
+
add_location=add_location,
|
392
|
+
add_content=add_content,
|
393
|
+
add_page_index=add_page_index,
|
394
|
+
# table specific flags
|
395
|
+
add_table_cell_location=add_table_cell_location,
|
396
|
+
add_table_cell_label=add_table_cell_label,
|
397
|
+
add_table_cell_text=add_table_cell_text,
|
398
|
+
)
|
305
399
|
|
306
400
|
def render_element_images(
|
307
401
|
self, element_types: Tuple[PageElement] = (FigureElement,)
|
@@ -318,6 +412,10 @@ class ConvertedDocument(BaseModel):
|
|
318
412
|
yield element, cropped_im
|
319
413
|
|
320
414
|
|
415
|
+
class ConversionResult(ConvertedDocument):
|
416
|
+
pass
|
417
|
+
|
418
|
+
|
321
419
|
class DocumentConversionInput(BaseModel):
|
322
420
|
|
323
421
|
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
@@ -0,0 +1,67 @@
|
|
1
|
+
from enum import Enum, auto
|
2
|
+
from typing import List, Literal, Optional, Union
|
3
|
+
|
4
|
+
from pydantic import BaseModel, ConfigDict, Field
|
5
|
+
|
6
|
+
|
7
|
+
class TableFormerMode(str, Enum):
|
8
|
+
FAST = auto()
|
9
|
+
ACCURATE = auto()
|
10
|
+
|
11
|
+
|
12
|
+
class TableStructureOptions(BaseModel):
|
13
|
+
do_cell_matching: bool = (
|
14
|
+
True
|
15
|
+
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
16
|
+
# are merged across table columns.
|
17
|
+
# False: Let table structure model define the text cells, ignore PDF cells.
|
18
|
+
)
|
19
|
+
mode: TableFormerMode = TableFormerMode.FAST
|
20
|
+
|
21
|
+
|
22
|
+
class OcrOptions(BaseModel):
|
23
|
+
kind: str
|
24
|
+
|
25
|
+
|
26
|
+
class EasyOcrOptions(OcrOptions):
|
27
|
+
kind: Literal["easyocr"] = "easyocr"
|
28
|
+
lang: List[str] = ["fr", "de", "es", "en"]
|
29
|
+
use_gpu: bool = True # same default as easyocr.Reader
|
30
|
+
model_storage_directory: Optional[str] = None
|
31
|
+
download_enabled: bool = True # same default as easyocr.Reader
|
32
|
+
|
33
|
+
model_config = ConfigDict(
|
34
|
+
extra="forbid",
|
35
|
+
protected_namespaces=(),
|
36
|
+
)
|
37
|
+
|
38
|
+
|
39
|
+
class TesseractCliOcrOptions(OcrOptions):
|
40
|
+
kind: Literal["tesseract"] = "tesseract"
|
41
|
+
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
42
|
+
tesseract_cmd: str = "tesseract"
|
43
|
+
path: Optional[str] = None
|
44
|
+
|
45
|
+
model_config = ConfigDict(
|
46
|
+
extra="forbid",
|
47
|
+
)
|
48
|
+
|
49
|
+
|
50
|
+
class TesseractOcrOptions(OcrOptions):
|
51
|
+
kind: Literal["tesserocr"] = "tesserocr"
|
52
|
+
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
53
|
+
path: Optional[str] = None
|
54
|
+
|
55
|
+
model_config = ConfigDict(
|
56
|
+
extra="forbid",
|
57
|
+
)
|
58
|
+
|
59
|
+
|
60
|
+
class PipelineOptions(BaseModel):
|
61
|
+
do_table_structure: bool = True # True: perform table structure extraction
|
62
|
+
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
63
|
+
|
64
|
+
table_structure_options: TableStructureOptions = TableStructureOptions()
|
65
|
+
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
|
66
|
+
Field(EasyOcrOptions(), discriminator="kind")
|
67
|
+
)
|
docling/document_converter.py
CHANGED
@@ -7,7 +7,6 @@ from pathlib import Path
|
|
7
7
|
from typing import Iterable, Optional, Type, Union
|
8
8
|
|
9
9
|
import requests
|
10
|
-
from docling_core.types import Document
|
11
10
|
from PIL import ImageDraw
|
12
11
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
13
12
|
|
@@ -16,14 +15,16 @@ from docling.datamodel.base_models import (
|
|
16
15
|
AssembledUnit,
|
17
16
|
AssembleOptions,
|
18
17
|
ConversionStatus,
|
18
|
+
DoclingComponentType,
|
19
|
+
ErrorItem,
|
19
20
|
Page,
|
20
|
-
PipelineOptions,
|
21
21
|
)
|
22
22
|
from docling.datamodel.document import (
|
23
|
-
|
23
|
+
ConversionResult,
|
24
24
|
DocumentConversionInput,
|
25
25
|
InputDocument,
|
26
26
|
)
|
27
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
27
28
|
from docling.datamodel.settings import settings
|
28
29
|
from docling.models.ds_glm_model import GlmModel
|
29
30
|
from docling.models.page_assemble_model import PageAssembleModel
|
@@ -66,12 +67,15 @@ class DocumentConverter:
|
|
66
67
|
from huggingface_hub import snapshot_download
|
67
68
|
|
68
69
|
download_path = snapshot_download(
|
69
|
-
repo_id="ds4sd/docling-models",
|
70
|
+
repo_id="ds4sd/docling-models",
|
71
|
+
force_download=force,
|
72
|
+
local_dir=local_dir,
|
73
|
+
revision="v2.0.0",
|
70
74
|
)
|
71
75
|
|
72
76
|
return Path(download_path)
|
73
77
|
|
74
|
-
def convert(self, input: DocumentConversionInput) -> Iterable[
|
78
|
+
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
75
79
|
|
76
80
|
for input_batch in chunkify(
|
77
81
|
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
|
@@ -84,9 +88,9 @@ class DocumentConverter:
|
|
84
88
|
# yield from pool.map(self.process_document, input_batch)
|
85
89
|
|
86
90
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
87
|
-
yield from map(self.
|
91
|
+
yield from map(self._process_document, input_batch)
|
88
92
|
|
89
|
-
def convert_single(self, source: Path | AnyHttpUrl | str) ->
|
93
|
+
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
90
94
|
"""Convert a single document.
|
91
95
|
|
92
96
|
Args:
|
@@ -97,7 +101,7 @@ class DocumentConverter:
|
|
97
101
|
RuntimeError: If conversion fails.
|
98
102
|
|
99
103
|
Returns:
|
100
|
-
|
104
|
+
ConversionResult: The conversion result object.
|
101
105
|
"""
|
102
106
|
with tempfile.TemporaryDirectory() as temp_dir:
|
103
107
|
try:
|
@@ -127,51 +131,49 @@ class DocumentConverter:
|
|
127
131
|
f"Unexpected file path type encountered: {type(source)}"
|
128
132
|
)
|
129
133
|
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
130
|
-
|
131
|
-
|
132
|
-
if
|
134
|
+
conv_res_iter = self.convert(conv_inp)
|
135
|
+
conv_res: ConversionResult = next(conv_res_iter)
|
136
|
+
if conv_res.status not in {
|
133
137
|
ConversionStatus.SUCCESS,
|
134
|
-
ConversionStatus.
|
138
|
+
ConversionStatus.PARTIAL_SUCCESS,
|
135
139
|
}:
|
136
|
-
raise RuntimeError(f"Conversion failed with status: {
|
137
|
-
|
138
|
-
return doc
|
140
|
+
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
141
|
+
return conv_res
|
139
142
|
|
140
|
-
def
|
143
|
+
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
|
141
144
|
start_doc_time = time.time()
|
142
|
-
|
145
|
+
conv_res = ConversionResult(input=in_doc)
|
146
|
+
|
147
|
+
_log.info(f"Processing document {in_doc.file.name}")
|
143
148
|
|
144
149
|
if not in_doc.valid:
|
145
|
-
|
146
|
-
return
|
150
|
+
conv_res.status = ConversionStatus.FAILURE
|
151
|
+
return conv_res
|
147
152
|
|
148
153
|
for i in range(0, in_doc.page_count):
|
149
|
-
|
154
|
+
conv_res.pages.append(Page(page_no=i))
|
150
155
|
|
151
156
|
all_assembled_pages = []
|
152
157
|
|
153
158
|
try:
|
154
159
|
# Iterate batches of pages (page_batch_size) in the doc
|
155
|
-
for page_batch in chunkify(
|
156
|
-
converted_doc.pages, settings.perf.page_batch_size
|
157
|
-
):
|
158
|
-
|
160
|
+
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
159
161
|
start_pb_time = time.time()
|
160
162
|
# Pipeline
|
161
163
|
|
162
164
|
# 1. Initialise the page resources
|
163
165
|
init_pages = map(
|
164
|
-
functools.partial(self.
|
166
|
+
functools.partial(self._initialize_page, in_doc), page_batch
|
165
167
|
)
|
166
168
|
|
167
169
|
# 2. Populate page image
|
168
170
|
pages_with_images = map(
|
169
|
-
functools.partial(self.
|
171
|
+
functools.partial(self._populate_page_images, in_doc), init_pages
|
170
172
|
)
|
171
173
|
|
172
174
|
# 3. Populate programmatic page cells
|
173
175
|
pages_with_cells = map(
|
174
|
-
functools.partial(self.
|
176
|
+
functools.partial(self._parse_page_cells, in_doc),
|
175
177
|
pages_with_images,
|
176
178
|
)
|
177
179
|
|
@@ -197,28 +199,45 @@ class DocumentConverter:
|
|
197
199
|
end_pb_time = time.time() - start_pb_time
|
198
200
|
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
199
201
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
202
|
+
conv_res.pages = all_assembled_pages
|
203
|
+
self._assemble_doc(conv_res)
|
204
|
+
|
205
|
+
status = ConversionStatus.SUCCESS
|
206
|
+
for page in conv_res.pages:
|
207
|
+
if not page._backend.is_valid():
|
208
|
+
conv_res.errors.append(
|
209
|
+
ErrorItem(
|
210
|
+
component_type=DoclingComponentType.PDF_BACKEND,
|
211
|
+
module_name=type(page._backend).__name__,
|
212
|
+
error_message=f"Page {page.page_no} failed to parse.",
|
213
|
+
)
|
214
|
+
)
|
215
|
+
status = ConversionStatus.PARTIAL_SUCCESS
|
205
216
|
|
206
|
-
|
217
|
+
conv_res.status = status
|
207
218
|
|
208
219
|
except Exception as e:
|
209
|
-
|
220
|
+
conv_res.status = ConversionStatus.FAILURE
|
210
221
|
trace = "\n".join(traceback.format_exception(e))
|
211
|
-
_log.info(
|
222
|
+
_log.info(
|
223
|
+
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
224
|
+
f"{trace}"
|
225
|
+
)
|
226
|
+
|
227
|
+
finally:
|
228
|
+
# Always unload the PDF backend, even in case of failure
|
229
|
+
if in_doc._backend:
|
230
|
+
in_doc._backend.unload()
|
212
231
|
|
213
232
|
end_doc_time = time.time() - start_doc_time
|
214
233
|
_log.info(
|
215
234
|
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
216
235
|
)
|
217
236
|
|
218
|
-
return
|
237
|
+
return conv_res
|
219
238
|
|
220
239
|
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
|
221
|
-
def
|
240
|
+
def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
222
241
|
page._backend = doc._backend.load_page(page.page_no)
|
223
242
|
page.size = page._backend.get_size()
|
224
243
|
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
|
@@ -226,9 +245,11 @@ class DocumentConverter:
|
|
226
245
|
return page
|
227
246
|
|
228
247
|
# Generate the page image and store it in the page object
|
229
|
-
def
|
248
|
+
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
230
249
|
# default scale
|
231
|
-
page.get_image(
|
250
|
+
page.get_image(
|
251
|
+
scale=1.0
|
252
|
+
) # puts the page image on the image cache at default scale
|
232
253
|
|
233
254
|
# user requested scales
|
234
255
|
if self.assemble_options.images_scale is not None:
|
@@ -240,7 +261,7 @@ class DocumentConverter:
|
|
240
261
|
return page
|
241
262
|
|
242
263
|
# Extract and populate the page cells and store it in the page object
|
243
|
-
def
|
264
|
+
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
|
244
265
|
page.cells = page._backend.get_text_cells()
|
245
266
|
|
246
267
|
# DEBUG code:
|
@@ -255,12 +276,12 @@ class DocumentConverter:
|
|
255
276
|
|
256
277
|
return page
|
257
278
|
|
258
|
-
def
|
279
|
+
def _assemble_doc(self, conv_res: ConversionResult):
|
259
280
|
all_elements = []
|
260
281
|
all_headers = []
|
261
282
|
all_body = []
|
262
283
|
|
263
|
-
for p in
|
284
|
+
for p in conv_res.pages:
|
264
285
|
|
265
286
|
for el in p.assembled.body:
|
266
287
|
all_body.append(el)
|
@@ -269,8 +290,8 @@ class DocumentConverter:
|
|
269
290
|
for el in p.assembled.elements:
|
270
291
|
all_elements.append(el)
|
271
292
|
|
272
|
-
|
293
|
+
conv_res.assembled = AssembledUnit(
|
273
294
|
elements=all_elements, headers=all_headers, body=all_body
|
274
295
|
)
|
275
296
|
|
276
|
-
|
297
|
+
conv_res.output = self.glm_model(conv_res)
|
docling/models/base_ocr_model.py
CHANGED
@@ -3,21 +3,21 @@ import logging
|
|
3
3
|
from abc import abstractmethod
|
4
4
|
from typing import Iterable, List, Tuple
|
5
5
|
|
6
|
-
import numpy
|
7
6
|
import numpy as np
|
8
7
|
from PIL import Image, ImageDraw
|
9
8
|
from rtree import index
|
10
9
|
from scipy.ndimage import find_objects, label
|
11
10
|
|
12
11
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
12
|
+
from docling.datamodel.pipeline_options import OcrOptions
|
13
13
|
|
14
14
|
_log = logging.getLogger(__name__)
|
15
15
|
|
16
16
|
|
17
17
|
class BaseOcrModel:
|
18
|
-
def __init__(self,
|
19
|
-
self.
|
20
|
-
self.
|
18
|
+
def __init__(self, enabled: bool, options: OcrOptions):
|
19
|
+
self.enabled = enabled
|
20
|
+
self.options = options
|
21
21
|
|
22
22
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
23
23
|
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
docling/models/ds_glm_model.py
CHANGED
@@ -2,7 +2,7 @@ import copy
|
|
2
2
|
import random
|
3
3
|
|
4
4
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
5
|
-
from deepsearch_glm.utils.
|
5
|
+
from deepsearch_glm.utils.doc_utils import to_legacy_document_format
|
6
6
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
7
7
|
from docling_core.types import BaseText
|
8
8
|
from docling_core.types import Document as DsDocument
|
@@ -10,18 +10,22 @@ from docling_core.types import Ref
|
|
10
10
|
from PIL import ImageDraw
|
11
11
|
|
12
12
|
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
13
|
-
from docling.datamodel.document import
|
13
|
+
from docling.datamodel.document import ConversionResult
|
14
14
|
|
15
15
|
|
16
16
|
class GlmModel:
|
17
17
|
def __init__(self, config):
|
18
18
|
self.config = config
|
19
|
+
self.model_names = self.config.get(
|
20
|
+
"model_names", ""
|
21
|
+
) # "language;term;reference"
|
19
22
|
load_pretrained_nlp_models()
|
20
|
-
model = init_nlp_model(model_names="language;term;reference")
|
23
|
+
# model = init_nlp_model(model_names="language;term;reference")
|
24
|
+
model = init_nlp_model(model_names=self.model_names)
|
21
25
|
self.model = model
|
22
26
|
|
23
|
-
def __call__(self,
|
24
|
-
ds_doc =
|
27
|
+
def __call__(self, conv_res: ConversionResult) -> DsDocument:
|
28
|
+
ds_doc = conv_res._to_ds_document()
|
25
29
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
26
30
|
|
27
31
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
@@ -34,7 +38,7 @@ class GlmModel:
|
|
34
38
|
# DEBUG code:
|
35
39
|
def draw_clusters_and_cells(ds_document, page_no):
|
36
40
|
clusters_to_draw = []
|
37
|
-
image = copy.deepcopy(
|
41
|
+
image = copy.deepcopy(conv_res.pages[page_no].image)
|
38
42
|
for ix, elem in enumerate(ds_document.main_text):
|
39
43
|
if isinstance(elem, BaseText):
|
40
44
|
prov = elem.prov[0]
|
@@ -56,7 +60,7 @@ class GlmModel:
|
|
56
60
|
bbox=BoundingBox.from_tuple(
|
57
61
|
coord=prov.bbox,
|
58
62
|
origin=CoordOrigin.BOTTOMLEFT,
|
59
|
-
).to_top_left_origin(
|
63
|
+
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
60
64
|
)
|
61
65
|
)
|
62
66
|
|
docling/models/easyocr_model.py
CHANGED
@@ -4,21 +4,33 @@ from typing import Iterable
|
|
4
4
|
import numpy
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
7
|
+
from docling.datamodel.pipeline_options import EasyOcrOptions
|
7
8
|
from docling.models.base_ocr_model import BaseOcrModel
|
8
9
|
|
9
10
|
_log = logging.getLogger(__name__)
|
10
11
|
|
11
12
|
|
12
13
|
class EasyOcrModel(BaseOcrModel):
|
13
|
-
def __init__(self,
|
14
|
-
super().__init__(
|
14
|
+
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
15
|
+
super().__init__(enabled=enabled, options=options)
|
16
|
+
self.options: EasyOcrOptions
|
15
17
|
|
16
18
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
17
19
|
|
18
20
|
if self.enabled:
|
19
|
-
|
21
|
+
try:
|
22
|
+
import easyocr
|
23
|
+
except ImportError:
|
24
|
+
raise ImportError(
|
25
|
+
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
26
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
27
|
+
)
|
20
28
|
|
21
|
-
self.reader = easyocr.Reader(
|
29
|
+
self.reader = easyocr.Reader(
|
30
|
+
lang_list=self.options.lang,
|
31
|
+
model_storage_directory=self.options.model_storage_directory,
|
32
|
+
download_enabled=self.options.download_enabled,
|
33
|
+
)
|
22
34
|
|
23
35
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
24
36
|
|
@@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):
|
|
31
43
|
|
32
44
|
all_ocr_cells = []
|
33
45
|
for ocr_rect in ocr_rects:
|
46
|
+
# Skip zero area boxes
|
47
|
+
if ocr_rect.area() == 0:
|
48
|
+
continue
|
34
49
|
high_res_image = page._backend.get_page_image(
|
35
50
|
scale=self.scale, cropbox=ocr_rect
|
36
51
|
)
|
docling/models/layout_model.py
CHANGED
@@ -33,6 +33,7 @@ class LayoutModel:
|
|
33
33
|
"Page-footer",
|
34
34
|
"Code",
|
35
35
|
"List-item",
|
36
|
+
# "Title"
|
36
37
|
# "Formula",
|
37
38
|
]
|
38
39
|
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
|
@@ -69,9 +70,7 @@ class LayoutModel:
|
|
69
70
|
"Key-Value Region": 0.45,
|
70
71
|
}
|
71
72
|
|
72
|
-
CLASS_REMAPPINGS = {
|
73
|
-
"Document Index": "Table",
|
74
|
-
}
|
73
|
+
CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
|
75
74
|
|
76
75
|
_log.debug("================= Start postprocess function ====================")
|
77
76
|
start_time = time.time()
|
@@ -277,6 +276,7 @@ class LayoutModel:
|
|
277
276
|
bbox=BoundingBox.model_validate(pred_item),
|
278
277
|
cells=[],
|
279
278
|
)
|
279
|
+
|
280
280
|
clusters.append(cluster)
|
281
281
|
|
282
282
|
# Map cells to clusters
|