docling-2.2.0-py3-none-any.whl → docling-2.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +0 -4
- docling/backend/html_backend.py +53 -56
- docling/backend/md_backend.py +59 -6
- docling/backend/msword_backend.py +9 -15
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/document.py +3 -1
- docling/datamodel/settings.py +15 -1
- docling/document_converter.py +12 -8
- docling/models/base_model.py +4 -1
- docling/models/base_ocr_model.py +21 -4
- docling/models/ds_glm_model.py +27 -11
- docling/models/easyocr_model.py +49 -39
- docling/models/layout_model.py +87 -61
- docling/models/page_assemble_model.py +102 -100
- docling/models/page_preprocessing_model.py +25 -7
- docling/models/table_structure_model.py +125 -90
- docling/models/tesseract_ocr_cli_model.py +62 -52
- docling/models/tesseract_ocr_model.py +57 -45
- docling/pipeline/base_pipeline.py +68 -69
- docling/pipeline/simple_pipeline.py +8 -11
- docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling/utils/profiling.py +62 -0
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/METADATA +5 -4
- docling-2.3.0.dist-info/RECORD +45 -0
- docling-2.2.0.dist-info/RECORD +0 -44
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/LICENSE +0 -0
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/WHEEL +0 -0
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/entry_points.txt +0 -0
docling/backend/asciidoc_backend.py
CHANGED
@@ -1,24 +1,20 @@
|
|
1
1
|
import logging
|
2
|
-
import os
|
3
2
|
import re
|
4
3
|
from io import BytesIO
|
5
4
|
from pathlib import Path
|
6
5
|
from typing import Set, Union
|
7
6
|
|
8
7
|
from docling_core.types.doc import (
|
9
|
-
DocItem,
|
10
8
|
DocItemLabel,
|
11
9
|
DoclingDocument,
|
12
10
|
DocumentOrigin,
|
13
11
|
GroupItem,
|
14
12
|
GroupLabel,
|
15
13
|
ImageRef,
|
16
|
-
NodeItem,
|
17
14
|
Size,
|
18
15
|
TableCell,
|
19
16
|
TableData,
|
20
17
|
)
|
21
|
-
from pydantic import AnyUrl
|
22
18
|
|
23
19
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
24
20
|
from docling.datamodel.base_models import InputFormat
|
docling/backend/html_backend.py
CHANGED
@@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
136
136
|
def get_direct_text(self, item):
|
137
137
|
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
138
138
|
text = item.find(string=True, recursive=False)
|
139
|
-
|
140
139
|
if isinstance(text, str):
|
141
140
|
return text.strip()
|
142
141
|
|
@@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
149
148
|
if isinstance(item, str):
|
150
149
|
return [item]
|
151
150
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
return " ".join(result)
|
151
|
+
if item.name not in ["ul", "ol"]:
|
152
|
+
try:
|
153
|
+
# Iterate over the children (and their text and tails)
|
154
|
+
for child in item:
|
155
|
+
try:
|
156
|
+
# Recursively get the child's text content
|
157
|
+
result.extend(self.extract_text_recursively(child))
|
158
|
+
except:
|
159
|
+
pass
|
160
|
+
except:
|
161
|
+
_log.warn("item has no children")
|
162
|
+
pass
|
163
|
+
|
164
|
+
return "".join(result) + " "
|
167
165
|
|
168
166
|
def handle_header(self, element, idx, doc):
|
169
167
|
"""Handles header tags (h1, h2, etc.)."""
|
@@ -181,38 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
181
179
|
self.parents[self.level] = doc.add_text(
|
182
180
|
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
|
183
181
|
)
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
self.
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
for key, val in self.parents.items():
|
209
|
-
if key > hlevel:
|
210
|
-
self.parents[key] = None
|
211
|
-
|
212
|
-
self.parents[hlevel] = doc.add_text(
|
213
|
-
parent=self.parents[hlevel - 1], label=label, text=text
|
182
|
+
else:
|
183
|
+
if hlevel > self.level:
|
184
|
+
|
185
|
+
# add invisible group
|
186
|
+
for i in range(self.level + 1, hlevel):
|
187
|
+
self.parents[i] = doc.add_group(
|
188
|
+
name=f"header-{i}",
|
189
|
+
label=GroupLabel.SECTION,
|
190
|
+
parent=self.parents[i - 1],
|
191
|
+
)
|
192
|
+
self.level = hlevel
|
193
|
+
|
194
|
+
elif hlevel < self.level:
|
195
|
+
|
196
|
+
# remove the tail
|
197
|
+
for key, val in self.parents.items():
|
198
|
+
if key > hlevel:
|
199
|
+
self.parents[key] = None
|
200
|
+
self.level = hlevel
|
201
|
+
|
202
|
+
self.parents[hlevel] = doc.add_heading(
|
203
|
+
parent=self.parents[hlevel - 1],
|
204
|
+
text=text,
|
205
|
+
level=hlevel,
|
214
206
|
)
|
215
|
-
self.level = hlevel
|
216
207
|
|
217
208
|
def handle_paragraph(self, element, idx, doc):
|
218
209
|
"""Handles paragraph tags (p)."""
|
@@ -255,7 +246,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
255
246
|
|
256
247
|
if nested_lists:
|
257
248
|
name = element.name
|
258
|
-
|
249
|
+
# Text in list item can be hidden within hierarchy, hence
|
250
|
+
# we need to extract it recursively
|
251
|
+
text = self.extract_text_recursively(element)
|
252
|
+
# Flatten text, remove break lines:
|
253
|
+
text = text.replace("\n", "").replace("\r", "")
|
254
|
+
text = " ".join(text.split()).strip()
|
259
255
|
|
260
256
|
marker = ""
|
261
257
|
enumerated = False
|
@@ -263,14 +259,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
263
259
|
marker = str(index_in_list)
|
264
260
|
enumerated = True
|
265
261
|
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
262
|
+
if len(text) > 0:
|
263
|
+
# create a list-item
|
264
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
265
|
+
text=text,
|
266
|
+
enumerated=enumerated,
|
267
|
+
marker=marker,
|
268
|
+
parent=self.parents[self.level],
|
269
|
+
)
|
270
|
+
self.level += 1
|
274
271
|
|
275
272
|
self.walk(element, doc)
|
276
273
|
|
docling/backend/md_backend.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
3
|
+
import warnings
|
2
4
|
from io import BytesIO
|
3
5
|
from pathlib import Path
|
4
6
|
from typing import Set, Union
|
@@ -25,6 +27,30 @@ _log = logging.getLogger(__name__)
|
|
25
27
|
|
26
28
|
|
27
29
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
30
|
+
|
31
|
+
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
32
|
+
# This regex will match any sequence of underscores
|
33
|
+
pattern = r"_+"
|
34
|
+
|
35
|
+
def replace_match(match):
|
36
|
+
underscore_sequence = match.group(
|
37
|
+
0
|
38
|
+
) # Get the full match (sequence of underscores)
|
39
|
+
|
40
|
+
# Shorten the sequence if it exceeds max_length
|
41
|
+
if len(underscore_sequence) > max_length:
|
42
|
+
return "_" * max_length
|
43
|
+
else:
|
44
|
+
return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
|
45
|
+
|
46
|
+
# Use re.sub to replace long underscore sequences
|
47
|
+
shortened_text = re.sub(pattern, replace_match, markdown_text)
|
48
|
+
|
49
|
+
if len(shortened_text) != len(markdown_text):
|
50
|
+
warnings.warn("Detected potentially incorrect Markdown, correcting...")
|
51
|
+
|
52
|
+
return shortened_text
|
53
|
+
|
28
54
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
29
55
|
super().__init__(in_doc, path_or_stream)
|
30
56
|
|
@@ -42,11 +68,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
42
68
|
try:
|
43
69
|
if isinstance(self.path_or_stream, BytesIO):
|
44
70
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
45
|
-
|
71
|
+
# remove invalid sequences
|
72
|
+
# very long sequences of underscores will lead to unnecessary long processing times.
|
73
|
+
# In any proper Markdown files, underscores have to be escaped,
|
74
|
+
# otherwise they represent emphasis (bold or italic)
|
75
|
+
self.markdown = self.shorten_underscore_sequences(text_stream)
|
46
76
|
if isinstance(self.path_or_stream, Path):
|
47
77
|
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
48
78
|
md_content = f.read()
|
49
|
-
|
79
|
+
# remove invalid sequences
|
80
|
+
# very long sequences of underscores will lead to unnecessary long processing times.
|
81
|
+
# In any proper Markdown files, underscores have to be escaped,
|
82
|
+
# otherwise they represent emphasis (bold or italic)
|
83
|
+
self.markdown = self.shorten_underscore_sequences(md_content)
|
50
84
|
self.valid = True
|
51
85
|
|
52
86
|
_log.debug(self.markdown)
|
@@ -135,11 +169,29 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
135
169
|
doc_label = DocItemLabel.TITLE
|
136
170
|
else:
|
137
171
|
doc_label = DocItemLabel.SECTION_HEADER
|
138
|
-
snippet_text = element.children[0].children.strip()
|
139
172
|
|
140
|
-
|
141
|
-
|
142
|
-
|
173
|
+
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
174
|
+
# hence we need to traverse the tree to get full text of a header
|
175
|
+
strings = []
|
176
|
+
|
177
|
+
# Define a recursive function to traverse the tree
|
178
|
+
def traverse(node):
|
179
|
+
# Check if the node has a "children" attribute
|
180
|
+
if hasattr(node, "children"):
|
181
|
+
# If "children" is a list, continue traversal
|
182
|
+
if isinstance(node.children, list):
|
183
|
+
for child in node.children:
|
184
|
+
traverse(child)
|
185
|
+
# If "children" is text, add it to header text
|
186
|
+
elif isinstance(node.children, str):
|
187
|
+
strings.append(node.children)
|
188
|
+
|
189
|
+
traverse(element)
|
190
|
+
snippet_text = "".join(strings)
|
191
|
+
if len(snippet_text) > 0:
|
192
|
+
parent_element = doc.add_text(
|
193
|
+
label=doc_label, parent=parent_element, text=snippet_text
|
194
|
+
)
|
143
195
|
|
144
196
|
elif isinstance(element, marko.block.List):
|
145
197
|
self.close_table(doc)
|
@@ -286,6 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
286
338
|
parsed_ast = marko_parser.parse(self.markdown)
|
287
339
|
# Start iterating from the root of the AST
|
288
340
|
self.iterate_elements(parsed_ast, 0, doc, None)
|
341
|
+
self.process_inline_text(None, doc) # handle last hanging inline text
|
289
342
|
else:
|
290
343
|
raise RuntimeError(
|
291
344
|
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
docling/backend/msword_backend.py
CHANGED
@@ -294,13 +294,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
294
294
|
level = self.get_level()
|
295
295
|
if isinstance(curr_level, int):
|
296
296
|
|
297
|
-
if curr_level
|
298
|
-
|
299
|
-
self.parents[level] = doc.add_heading(
|
300
|
-
parent=self.parents[level - 1], text=text
|
301
|
-
)
|
302
|
-
|
303
|
-
elif curr_level > level:
|
297
|
+
if curr_level > level:
|
304
298
|
|
305
299
|
# add invisible group
|
306
300
|
for i in range(level, curr_level):
|
@@ -310,10 +304,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
310
304
|
name=f"header-{i}",
|
311
305
|
)
|
312
306
|
|
313
|
-
self.parents[curr_level] = doc.add_heading(
|
314
|
-
parent=self.parents[curr_level - 1], text=text
|
315
|
-
)
|
316
|
-
|
317
307
|
elif curr_level < level:
|
318
308
|
|
319
309
|
# remove the tail
|
@@ -321,13 +311,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
321
311
|
if key >= curr_level:
|
322
312
|
self.parents[key] = None
|
323
313
|
|
324
|
-
|
325
|
-
|
326
|
-
|
314
|
+
self.parents[curr_level] = doc.add_heading(
|
315
|
+
parent=self.parents[curr_level - 1],
|
316
|
+
text=text,
|
317
|
+
level=curr_level,
|
318
|
+
)
|
327
319
|
|
328
320
|
else:
|
329
321
|
self.parents[self.level] = doc.add_heading(
|
330
|
-
parent=self.parents[self.level - 1],
|
322
|
+
parent=self.parents[self.level - 1],
|
323
|
+
text=text,
|
324
|
+
level=1,
|
331
325
|
)
|
332
326
|
return
|
333
327
|
|
docling/datamodel/base_models.py
CHANGED
docling/datamodel/document.py
CHANGED
@@ -3,7 +3,7 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional,
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
7
7
|
|
8
8
|
import filetype
|
9
9
|
from docling_core.types.doc import (
|
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
|
|
52
52
|
Page,
|
53
53
|
)
|
54
54
|
from docling.datamodel.settings import DocumentLimits
|
55
|
+
from docling.utils.profiling import ProfilingItem
|
55
56
|
from docling.utils.utils import create_file_hash, create_hash
|
56
57
|
|
57
58
|
if TYPE_CHECKING:
|
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
|
|
187
188
|
|
188
189
|
pages: List[Page] = []
|
189
190
|
assembled: AssembledUnit = AssembledUnit()
|
191
|
+
timings: Dict[str, ProfilingItem] = {}
|
190
192
|
|
191
193
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
192
194
|
|
docling/datamodel/settings.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import sys
|
2
|
+
from pathlib import Path
|
2
3
|
|
3
4
|
from pydantic import BaseModel
|
4
5
|
from pydantic_settings import BaseSettings
|
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
|
|
26
27
|
# To force models into single core: export OMP_NUM_THREADS=1
|
27
28
|
|
28
29
|
|
30
|
+
class DebugSettings(BaseModel):
|
31
|
+
visualize_cells: bool = False
|
32
|
+
visualize_ocr: bool = False
|
33
|
+
visualize_layout: bool = False
|
34
|
+
visualize_tables: bool = False
|
35
|
+
|
36
|
+
profile_pipeline_timings: bool = False
|
37
|
+
|
38
|
+
# Path used to output debug information.
|
39
|
+
debug_output_path: str = str(Path.cwd() / "debug")
|
40
|
+
|
41
|
+
|
29
42
|
class AppSettings(BaseSettings):
|
30
43
|
perf: BatchConcurrencySettings
|
44
|
+
debug: DebugSettings
|
31
45
|
|
32
46
|
|
33
|
-
settings = AppSettings(perf=BatchConcurrencySettings())
|
47
|
+
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
docling/document_converter.py
CHANGED
@@ -189,24 +189,35 @@ class DocumentConverter:
|
|
189
189
|
) -> Iterator[ConversionResult]:
|
190
190
|
assert self.format_to_options is not None
|
191
191
|
|
192
|
+
start_time = time.monotonic()
|
193
|
+
|
192
194
|
for input_batch in chunkify(
|
193
195
|
conv_input.docs(self.format_to_options),
|
194
196
|
settings.perf.doc_batch_size, # pass format_options
|
195
197
|
):
|
196
198
|
_log.info(f"Going to convert document batch...")
|
199
|
+
|
197
200
|
# parallel processing only within input_batch
|
198
201
|
# with ThreadPoolExecutor(
|
199
202
|
# max_workers=settings.perf.doc_batch_concurrency
|
200
203
|
# ) as pool:
|
201
204
|
# yield from pool.map(self.process_document, input_batch)
|
202
|
-
|
203
205
|
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
206
|
+
|
204
207
|
for item in map(
|
205
208
|
partial(self._process_document, raises_on_error=raises_on_error),
|
206
209
|
input_batch,
|
207
210
|
):
|
211
|
+
elapsed = time.monotonic() - start_time
|
212
|
+
start_time = time.monotonic()
|
213
|
+
|
208
214
|
if item is not None:
|
215
|
+
_log.info(
|
216
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
217
|
+
)
|
209
218
|
yield item
|
219
|
+
else:
|
220
|
+
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
210
221
|
|
211
222
|
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
|
212
223
|
assert self.format_to_options is not None
|
@@ -237,15 +248,8 @@ class DocumentConverter:
|
|
237
248
|
assert self.allowed_formats is not None
|
238
249
|
assert in_doc.format in self.allowed_formats
|
239
250
|
|
240
|
-
start_doc_time = time.time()
|
241
|
-
|
242
251
|
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
243
252
|
|
244
|
-
end_doc_time = time.time() - start_doc_time
|
245
|
-
_log.info(
|
246
|
-
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
247
|
-
)
|
248
|
-
|
249
253
|
return conv_res
|
250
254
|
|
251
255
|
def _execute_pipeline(
|
docling/models/base_model.py
CHANGED
@@ -4,11 +4,14 @@ from typing import Any, Iterable
|
|
4
4
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import Page
|
7
|
+
from docling.datamodel.document import ConversionResult
|
7
8
|
|
8
9
|
|
9
10
|
class BasePageModel(ABC):
|
10
11
|
@abstractmethod
|
11
|
-
def __call__(
|
12
|
+
def __call__(
|
13
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
14
|
+
) -> Iterable[Page]:
|
12
15
|
pass
|
13
16
|
|
14
17
|
|
docling/models/base_ocr_model.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
3
|
from abc import abstractmethod
|
4
|
+
from pathlib import Path
|
4
5
|
from typing import Iterable, List
|
5
6
|
|
6
7
|
import numpy as np
|
@@ -10,12 +11,15 @@ from rtree import index
|
|
10
11
|
from scipy.ndimage import find_objects, label
|
11
12
|
|
12
13
|
from docling.datamodel.base_models import OcrCell, Page
|
14
|
+
from docling.datamodel.document import ConversionResult
|
13
15
|
from docling.datamodel.pipeline_options import OcrOptions
|
16
|
+
from docling.datamodel.settings import settings
|
17
|
+
from docling.models.base_model import BasePageModel
|
14
18
|
|
15
19
|
_log = logging.getLogger(__name__)
|
16
20
|
|
17
21
|
|
18
|
-
class BaseOcrModel:
|
22
|
+
class BaseOcrModel(BasePageModel):
|
19
23
|
def __init__(self, enabled: bool, options: OcrOptions):
|
20
24
|
self.enabled = enabled
|
21
25
|
self.options = options
|
@@ -113,7 +117,7 @@ class BaseOcrModel:
|
|
113
117
|
]
|
114
118
|
return filtered_ocr_cells
|
115
119
|
|
116
|
-
def draw_ocr_rects_and_cells(self, page, ocr_rects):
|
120
|
+
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
117
121
|
image = copy.deepcopy(page.image)
|
118
122
|
draw = ImageDraw.Draw(image, "RGBA")
|
119
123
|
|
@@ -130,8 +134,21 @@ class BaseOcrModel:
|
|
130
134
|
if isinstance(tc, OcrCell):
|
131
135
|
color = "magenta"
|
132
136
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
133
|
-
|
137
|
+
|
138
|
+
if show:
|
139
|
+
image.show()
|
140
|
+
else:
|
141
|
+
out_path: Path = (
|
142
|
+
Path(settings.debug.debug_output_path)
|
143
|
+
/ f"debug_{conv_res.input.file.stem}"
|
144
|
+
)
|
145
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
146
|
+
|
147
|
+
out_file = out_path / f"ocr_page_{page.page_no:05}.png"
|
148
|
+
image.save(str(out_file), format="png")
|
134
149
|
|
135
150
|
@abstractmethod
|
136
|
-
def __call__(
|
151
|
+
def __call__(
|
152
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
153
|
+
) -> Iterable[Page]:
|
137
154
|
pass
|
docling/models/ds_glm_model.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import copy
|
2
2
|
import random
|
3
|
+
from pathlib import Path
|
3
4
|
from typing import List, Union
|
4
5
|
|
5
6
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
|
|
27
28
|
|
28
29
|
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
29
30
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
31
|
+
from docling.datamodel.settings import settings
|
32
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
30
33
|
from docling.utils.utils import create_hash
|
31
34
|
|
32
35
|
|
@@ -226,23 +229,24 @@ class GlmModel:
|
|
226
229
|
return ds_doc
|
227
230
|
|
228
231
|
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
229
|
-
|
230
|
-
|
232
|
+
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
233
|
+
ds_doc = self._to_legacy_document(conv_res)
|
234
|
+
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
231
235
|
|
232
|
-
|
236
|
+
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
233
237
|
|
234
|
-
|
238
|
+
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
235
239
|
|
236
240
|
# DEBUG code:
|
237
|
-
def draw_clusters_and_cells(ds_document, page_no):
|
241
|
+
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
238
242
|
clusters_to_draw = []
|
239
243
|
image = copy.deepcopy(conv_res.pages[page_no].image)
|
240
244
|
for ix, elem in enumerate(ds_document.main_text):
|
241
245
|
if isinstance(elem, BaseText):
|
242
|
-
prov = elem.prov[0]
|
246
|
+
prov = elem.prov[0] # type: ignore
|
243
247
|
elif isinstance(elem, Ref):
|
244
248
|
_, arr, index = elem.ref.split("/")
|
245
|
-
index = int(index)
|
249
|
+
index = int(index) # type: ignore
|
246
250
|
if arr == "tables":
|
247
251
|
prov = ds_document.tables[index].prov[0]
|
248
252
|
elif arr == "figures":
|
@@ -256,7 +260,7 @@ class GlmModel:
|
|
256
260
|
id=ix,
|
257
261
|
label=elem.name,
|
258
262
|
bbox=BoundingBox.from_tuple(
|
259
|
-
coord=prov.bbox,
|
263
|
+
coord=prov.bbox, # type: ignore
|
260
264
|
origin=CoordOrigin.BOTTOMLEFT,
|
261
265
|
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
262
266
|
)
|
@@ -276,9 +280,21 @@ class GlmModel:
|
|
276
280
|
for tc in c.cells: # [:1]:
|
277
281
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
278
282
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
279
|
-
image.show()
|
280
283
|
|
281
|
-
|
282
|
-
|
284
|
+
if show:
|
285
|
+
image.show()
|
286
|
+
else:
|
287
|
+
out_path: Path = (
|
288
|
+
Path(settings.debug.debug_output_path)
|
289
|
+
/ f"debug_{conv_res.input.file.stem}"
|
290
|
+
)
|
291
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
292
|
+
|
293
|
+
out_file = out_path / f"doc_page_{page_no:05}.png"
|
294
|
+
image.save(str(out_file), format="png")
|
295
|
+
|
296
|
+
# for item in ds_doc.page_dimensions:
|
297
|
+
# page_no = item.page
|
298
|
+
# draw_clusters_and_cells(ds_doc, page_no)
|
283
299
|
|
284
300
|
return docling_doc
|