docling 2.1.0__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. docling/backend/abstract_backend.py +1 -0
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +4 -4
  4. docling/backend/docling_parse_v2_backend.py +12 -4
  5. docling/backend/html_backend.py +61 -57
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +62 -39
  8. docling/backend/msword_backend.py +12 -25
  9. docling/backend/pypdfium2_backend.py +1 -1
  10. docling/cli/main.py +38 -8
  11. docling/datamodel/base_models.py +16 -10
  12. docling/datamodel/document.py +36 -6
  13. docling/datamodel/pipeline_options.py +3 -3
  14. docling/datamodel/settings.py +15 -1
  15. docling/document_converter.py +38 -12
  16. docling/models/base_model.py +4 -1
  17. docling/models/base_ocr_model.py +21 -4
  18. docling/models/ds_glm_model.py +27 -11
  19. docling/models/easyocr_model.py +49 -39
  20. docling/models/layout_model.py +87 -61
  21. docling/models/page_assemble_model.py +102 -100
  22. docling/models/page_preprocessing_model.py +25 -7
  23. docling/models/table_structure_model.py +125 -90
  24. docling/models/tesseract_ocr_cli_model.py +62 -52
  25. docling/models/tesseract_ocr_model.py +76 -52
  26. docling/pipeline/base_pipeline.py +68 -69
  27. docling/pipeline/simple_pipeline.py +8 -11
  28. docling/pipeline/standard_pdf_pipeline.py +59 -56
  29. docling/utils/profiling.py +62 -0
  30. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/METADATA +27 -22
  31. docling-2.4.1.dist-info/RECORD +45 -0
  32. docling-2.1.0.dist-info/RECORD +0 -42
  33. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  34. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  35. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/cli/main.py CHANGED
@@ -5,12 +5,15 @@ import time
5
5
  import warnings
6
6
  from enum import Enum
7
7
  from pathlib import Path
8
- from typing import Annotated, Dict, Iterable, List, Optional
8
+ from typing import Annotated, Dict, Iterable, List, Optional, Type
9
9
 
10
10
  import typer
11
11
  from docling_core.utils.file import resolve_file_source
12
12
 
13
13
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
+ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
15
+ from docling.backend.pdf_backend import PdfDocumentBackend
16
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
14
17
  from docling.datamodel.base_models import (
15
18
  ConversionStatus,
16
19
  FormatToExtensions,
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
22
25
  EasyOcrOptions,
23
26
  OcrOptions,
24
27
  PdfPipelineOptions,
28
+ TableFormerMode,
25
29
  TesseractCliOcrOptions,
26
30
  TesseractOcrOptions,
27
31
  )
@@ -58,9 +62,10 @@ def version_callback(value: bool):
58
62
 
59
63
 
60
64
  # Define an enum for the backend options
61
- class Backend(str, Enum):
65
+ class PdfBackend(str, Enum):
62
66
  PYPDFIUM2 = "pypdfium2"
63
- DOCLING = "docling"
67
+ DLPARSE_V1 = "dlparse_v1"
68
+ DLPARSE_V2 = "dlparse_v2"
64
69
 
65
70
 
66
71
  # Define an enum for the ocr engines
@@ -90,28 +95,28 @@ def export_documents(
90
95
  # Export Deep Search document JSON format:
91
96
  if export_json:
92
97
  fname = output_dir / f"{doc_filename}.json"
93
- with fname.open("w") as fp:
98
+ with fname.open("w", encoding="utf8") as fp:
94
99
  _log.info(f"writing JSON output to {fname}")
95
100
  fp.write(json.dumps(conv_res.document.export_to_dict()))
96
101
 
97
102
  # Export Text format:
98
103
  if export_txt:
99
104
  fname = output_dir / f"{doc_filename}.txt"
100
- with fname.open("w") as fp:
105
+ with fname.open("w", encoding="utf8") as fp:
101
106
  _log.info(f"writing Text output to {fname}")
102
107
  fp.write(conv_res.document.export_to_markdown(strict_text=True))
103
108
 
104
109
  # Export Markdown format:
105
110
  if export_md:
106
111
  fname = output_dir / f"{doc_filename}.md"
107
- with fname.open("w") as fp:
112
+ with fname.open("w", encoding="utf8") as fp:
108
113
  _log.info(f"writing Markdown output to {fname}")
109
114
  fp.write(conv_res.document.export_to_markdown())
110
115
 
111
116
  # Export Document Tags format:
112
117
  if export_doctags:
113
118
  fname = output_dir / f"{doc_filename}.doctags"
114
- with fname.open("w") as fp:
119
+ with fname.open("w", encoding="utf8") as fp:
115
120
  _log.info(f"writing Doc Tags output to {fname}")
116
121
  fp.write(conv_res.document.export_to_document_tokens())
117
122
 
@@ -151,6 +156,17 @@ def convert(
151
156
  ocr_engine: Annotated[
152
157
  OcrEngine, typer.Option(..., help="The OCR engine to use.")
153
158
  ] = OcrEngine.EASYOCR,
159
+ pdf_backend: Annotated[
160
+ PdfBackend, typer.Option(..., help="The PDF backend to use.")
161
+ ] = PdfBackend.DLPARSE_V1,
162
+ table_mode: Annotated[
163
+ TableFormerMode,
164
+ typer.Option(..., help="The mode to use in the table structure model."),
165
+ ] = TableFormerMode.FAST,
166
+ artifacts_path: Annotated[
167
+ Optional[Path],
168
+ typer.Option(..., help="If provided, the location of the model artifacts."),
169
+ ] = None,
154
170
  abort_on_error: Annotated[
155
171
  bool,
156
172
  typer.Option(
@@ -217,11 +233,25 @@ def convert(
217
233
  do_table_structure=True,
218
234
  )
219
235
  pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
236
+ pipeline_options.table_structure_options.mode = table_mode
237
+
238
+ if artifacts_path is not None:
239
+ pipeline_options.artifacts_path = artifacts_path
240
+
241
+ match pdf_backend:
242
+ case PdfBackend.DLPARSE_V1:
243
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
244
+ case PdfBackend.DLPARSE_V2:
245
+ backend = DoclingParseV2DocumentBackend
246
+ case PdfBackend.PYPDFIUM2:
247
+ backend = PyPdfiumDocumentBackend
248
+ case _:
249
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
220
250
 
221
251
  format_options: Dict[InputFormat, FormatOption] = {
222
252
  InputFormat.PDF: PdfFormatOption(
223
253
  pipeline_options=pipeline_options,
224
- backend=DoclingParseDocumentBackend, # pdf_backend
254
+ backend=backend, # pdf_backend
225
255
  )
226
256
  }
227
257
  doc_converter = DocumentConverter(
docling/datamodel/base_models.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from enum import Enum, auto
2
2
  from io import BytesIO
3
- from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
4
 
5
5
  from docling_core.types.doc import (
6
6
  BoundingBox,
@@ -30,6 +30,8 @@ class InputFormat(str, Enum):
30
30
  HTML = "html"
31
31
  IMAGE = "image"
32
32
  PDF = "pdf"
33
+ ASCIIDOC = "asciidoc"
34
+ MD = "md"
33
35
 
34
36
 
35
37
  class OutputFormat(str, Enum):
@@ -43,29 +45,33 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
43
45
  InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
44
46
  InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
45
47
  InputFormat.PDF: ["pdf"],
48
+ InputFormat.MD: ["md"],
46
49
  InputFormat.HTML: ["html", "htm", "xhtml"],
47
50
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
51
+ InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
48
52
  }
49
53
 
50
- FormatToMimeType: Dict[InputFormat, Set[str]] = {
51
- InputFormat.DOCX: {
54
+ FormatToMimeType: Dict[InputFormat, List[str]] = {
55
+ InputFormat.DOCX: [
52
56
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
53
57
  "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
54
- },
55
- InputFormat.PPTX: {
58
+ ],
59
+ InputFormat.PPTX: [
56
60
  "application/vnd.openxmlformats-officedocument.presentationml.template",
57
61
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
58
62
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
59
- },
60
- InputFormat.HTML: {"text/html", "application/xhtml+xml"},
61
- InputFormat.IMAGE: {
63
+ ],
64
+ InputFormat.HTML: ["text/html", "application/xhtml+xml"],
65
+ InputFormat.IMAGE: [
62
66
  "image/png",
63
67
  "image/jpeg",
64
68
  "image/tiff",
65
69
  "image/gif",
66
70
  "image/bmp",
67
- },
68
- InputFormat.PDF: {"application/pdf"},
71
+ ],
72
+ InputFormat.PDF: ["application/pdf"],
73
+ InputFormat.ASCIIDOC: ["text/asciidoc"],
74
+ InputFormat.MD: ["text/markdown", "text/x-markdown"],
69
75
  }
70
76
  MimeTypeToFormat = {
71
77
  mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
docling/datamodel/document.py CHANGED
@@ -3,7 +3,7 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
7
7
 
8
8
  import filetype
9
9
  from docling_core.types.doc import (
@@ -45,11 +45,14 @@ from docling.datamodel.base_models import (
45
45
  ConversionStatus,
46
46
  DocumentStream,
47
47
  ErrorItem,
48
+ FormatToExtensions,
49
+ FormatToMimeType,
48
50
  InputFormat,
49
51
  MimeTypeToFormat,
50
52
  Page,
51
53
  )
52
54
  from docling.datamodel.settings import DocumentLimits
55
+ from docling.utils.profiling import ProfilingItem
53
56
  from docling.utils.utils import create_file_hash, create_hash
54
57
 
55
58
  if TYPE_CHECKING:
@@ -143,11 +146,13 @@ class InputDocument(BaseModel):
143
146
  self.valid = False
144
147
 
145
148
  except (FileNotFoundError, OSError) as e:
149
+ self.valid = False
146
150
  _log.exception(
147
151
  f"File {self.file.name} not found or cannot be opened.", exc_info=e
148
152
  )
149
153
  # raise
150
154
  except RuntimeError as e:
155
+ self.valid = False
151
156
  _log.exception(
152
157
  f"An unexpected error occurred while opening the document {self.file.name}",
153
158
  exc_info=e,
@@ -166,6 +171,8 @@ class InputDocument(BaseModel):
166
171
  )
167
172
 
168
173
  self._backend = backend(self, path_or_stream=path_or_stream)
174
+ if not self._backend.is_valid():
175
+ self.valid = False
169
176
 
170
177
 
171
178
  class DocumentFormat(str, Enum):
@@ -181,6 +188,7 @@ class ConversionResult(BaseModel):
181
188
 
182
189
  pages: List[Page] = []
183
190
  assembled: AssembledUnit = AssembledUnit()
191
+ timings: Dict[str, ProfilingItem] = {}
184
192
 
185
193
  document: DoclingDocument = _EMPTY_DOCLING_DOC
186
194
 
@@ -480,26 +488,48 @@ class _DocumentConversionInput(BaseModel):
480
488
  else:
481
489
  raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
482
490
 
483
- def _guess_format(self, obj):
484
- content = None
491
+ def _guess_format(self, obj: Union[Path, DocumentStream]):
492
+ content = b"" # empty binary blob
493
+ format = None
494
+
485
495
  if isinstance(obj, Path):
486
496
  mime = filetype.guess_mime(str(obj))
487
497
  if mime is None:
498
+ ext = obj.suffix[1:]
499
+ mime = self._mime_from_extension(ext)
500
+ if mime is None: # must guess from
488
501
  with obj.open("rb") as f:
489
502
  content = f.read(1024) # Read first 1KB
490
503
 
491
504
  elif isinstance(obj, DocumentStream):
492
- obj.stream.seek(0)
493
505
  content = obj.stream.read(8192)
494
506
  obj.stream.seek(0)
495
507
  mime = filetype.guess_mime(content)
508
+ if mime is None:
509
+ ext = (
510
+ obj.name.rsplit(".", 1)[-1]
511
+ if ("." in obj.name and not obj.name.startswith("."))
512
+ else ""
513
+ )
514
+ mime = self._mime_from_extension(ext)
496
515
 
497
- if mime is None:
498
- mime = self._detect_html_xhtml(content)
516
+ mime = mime or self._detect_html_xhtml(content)
517
+ mime = mime or "text/plain"
499
518
 
500
519
  format = MimeTypeToFormat.get(mime)
501
520
  return format
502
521
 
522
+ def _mime_from_extension(self, ext):
523
+ mime = None
524
+ if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
525
+ mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
526
+ elif ext in FormatToExtensions[InputFormat.HTML]:
527
+ mime = FormatToMimeType[InputFormat.HTML][0]
528
+ elif ext in FormatToExtensions[InputFormat.MD]:
529
+ mime = FormatToMimeType[InputFormat.MD][0]
530
+
531
+ return mime
532
+
503
533
  def _detect_html_xhtml(self, content):
504
534
  content_str = content.decode("ascii", errors="ignore").lower()
505
535
  # Remove XML comments
docling/datamodel/pipeline_options.py CHANGED
@@ -1,4 +1,4 @@
1
- from enum import Enum, auto
1
+ from enum import Enum
2
2
  from pathlib import Path
3
3
  from typing import List, Literal, Optional, Union
4
4
 
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
6
6
 
7
7
 
8
8
  class TableFormerMode(str, Enum):
9
- FAST = auto()
10
- ACCURATE = auto()
9
+ FAST = "fast"
10
+ ACCURATE = "accurate"
11
11
 
12
12
 
13
13
  class TableStructureOptions(BaseModel):
docling/datamodel/settings.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import sys
2
+ from pathlib import Path
2
3
 
3
4
  from pydantic import BaseModel
4
5
  from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
26
27
  # To force models into single core: export OMP_NUM_THREADS=1
27
28
 
28
29
 
30
+ class DebugSettings(BaseModel):
31
+ visualize_cells: bool = False
32
+ visualize_ocr: bool = False
33
+ visualize_layout: bool = False
34
+ visualize_tables: bool = False
35
+
36
+ profile_pipeline_timings: bool = False
37
+
38
+ # Path used to output debug information.
39
+ debug_output_path: str = str(Path.cwd() / "debug")
40
+
41
+
29
42
  class AppSettings(BaseSettings):
30
43
  perf: BatchConcurrencySettings
44
+ debug: DebugSettings
31
45
 
32
46
 
33
- settings = AppSettings(perf=BatchConcurrencySettings())
47
+ settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
docling/document_converter.py CHANGED
@@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
8
8
  from pydantic import BaseModel, ConfigDict, model_validator, validate_call
9
9
 
10
10
  from docling.backend.abstract_backend import AbstractDocumentBackend
11
+ from docling.backend.asciidoc_backend import AsciiDocBackend
11
12
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
12
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
+ from docling.backend.md_backend import MarkdownDocumentBackend
13
15
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
14
16
  from docling.backend.msword_backend import MsWordDocumentBackend
15
17
  from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
52
54
  backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
53
55
 
54
56
 
57
+ class MarkdownFormatOption(FormatOption):
58
+ pipeline_cls: Type = SimplePipeline
59
+ backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
60
+
61
+
62
+ class AsciiDocFormatOption(FormatOption):
63
+ pipeline_cls: Type = SimplePipeline
64
+ backend: Type[AbstractDocumentBackend] = AsciiDocBackend
65
+
66
+
55
67
  class HTMLFormatOption(FormatOption):
56
68
  pipeline_cls: Type = SimplePipeline
57
69
  backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
@@ -74,6 +86,12 @@ _format_to_default_options = {
74
86
  InputFormat.PPTX: FormatOption(
75
87
  pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
76
88
  ),
89
+ InputFormat.MD: FormatOption(
90
+ pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
91
+ ),
92
+ InputFormat.ASCIIDOC: FormatOption(
93
+ pipeline_cls=SimplePipeline, backend=AsciiDocBackend
94
+ ),
77
95
  InputFormat.HTML: FormatOption(
78
96
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
79
97
  ),
@@ -121,6 +139,10 @@ class DocumentConverter:
121
139
 
122
140
  self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
123
141
 
142
+ def initialize_pipeline(self, format: InputFormat):
143
+ """Initialize the conversion pipeline for the selected format."""
144
+ self._get_pipeline(doc_format=format)
145
+
124
146
  @validate_call(config=ConfigDict(strict=True))
125
147
  def convert(
126
148
  self,
@@ -171,32 +193,43 @@ class DocumentConverter:
171
193
  ) -> Iterator[ConversionResult]:
172
194
  assert self.format_to_options is not None
173
195
 
196
+ start_time = time.monotonic()
197
+
174
198
  for input_batch in chunkify(
175
199
  conv_input.docs(self.format_to_options),
176
200
  settings.perf.doc_batch_size, # pass format_options
177
201
  ):
178
202
  _log.info(f"Going to convert document batch...")
203
+
179
204
  # parallel processing only within input_batch
180
205
  # with ThreadPoolExecutor(
181
206
  # max_workers=settings.perf.doc_batch_concurrency
182
207
  # ) as pool:
183
208
  # yield from pool.map(self.process_document, input_batch)
184
-
185
209
  # Note: PDF backends are not thread-safe, thread pool usage was disabled.
210
+
186
211
  for item in map(
187
212
  partial(self._process_document, raises_on_error=raises_on_error),
188
213
  input_batch,
189
214
  ):
215
+ elapsed = time.monotonic() - start_time
216
+ start_time = time.monotonic()
217
+
190
218
  if item is not None:
219
+ _log.info(
220
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
221
+ )
191
222
  yield item
223
+ else:
224
+ _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
192
225
 
193
- def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
226
+ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
194
227
  assert self.format_to_options is not None
195
228
 
196
- fopt = self.format_to_options.get(doc.format)
229
+ fopt = self.format_to_options.get(doc_format)
197
230
 
198
231
  if fopt is None:
199
- raise RuntimeError(f"Could not get pipeline for document {doc.file}")
232
+ raise RuntimeError(f"Could not get pipeline for {doc_format}")
200
233
  else:
201
234
  pipeline_class = fopt.pipeline_cls
202
235
  pipeline_options = fopt.pipeline_options
@@ -219,22 +252,15 @@ class DocumentConverter:
219
252
  assert self.allowed_formats is not None
220
253
  assert in_doc.format in self.allowed_formats
221
254
 
222
- start_doc_time = time.time()
223
-
224
255
  conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
225
256
 
226
- end_doc_time = time.time() - start_doc_time
227
- _log.info(
228
- f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
229
- )
230
-
231
257
  return conv_res
232
258
 
233
259
  def _execute_pipeline(
234
260
  self, in_doc: InputDocument, raises_on_error: bool
235
261
  ) -> ConversionResult:
236
262
  if in_doc.valid:
237
- pipeline = self._get_pipeline(in_doc)
263
+ pipeline = self._get_pipeline(in_doc.format)
238
264
  if pipeline is None: # Can't find a default pipeline. Should this raise?
239
265
  if raises_on_error:
240
266
  raise RuntimeError(
docling/models/base_model.py CHANGED
@@ -4,11 +4,14 @@ from typing import Any, Iterable
4
4
  from docling_core.types.doc import DoclingDocument, NodeItem
5
5
 
6
6
  from docling.datamodel.base_models import Page
7
+ from docling.datamodel.document import ConversionResult
7
8
 
8
9
 
9
10
  class BasePageModel(ABC):
10
11
  @abstractmethod
11
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
12
+ def __call__(
13
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
14
+ ) -> Iterable[Page]:
12
15
  pass
13
16
 
14
17
 
docling/models/base_ocr_model.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
+ from pathlib import Path
4
5
  from typing import Iterable, List
5
6
 
6
7
  import numpy as np
@@ -10,12 +11,15 @@ from rtree import index
10
11
  from scipy.ndimage import find_objects, label
11
12
 
12
13
  from docling.datamodel.base_models import OcrCell, Page
14
+ from docling.datamodel.document import ConversionResult
13
15
  from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.settings import settings
17
+ from docling.models.base_model import BasePageModel
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
17
21
 
18
- class BaseOcrModel:
22
+ class BaseOcrModel(BasePageModel):
19
23
  def __init__(self, enabled: bool, options: OcrOptions):
20
24
  self.enabled = enabled
21
25
  self.options = options
@@ -113,7 +117,7 @@ class BaseOcrModel:
113
117
  ]
114
118
  return filtered_ocr_cells
115
119
 
116
- def draw_ocr_rects_and_cells(self, page, ocr_rects):
120
+ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
117
121
  image = copy.deepcopy(page.image)
118
122
  draw = ImageDraw.Draw(image, "RGBA")
119
123
 
@@ -130,8 +134,21 @@ class BaseOcrModel:
130
134
  if isinstance(tc, OcrCell):
131
135
  color = "magenta"
132
136
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
133
- image.show()
137
+
138
+ if show:
139
+ image.show()
140
+ else:
141
+ out_path: Path = (
142
+ Path(settings.debug.debug_output_path)
143
+ / f"debug_{conv_res.input.file.stem}"
144
+ )
145
+ out_path.mkdir(parents=True, exist_ok=True)
146
+
147
+ out_file = out_path / f"ocr_page_{page.page_no:05}.png"
148
+ image.save(str(out_file), format="png")
134
149
 
135
150
  @abstractmethod
136
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
151
+ def __call__(
152
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
153
+ ) -> Iterable[Page]:
137
154
  pass
docling/models/ds_glm_model.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import copy
2
2
  import random
3
+ from pathlib import Path
3
4
  from typing import List, Union
4
5
 
5
6
  from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
27
28
 
28
29
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
29
30
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
+ from docling.datamodel.settings import settings
32
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
30
33
  from docling.utils.utils import create_hash
31
34
 
32
35
 
@@ -226,23 +229,24 @@ class GlmModel:
226
229
  return ds_doc
227
230
 
228
231
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
229
- ds_doc = self._to_legacy_document(conv_res)
230
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
232
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
233
+ ds_doc = self._to_legacy_document(conv_res)
234
+ ds_doc_dict = ds_doc.model_dump(by_alias=True)
231
235
 
232
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
236
+ glm_doc = self.model.apply_on_doc(ds_doc_dict)
233
237
 
234
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
238
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
235
239
 
236
240
  # DEBUG code:
237
- def draw_clusters_and_cells(ds_document, page_no):
241
+ def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
238
242
  clusters_to_draw = []
239
243
  image = copy.deepcopy(conv_res.pages[page_no].image)
240
244
  for ix, elem in enumerate(ds_document.main_text):
241
245
  if isinstance(elem, BaseText):
242
- prov = elem.prov[0]
246
+ prov = elem.prov[0] # type: ignore
243
247
  elif isinstance(elem, Ref):
244
248
  _, arr, index = elem.ref.split("/")
245
- index = int(index)
249
+ index = int(index) # type: ignore
246
250
  if arr == "tables":
247
251
  prov = ds_document.tables[index].prov[0]
248
252
  elif arr == "figures":
@@ -256,7 +260,7 @@ class GlmModel:
256
260
  id=ix,
257
261
  label=elem.name,
258
262
  bbox=BoundingBox.from_tuple(
259
- coord=prov.bbox,
263
+ coord=prov.bbox, # type: ignore
260
264
  origin=CoordOrigin.BOTTOMLEFT,
261
265
  ).to_top_left_origin(conv_res.pages[page_no].size.height),
262
266
  )
@@ -276,9 +280,21 @@ class GlmModel:
276
280
  for tc in c.cells: # [:1]:
277
281
  x0, y0, x1, y1 = tc.bbox.as_tuple()
278
282
  draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
279
- image.show()
280
283
 
281
- # draw_clusters_and_cells(ds_doc, 0)
282
- # draw_clusters_and_cells(exported_doc, 0)
284
+ if show:
285
+ image.show()
286
+ else:
287
+ out_path: Path = (
288
+ Path(settings.debug.debug_output_path)
289
+ / f"debug_{conv_res.input.file.stem}"
290
+ )
291
+ out_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ out_file = out_path / f"doc_page_{page_no:05}.png"
294
+ image.save(str(out_file), format="png")
295
+
296
+ # for item in ds_doc.page_dimensions:
297
+ # page_no = item.page
298
+ # draw_clusters_and_cells(ds_doc, page_no)
283
299
 
284
300
  return docling_doc