docling 2.2.1__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {docling-2.2.1 → docling-2.3.0}/PKG-INFO +5 -4
  2. {docling-2.2.1 → docling-2.3.0}/README.md +3 -2
  3. {docling-2.2.1 → docling-2.3.0}/docling/backend/asciidoc_backend.py +0 -4
  4. {docling-2.2.1 → docling-2.3.0}/docling/backend/html_backend.py +25 -25
  5. {docling-2.2.1 → docling-2.3.0}/docling/datamodel/base_models.py +1 -1
  6. {docling-2.2.1 → docling-2.3.0}/docling/datamodel/document.py +3 -1
  7. {docling-2.2.1 → docling-2.3.0}/docling/datamodel/settings.py +15 -1
  8. {docling-2.2.1 → docling-2.3.0}/docling/document_converter.py +12 -8
  9. {docling-2.2.1 → docling-2.3.0}/docling/models/base_model.py +4 -1
  10. {docling-2.2.1 → docling-2.3.0}/docling/models/base_ocr_model.py +21 -4
  11. {docling-2.2.1 → docling-2.3.0}/docling/models/ds_glm_model.py +27 -11
  12. docling-2.3.0/docling/models/easyocr_model.py +100 -0
  13. {docling-2.2.1 → docling-2.3.0}/docling/models/layout_model.py +87 -61
  14. docling-2.3.0/docling/models/page_assemble_model.py +174 -0
  15. {docling-2.2.1 → docling-2.3.0}/docling/models/page_preprocessing_model.py +25 -7
  16. docling-2.3.0/docling/models/table_structure_model.py +206 -0
  17. {docling-2.2.1 → docling-2.3.0}/docling/models/tesseract_ocr_cli_model.py +62 -52
  18. docling-2.3.0/docling/models/tesseract_ocr_model.py +142 -0
  19. docling-2.3.0/docling/pipeline/base_pipeline.py +189 -0
  20. {docling-2.2.1 → docling-2.3.0}/docling/pipeline/simple_pipeline.py +8 -11
  21. {docling-2.2.1 → docling-2.3.0}/docling/pipeline/standard_pdf_pipeline.py +59 -56
  22. docling-2.3.0/docling/utils/profiling.py +62 -0
  23. {docling-2.2.1 → docling-2.3.0}/pyproject.toml +2 -2
  24. docling-2.2.1/docling/models/easyocr_model.py +0 -90
  25. docling-2.2.1/docling/models/page_assemble_model.py +0 -172
  26. docling-2.2.1/docling/models/table_structure_model.py +0 -171
  27. docling-2.2.1/docling/models/tesseract_ocr_model.py +0 -130
  28. docling-2.2.1/docling/pipeline/base_pipeline.py +0 -190
  29. {docling-2.2.1 → docling-2.3.0}/LICENSE +0 -0
  30. {docling-2.2.1 → docling-2.3.0}/docling/__init__.py +0 -0
  31. {docling-2.2.1 → docling-2.3.0}/docling/backend/__init__.py +0 -0
  32. {docling-2.2.1 → docling-2.3.0}/docling/backend/abstract_backend.py +0 -0
  33. {docling-2.2.1 → docling-2.3.0}/docling/backend/docling_parse_backend.py +0 -0
  34. {docling-2.2.1 → docling-2.3.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  35. {docling-2.2.1 → docling-2.3.0}/docling/backend/md_backend.py +0 -0
  36. {docling-2.2.1 → docling-2.3.0}/docling/backend/mspowerpoint_backend.py +0 -0
  37. {docling-2.2.1 → docling-2.3.0}/docling/backend/msword_backend.py +0 -0
  38. {docling-2.2.1 → docling-2.3.0}/docling/backend/pdf_backend.py +0 -0
  39. {docling-2.2.1 → docling-2.3.0}/docling/backend/pypdfium2_backend.py +0 -0
  40. {docling-2.2.1 → docling-2.3.0}/docling/cli/__init__.py +0 -0
  41. {docling-2.2.1 → docling-2.3.0}/docling/cli/main.py +0 -0
  42. {docling-2.2.1 → docling-2.3.0}/docling/datamodel/__init__.py +0 -0
  43. {docling-2.2.1 → docling-2.3.0}/docling/datamodel/pipeline_options.py +0 -0
  44. {docling-2.2.1 → docling-2.3.0}/docling/models/__init__.py +0 -0
  45. {docling-2.2.1 → docling-2.3.0}/docling/pipeline/__init__.py +0 -0
  46. {docling-2.2.1 → docling-2.3.0}/docling/utils/__init__.py +0 -0
  47. {docling-2.2.1 → docling-2.3.0}/docling/utils/export.py +0 -0
  48. {docling-2.2.1 → docling-2.3.0}/docling/utils/layout_utils.py +0 -0
  49. {docling-2.2.1 → docling-2.3.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.2.1
3
+ Version: 2.3.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,7 +23,7 @@ Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
- Requires-Dist: docling-core (>=2.2.1,<3.0.0)
26
+ Requires-Dist: docling-core (>=2.2.3,<3.0.0)
27
27
  Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
28
  Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -73,8 +73,9 @@ Docling parses documents and exports them to the desired format with ease and sp
73
73
 
74
74
  ## Features
75
75
 
76
- * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
77
- * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
76
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
77
+ * 📑 Advanced PDF document understanding including page layout, reading order & table structures
78
+ * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
78
79
  * 📝 Metadata extraction, including title, authors, references & language
79
80
  * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
80
81
  * 🔍 OCR support for scanned PDFs
@@ -22,8 +22,9 @@ Docling parses documents and exports them to the desired format with ease and sp
22
22
 
23
23
  ## Features
24
24
 
25
- * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
26
- * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
25
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
26
+ * 📑 Advanced PDF document understanding including page layout, reading order & table structures
27
+ * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
27
28
  * 📝 Metadata extraction, including title, authors, references & language
28
29
  * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
29
30
  * 🔍 OCR support for scanned PDFs
@@ -1,24 +1,20 @@
1
1
  import logging
2
- import os
3
2
  import re
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Set, Union
7
6
 
8
7
  from docling_core.types.doc import (
9
- DocItem,
10
8
  DocItemLabel,
11
9
  DoclingDocument,
12
10
  DocumentOrigin,
13
11
  GroupItem,
14
12
  GroupLabel,
15
13
  ImageRef,
16
- NodeItem,
17
14
  Size,
18
15
  TableCell,
19
16
  TableData,
20
17
  )
21
- from pydantic import AnyUrl
22
18
 
23
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
24
20
  from docling.datamodel.base_models import InputFormat
@@ -179,31 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
179
179
  self.parents[self.level] = doc.add_text(
180
180
  parent=self.parents[0], label=DocItemLabel.TITLE, text=text
181
181
  )
182
-
183
- elif hlevel > self.level:
184
-
185
- # add invisible group
186
- for i in range(self.level + 1, hlevel):
187
- self.parents[i] = doc.add_group(
188
- name=f"header-{i}",
189
- label=GroupLabel.SECTION,
190
- parent=self.parents[i - 1],
191
- )
192
- self.level = hlevel
193
-
194
- elif hlevel < self.level:
195
-
196
- # remove the tail
197
- for key, val in self.parents.items():
198
- if key > hlevel:
199
- self.parents[key] = None
200
- self.level = hlevel
201
-
202
- self.parents[hlevel] = doc.add_heading(
203
- parent=self.parents[hlevel - 1],
204
- text=text,
205
- level=hlevel,
206
- )
182
+ else:
183
+ if hlevel > self.level:
184
+
185
+ # add invisible group
186
+ for i in range(self.level + 1, hlevel):
187
+ self.parents[i] = doc.add_group(
188
+ name=f"header-{i}",
189
+ label=GroupLabel.SECTION,
190
+ parent=self.parents[i - 1],
191
+ )
192
+ self.level = hlevel
193
+
194
+ elif hlevel < self.level:
195
+
196
+ # remove the tail
197
+ for key, val in self.parents.items():
198
+ if key > hlevel:
199
+ self.parents[key] = None
200
+ self.level = hlevel
201
+
202
+ self.parents[hlevel] = doc.add_heading(
203
+ parent=self.parents[hlevel - 1],
204
+ text=text,
205
+ level=hlevel,
206
+ )
207
207
 
208
208
  def handle_paragraph(self, element, idx, doc):
209
209
  """Handles paragraph tags (p)."""
@@ -1,6 +1,6 @@
1
1
  from enum import Enum, auto
2
2
  from io import BytesIO
3
- from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
4
 
5
5
  from docling_core.types.doc import (
6
6
  BoundingBox,
@@ -3,7 +3,7 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
7
7
 
8
8
  import filetype
9
9
  from docling_core.types.doc import (
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
52
52
  Page,
53
53
  )
54
54
  from docling.datamodel.settings import DocumentLimits
55
+ from docling.utils.profiling import ProfilingItem
55
56
  from docling.utils.utils import create_file_hash, create_hash
56
57
 
57
58
  if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
187
188
 
188
189
  pages: List[Page] = []
189
190
  assembled: AssembledUnit = AssembledUnit()
191
+ timings: Dict[str, ProfilingItem] = {}
190
192
 
191
193
  document: DoclingDocument = _EMPTY_DOCLING_DOC
192
194
 
@@ -1,4 +1,5 @@
1
1
  import sys
2
+ from pathlib import Path
2
3
 
3
4
  from pydantic import BaseModel
4
5
  from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
26
27
  # To force models into single core: export OMP_NUM_THREADS=1
27
28
 
28
29
 
30
+ class DebugSettings(BaseModel):
31
+ visualize_cells: bool = False
32
+ visualize_ocr: bool = False
33
+ visualize_layout: bool = False
34
+ visualize_tables: bool = False
35
+
36
+ profile_pipeline_timings: bool = False
37
+
38
+ # Path used to output debug information.
39
+ debug_output_path: str = str(Path.cwd() / "debug")
40
+
41
+
29
42
  class AppSettings(BaseSettings):
30
43
  perf: BatchConcurrencySettings
44
+ debug: DebugSettings
31
45
 
32
46
 
33
- settings = AppSettings(perf=BatchConcurrencySettings())
47
+ settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
@@ -189,24 +189,35 @@ class DocumentConverter:
189
189
  ) -> Iterator[ConversionResult]:
190
190
  assert self.format_to_options is not None
191
191
 
192
+ start_time = time.monotonic()
193
+
192
194
  for input_batch in chunkify(
193
195
  conv_input.docs(self.format_to_options),
194
196
  settings.perf.doc_batch_size, # pass format_options
195
197
  ):
196
198
  _log.info(f"Going to convert document batch...")
199
+
197
200
  # parallel processing only within input_batch
198
201
  # with ThreadPoolExecutor(
199
202
  # max_workers=settings.perf.doc_batch_concurrency
200
203
  # ) as pool:
201
204
  # yield from pool.map(self.process_document, input_batch)
202
-
203
205
  # Note: PDF backends are not thread-safe, thread pool usage was disabled.
206
+
204
207
  for item in map(
205
208
  partial(self._process_document, raises_on_error=raises_on_error),
206
209
  input_batch,
207
210
  ):
211
+ elapsed = time.monotonic() - start_time
212
+ start_time = time.monotonic()
213
+
208
214
  if item is not None:
215
+ _log.info(
216
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
217
+ )
209
218
  yield item
219
+ else:
220
+ _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
210
221
 
211
222
  def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
212
223
  assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
237
248
  assert self.allowed_formats is not None
238
249
  assert in_doc.format in self.allowed_formats
239
250
 
240
- start_doc_time = time.time()
241
-
242
251
  conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
243
252
 
244
- end_doc_time = time.time() - start_doc_time
245
- _log.info(
246
- f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
247
- )
248
-
249
253
  return conv_res
250
254
 
251
255
  def _execute_pipeline(
@@ -4,11 +4,14 @@ from typing import Any, Iterable
4
4
  from docling_core.types.doc import DoclingDocument, NodeItem
5
5
 
6
6
  from docling.datamodel.base_models import Page
7
+ from docling.datamodel.document import ConversionResult
7
8
 
8
9
 
9
10
  class BasePageModel(ABC):
10
11
  @abstractmethod
11
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
12
+ def __call__(
13
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
14
+ ) -> Iterable[Page]:
12
15
  pass
13
16
 
14
17
 
@@ -1,6 +1,7 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
+ from pathlib import Path
4
5
  from typing import Iterable, List
5
6
 
6
7
  import numpy as np
@@ -10,12 +11,15 @@ from rtree import index
10
11
  from scipy.ndimage import find_objects, label
11
12
 
12
13
  from docling.datamodel.base_models import OcrCell, Page
14
+ from docling.datamodel.document import ConversionResult
13
15
  from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.settings import settings
17
+ from docling.models.base_model import BasePageModel
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
17
21
 
18
- class BaseOcrModel:
22
+ class BaseOcrModel(BasePageModel):
19
23
  def __init__(self, enabled: bool, options: OcrOptions):
20
24
  self.enabled = enabled
21
25
  self.options = options
@@ -113,7 +117,7 @@ class BaseOcrModel:
113
117
  ]
114
118
  return filtered_ocr_cells
115
119
 
116
- def draw_ocr_rects_and_cells(self, page, ocr_rects):
120
+ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
117
121
  image = copy.deepcopy(page.image)
118
122
  draw = ImageDraw.Draw(image, "RGBA")
119
123
 
@@ -130,8 +134,21 @@ class BaseOcrModel:
130
134
  if isinstance(tc, OcrCell):
131
135
  color = "magenta"
132
136
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
133
- image.show()
137
+
138
+ if show:
139
+ image.show()
140
+ else:
141
+ out_path: Path = (
142
+ Path(settings.debug.debug_output_path)
143
+ / f"debug_{conv_res.input.file.stem}"
144
+ )
145
+ out_path.mkdir(parents=True, exist_ok=True)
146
+
147
+ out_file = out_path / f"ocr_page_{page.page_no:05}.png"
148
+ image.save(str(out_file), format="png")
134
149
 
135
150
  @abstractmethod
136
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
151
+ def __call__(
152
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
153
+ ) -> Iterable[Page]:
137
154
  pass
@@ -1,5 +1,6 @@
1
1
  import copy
2
2
  import random
3
+ from pathlib import Path
3
4
  from typing import List, Union
4
5
 
5
6
  from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
27
28
 
28
29
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
29
30
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
+ from docling.datamodel.settings import settings
32
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
30
33
  from docling.utils.utils import create_hash
31
34
 
32
35
 
@@ -226,23 +229,24 @@ class GlmModel:
226
229
  return ds_doc
227
230
 
228
231
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
229
- ds_doc = self._to_legacy_document(conv_res)
230
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
232
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
233
+ ds_doc = self._to_legacy_document(conv_res)
234
+ ds_doc_dict = ds_doc.model_dump(by_alias=True)
231
235
 
232
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
236
+ glm_doc = self.model.apply_on_doc(ds_doc_dict)
233
237
 
234
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
238
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
235
239
 
236
240
  # DEBUG code:
237
- def draw_clusters_and_cells(ds_document, page_no):
241
+ def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
238
242
  clusters_to_draw = []
239
243
  image = copy.deepcopy(conv_res.pages[page_no].image)
240
244
  for ix, elem in enumerate(ds_document.main_text):
241
245
  if isinstance(elem, BaseText):
242
- prov = elem.prov[0]
246
+ prov = elem.prov[0] # type: ignore
243
247
  elif isinstance(elem, Ref):
244
248
  _, arr, index = elem.ref.split("/")
245
- index = int(index)
249
+ index = int(index) # type: ignore
246
250
  if arr == "tables":
247
251
  prov = ds_document.tables[index].prov[0]
248
252
  elif arr == "figures":
@@ -256,7 +260,7 @@ class GlmModel:
256
260
  id=ix,
257
261
  label=elem.name,
258
262
  bbox=BoundingBox.from_tuple(
259
- coord=prov.bbox,
263
+ coord=prov.bbox, # type: ignore
260
264
  origin=CoordOrigin.BOTTOMLEFT,
261
265
  ).to_top_left_origin(conv_res.pages[page_no].size.height),
262
266
  )
@@ -276,9 +280,21 @@ class GlmModel:
276
280
  for tc in c.cells: # [:1]:
277
281
  x0, y0, x1, y1 = tc.bbox.as_tuple()
278
282
  draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
279
- image.show()
280
283
 
281
- # draw_clusters_and_cells(ds_doc, 0)
282
- # draw_clusters_and_cells(exported_doc, 0)
284
+ if show:
285
+ image.show()
286
+ else:
287
+ out_path: Path = (
288
+ Path(settings.debug.debug_output_path)
289
+ / f"debug_{conv_res.input.file.stem}"
290
+ )
291
+ out_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ out_file = out_path / f"doc_page_{page_no:05}.png"
294
+ image.save(str(out_file), format="png")
295
+
296
+ # for item in ds_doc.page_dimensions:
297
+ # page_no = item.page
298
+ # draw_clusters_and_cells(ds_doc, page_no)
283
299
 
284
300
  return docling_doc
@@ -0,0 +1,100 @@
1
+ import logging
2
+ from typing import Iterable
3
+
4
+ import numpy
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
6
+
7
+ from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import EasyOcrOptions
10
+ from docling.datamodel.settings import settings
11
+ from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
+ class EasyOcrModel(BaseOcrModel):
18
+ def __init__(self, enabled: bool, options: EasyOcrOptions):
19
+ super().__init__(enabled=enabled, options=options)
20
+ self.options: EasyOcrOptions
21
+
22
+ self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
+
24
+ if self.enabled:
25
+ try:
26
+ import easyocr
27
+ except ImportError:
28
+ raise ImportError(
29
+ "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
30
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
31
+ )
32
+
33
+ self.reader = easyocr.Reader(
34
+ lang_list=self.options.lang,
35
+ model_storage_directory=self.options.model_storage_directory,
36
+ download_enabled=self.options.download_enabled,
37
+ )
38
+
39
+ def __call__(
40
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
41
+ ) -> Iterable[Page]:
42
+
43
+ if not self.enabled:
44
+ yield from page_batch
45
+ return
46
+
47
+ for page in page_batch:
48
+
49
+ assert page._backend is not None
50
+ if not page._backend.is_valid():
51
+ yield page
52
+ else:
53
+ with TimeRecorder(conv_res, "ocr"):
54
+ ocr_rects = self.get_ocr_rects(page)
55
+
56
+ all_ocr_cells = []
57
+ for ocr_rect in ocr_rects:
58
+ # Skip zero area boxes
59
+ if ocr_rect.area() == 0:
60
+ continue
61
+ high_res_image = page._backend.get_page_image(
62
+ scale=self.scale, cropbox=ocr_rect
63
+ )
64
+ im = numpy.array(high_res_image)
65
+ result = self.reader.readtext(im)
66
+
67
+ del high_res_image
68
+ del im
69
+
70
+ cells = [
71
+ OcrCell(
72
+ id=ix,
73
+ text=line[1],
74
+ confidence=line[2],
75
+ bbox=BoundingBox.from_tuple(
76
+ coord=(
77
+ (line[0][0][0] / self.scale) + ocr_rect.l,
78
+ (line[0][0][1] / self.scale) + ocr_rect.t,
79
+ (line[0][2][0] / self.scale) + ocr_rect.l,
80
+ (line[0][2][1] / self.scale) + ocr_rect.t,
81
+ ),
82
+ origin=CoordOrigin.TOPLEFT,
83
+ ),
84
+ )
85
+ for ix, line in enumerate(result)
86
+ ]
87
+ all_ocr_cells.extend(cells)
88
+
89
+ ## Remove OCR cells which overlap with programmatic cells.
90
+ filtered_ocr_cells = self.filter_ocr_cells(
91
+ all_ocr_cells, page.cells
92
+ )
93
+
94
+ page.cells.extend(filtered_ocr_cells)
95
+
96
+ # DEBUG code:
97
+ if settings.debug.visualize_ocr:
98
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
99
+
100
+ yield page
@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
16
16
  LayoutPrediction,
17
17
  Page,
18
18
  )
19
+ from docling.datamodel.document import ConversionResult
20
+ from docling.datamodel.settings import settings
19
21
  from docling.models.base_model import BasePageModel
20
22
  from docling.utils import layout_utils as lu
23
+ from docling.utils.profiling import TimeRecorder
21
24
 
22
25
  _log = logging.getLogger(__name__)
23
26
 
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
271
274
 
272
275
  return clusters_out_new, cells_out_new
273
276
 
274
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
277
+ def __call__(
278
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
279
+ ) -> Iterable[Page]:
280
+
275
281
  for page in page_batch:
276
282
  assert page._backend is not None
277
283
  if not page._backend.is_valid():
278
284
  yield page
279
285
  else:
280
- assert page.size is not None
281
-
282
- clusters = []
283
- for ix, pred_item in enumerate(
284
- self.layout_predictor.predict(page.get_image(scale=1.0))
285
- ):
286
- label = DocItemLabel(
287
- pred_item["label"].lower().replace(" ", "_").replace("-", "_")
288
- ) # Temporary, until docling-ibm-model uses docling-core types
289
- cluster = Cluster(
290
- id=ix,
291
- label=label,
292
- confidence=pred_item["confidence"],
293
- bbox=BoundingBox.model_validate(pred_item),
294
- cells=[],
295
- )
296
- clusters.append(cluster)
297
-
298
- # Map cells to clusters
299
- # TODO: Remove, postprocess should take care of it anyway.
300
- for cell in page.cells:
301
- for cluster in clusters:
302
- if not cell.bbox.area() > 0:
303
- overlap_frac = 0.0
304
- else:
305
- overlap_frac = (
306
- cell.bbox.intersection_area_with(cluster.bbox)
307
- / cell.bbox.area()
308
- )
309
-
310
- if overlap_frac > 0.5:
311
- cluster.cells.append(cell)
312
-
313
- # Pre-sort clusters
314
- # clusters = self.sort_clusters_by_cell_order(clusters)
315
-
316
- # DEBUG code:
317
- def draw_clusters_and_cells():
318
- image = copy.deepcopy(page.image)
319
- draw = ImageDraw.Draw(image)
320
- for c in clusters:
321
- x0, y0, x1, y1 = c.bbox.as_tuple()
322
- draw.rectangle([(x0, y0), (x1, y1)], outline="green")
323
-
324
- cell_color = (
325
- random.randint(30, 140),
326
- random.randint(30, 140),
327
- random.randint(30, 140),
286
+ with TimeRecorder(conv_res, "layout"):
287
+ assert page.size is not None
288
+
289
+ clusters = []
290
+ for ix, pred_item in enumerate(
291
+ self.layout_predictor.predict(page.get_image(scale=1.0))
292
+ ):
293
+ label = DocItemLabel(
294
+ pred_item["label"]
295
+ .lower()
296
+ .replace(" ", "_")
297
+ .replace("-", "_")
298
+ ) # Temporary, until docling-ibm-model uses docling-core types
299
+ cluster = Cluster(
300
+ id=ix,
301
+ label=label,
302
+ confidence=pred_item["confidence"],
303
+ bbox=BoundingBox.model_validate(pred_item),
304
+ cells=[],
328
305
  )
329
- for tc in c.cells: # [:1]:
330
- x0, y0, x1, y1 = tc.bbox.as_tuple()
331
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
332
- image.show()
333
-
334
- # draw_clusters_and_cells()
335
-
336
- clusters, page.cells = self.postprocess(
337
- clusters, page.cells, page.size.height
338
- )
306
+ clusters.append(cluster)
307
+
308
+ # Map cells to clusters
309
+ # TODO: Remove, postprocess should take care of it anyway.
310
+ for cell in page.cells:
311
+ for cluster in clusters:
312
+ if not cell.bbox.area() > 0:
313
+ overlap_frac = 0.0
314
+ else:
315
+ overlap_frac = (
316
+ cell.bbox.intersection_area_with(cluster.bbox)
317
+ / cell.bbox.area()
318
+ )
319
+
320
+ if overlap_frac > 0.5:
321
+ cluster.cells.append(cell)
322
+
323
+ # Pre-sort clusters
324
+ # clusters = self.sort_clusters_by_cell_order(clusters)
325
+
326
+ # DEBUG code:
327
+ def draw_clusters_and_cells(show: bool = False):
328
+ image = copy.deepcopy(page.image)
329
+ if image is not None:
330
+ draw = ImageDraw.Draw(image)
331
+ for c in clusters:
332
+ x0, y0, x1, y1 = c.bbox.as_tuple()
333
+ draw.rectangle([(x0, y0), (x1, y1)], outline="green")
334
+
335
+ cell_color = (
336
+ random.randint(30, 140),
337
+ random.randint(30, 140),
338
+ random.randint(30, 140),
339
+ )
340
+ for tc in c.cells: # [:1]:
341
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
342
+ draw.rectangle(
343
+ [(x0, y0), (x1, y1)], outline=cell_color
344
+ )
345
+ if show:
346
+ image.show()
347
+ else:
348
+ out_path: Path = (
349
+ Path(settings.debug.debug_output_path)
350
+ / f"debug_{conv_res.input.file.stem}"
351
+ )
352
+ out_path.mkdir(parents=True, exist_ok=True)
353
+
354
+ out_file = (
355
+ out_path / f"layout_page_{page.page_no:05}.png"
356
+ )
357
+ image.save(str(out_file), format="png")
358
+
359
+ # draw_clusters_and_cells()
360
+
361
+ clusters, page.cells = self.postprocess(
362
+ clusters, page.cells, page.size.height
363
+ )
339
364
 
340
- # draw_clusters_and_cells()
365
+ page.predictions.layout = LayoutPrediction(clusters=clusters)
341
366
 
342
- page.predictions.layout = LayoutPrediction(clusters=clusters)
367
+ if settings.debug.visualize_layout:
368
+ draw_clusters_and_cells()
343
369
 
344
370
  yield page