docling-2.1.0-py3-none-any.whl → docling-2.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. docling/backend/abstract_backend.py +1 -0
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +4 -4
  4. docling/backend/docling_parse_v2_backend.py +12 -4
  5. docling/backend/html_backend.py +61 -57
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +62 -39
  8. docling/backend/msword_backend.py +12 -25
  9. docling/backend/pypdfium2_backend.py +1 -1
  10. docling/cli/main.py +38 -8
  11. docling/datamodel/base_models.py +16 -10
  12. docling/datamodel/document.py +36 -6
  13. docling/datamodel/pipeline_options.py +3 -3
  14. docling/datamodel/settings.py +15 -1
  15. docling/document_converter.py +38 -12
  16. docling/models/base_model.py +4 -1
  17. docling/models/base_ocr_model.py +21 -4
  18. docling/models/ds_glm_model.py +27 -11
  19. docling/models/easyocr_model.py +49 -39
  20. docling/models/layout_model.py +87 -61
  21. docling/models/page_assemble_model.py +102 -100
  22. docling/models/page_preprocessing_model.py +25 -7
  23. docling/models/table_structure_model.py +125 -90
  24. docling/models/tesseract_ocr_cli_model.py +62 -52
  25. docling/models/tesseract_ocr_model.py +76 -52
  26. docling/pipeline/base_pipeline.py +68 -69
  27. docling/pipeline/simple_pipeline.py +8 -11
  28. docling/pipeline/standard_pdf_pipeline.py +59 -56
  29. docling/utils/profiling.py +62 -0
  30. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/METADATA +27 -22
  31. docling-2.4.1.dist-info/RECORD +45 -0
  32. docling-2.1.0.dist-info/RECORD +0 -42
  33. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  34. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  35. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/models/easyocr_model.py +49 -39
@@ -5,8 +5,11 @@ import numpy
5
5
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
6
 
7
7
  from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import EasyOcrOptions
10
+ from docling.datamodel.settings import settings
9
11
  from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
10
13
 
11
14
  _log = logging.getLogger(__name__)
12
15
 
@@ -33,58 +36,65 @@ class EasyOcrModel(BaseOcrModel):
33
36
  download_enabled=self.options.download_enabled,
34
37
  )
35
38
 
36
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
39
+ def __call__(
40
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
41
+ ) -> Iterable[Page]:
37
42
 
38
43
  if not self.enabled:
39
44
  yield from page_batch
40
45
  return
41
46
 
42
47
  for page in page_batch:
48
+
43
49
  assert page._backend is not None
44
50
  if not page._backend.is_valid():
45
51
  yield page
46
52
  else:
47
- ocr_rects = self.get_ocr_rects(page)
48
-
49
- all_ocr_cells = []
50
- for ocr_rect in ocr_rects:
51
- # Skip zero area boxes
52
- if ocr_rect.area() == 0:
53
- continue
54
- high_res_image = page._backend.get_page_image(
55
- scale=self.scale, cropbox=ocr_rect
56
- )
57
- im = numpy.array(high_res_image)
58
- result = self.reader.readtext(im)
59
-
60
- del high_res_image
61
- del im
62
-
63
- cells = [
64
- OcrCell(
65
- id=ix,
66
- text=line[1],
67
- confidence=line[2],
68
- bbox=BoundingBox.from_tuple(
69
- coord=(
70
- (line[0][0][0] / self.scale) + ocr_rect.l,
71
- (line[0][0][1] / self.scale) + ocr_rect.t,
72
- (line[0][2][0] / self.scale) + ocr_rect.l,
73
- (line[0][2][1] / self.scale) + ocr_rect.t,
74
- ),
75
- origin=CoordOrigin.TOPLEFT,
76
- ),
53
+ with TimeRecorder(conv_res, "ocr"):
54
+ ocr_rects = self.get_ocr_rects(page)
55
+
56
+ all_ocr_cells = []
57
+ for ocr_rect in ocr_rects:
58
+ # Skip zero area boxes
59
+ if ocr_rect.area() == 0:
60
+ continue
61
+ high_res_image = page._backend.get_page_image(
62
+ scale=self.scale, cropbox=ocr_rect
77
63
  )
78
- for ix, line in enumerate(result)
79
- ]
80
- all_ocr_cells.extend(cells)
81
-
82
- ## Remove OCR cells which overlap with programmatic cells.
83
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
64
+ im = numpy.array(high_res_image)
65
+ result = self.reader.readtext(im)
66
+
67
+ del high_res_image
68
+ del im
69
+
70
+ cells = [
71
+ OcrCell(
72
+ id=ix,
73
+ text=line[1],
74
+ confidence=line[2],
75
+ bbox=BoundingBox.from_tuple(
76
+ coord=(
77
+ (line[0][0][0] / self.scale) + ocr_rect.l,
78
+ (line[0][0][1] / self.scale) + ocr_rect.t,
79
+ (line[0][2][0] / self.scale) + ocr_rect.l,
80
+ (line[0][2][1] / self.scale) + ocr_rect.t,
81
+ ),
82
+ origin=CoordOrigin.TOPLEFT,
83
+ ),
84
+ )
85
+ for ix, line in enumerate(result)
86
+ ]
87
+ all_ocr_cells.extend(cells)
88
+
89
+ ## Remove OCR cells which overlap with programmatic cells.
90
+ filtered_ocr_cells = self.filter_ocr_cells(
91
+ all_ocr_cells, page.cells
92
+ )
84
93
 
85
- page.cells.extend(filtered_ocr_cells)
94
+ page.cells.extend(filtered_ocr_cells)
86
95
 
87
96
  # DEBUG code:
88
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
97
+ if settings.debug.visualize_ocr:
98
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
89
99
 
90
100
  yield page
docling/models/layout_model.py +87 -61
@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
16
16
  LayoutPrediction,
17
17
  Page,
18
18
  )
19
+ from docling.datamodel.document import ConversionResult
20
+ from docling.datamodel.settings import settings
19
21
  from docling.models.base_model import BasePageModel
20
22
  from docling.utils import layout_utils as lu
23
+ from docling.utils.profiling import TimeRecorder
21
24
 
22
25
  _log = logging.getLogger(__name__)
23
26
 
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
271
274
 
272
275
  return clusters_out_new, cells_out_new
273
276
 
274
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
277
+ def __call__(
278
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
279
+ ) -> Iterable[Page]:
280
+
275
281
  for page in page_batch:
276
282
  assert page._backend is not None
277
283
  if not page._backend.is_valid():
278
284
  yield page
279
285
  else:
280
- assert page.size is not None
281
-
282
- clusters = []
283
- for ix, pred_item in enumerate(
284
- self.layout_predictor.predict(page.get_image(scale=1.0))
285
- ):
286
- label = DocItemLabel(
287
- pred_item["label"].lower().replace(" ", "_").replace("-", "_")
288
- ) # Temporary, until docling-ibm-model uses docling-core types
289
- cluster = Cluster(
290
- id=ix,
291
- label=label,
292
- confidence=pred_item["confidence"],
293
- bbox=BoundingBox.model_validate(pred_item),
294
- cells=[],
295
- )
296
- clusters.append(cluster)
297
-
298
- # Map cells to clusters
299
- # TODO: Remove, postprocess should take care of it anyway.
300
- for cell in page.cells:
301
- for cluster in clusters:
302
- if not cell.bbox.area() > 0:
303
- overlap_frac = 0.0
304
- else:
305
- overlap_frac = (
306
- cell.bbox.intersection_area_with(cluster.bbox)
307
- / cell.bbox.area()
308
- )
309
-
310
- if overlap_frac > 0.5:
311
- cluster.cells.append(cell)
312
-
313
- # Pre-sort clusters
314
- # clusters = self.sort_clusters_by_cell_order(clusters)
315
-
316
- # DEBUG code:
317
- def draw_clusters_and_cells():
318
- image = copy.deepcopy(page.image)
319
- draw = ImageDraw.Draw(image)
320
- for c in clusters:
321
- x0, y0, x1, y1 = c.bbox.as_tuple()
322
- draw.rectangle([(x0, y0), (x1, y1)], outline="green")
323
-
324
- cell_color = (
325
- random.randint(30, 140),
326
- random.randint(30, 140),
327
- random.randint(30, 140),
286
+ with TimeRecorder(conv_res, "layout"):
287
+ assert page.size is not None
288
+
289
+ clusters = []
290
+ for ix, pred_item in enumerate(
291
+ self.layout_predictor.predict(page.get_image(scale=1.0))
292
+ ):
293
+ label = DocItemLabel(
294
+ pred_item["label"]
295
+ .lower()
296
+ .replace(" ", "_")
297
+ .replace("-", "_")
298
+ ) # Temporary, until docling-ibm-model uses docling-core types
299
+ cluster = Cluster(
300
+ id=ix,
301
+ label=label,
302
+ confidence=pred_item["confidence"],
303
+ bbox=BoundingBox.model_validate(pred_item),
304
+ cells=[],
328
305
  )
329
- for tc in c.cells: # [:1]:
330
- x0, y0, x1, y1 = tc.bbox.as_tuple()
331
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
332
- image.show()
333
-
334
- # draw_clusters_and_cells()
335
-
336
- clusters, page.cells = self.postprocess(
337
- clusters, page.cells, page.size.height
338
- )
306
+ clusters.append(cluster)
307
+
308
+ # Map cells to clusters
309
+ # TODO: Remove, postprocess should take care of it anyway.
310
+ for cell in page.cells:
311
+ for cluster in clusters:
312
+ if not cell.bbox.area() > 0:
313
+ overlap_frac = 0.0
314
+ else:
315
+ overlap_frac = (
316
+ cell.bbox.intersection_area_with(cluster.bbox)
317
+ / cell.bbox.area()
318
+ )
319
+
320
+ if overlap_frac > 0.5:
321
+ cluster.cells.append(cell)
322
+
323
+ # Pre-sort clusters
324
+ # clusters = self.sort_clusters_by_cell_order(clusters)
325
+
326
+ # DEBUG code:
327
+ def draw_clusters_and_cells(show: bool = False):
328
+ image = copy.deepcopy(page.image)
329
+ if image is not None:
330
+ draw = ImageDraw.Draw(image)
331
+ for c in clusters:
332
+ x0, y0, x1, y1 = c.bbox.as_tuple()
333
+ draw.rectangle([(x0, y0), (x1, y1)], outline="green")
334
+
335
+ cell_color = (
336
+ random.randint(30, 140),
337
+ random.randint(30, 140),
338
+ random.randint(30, 140),
339
+ )
340
+ for tc in c.cells: # [:1]:
341
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
342
+ draw.rectangle(
343
+ [(x0, y0), (x1, y1)], outline=cell_color
344
+ )
345
+ if show:
346
+ image.show()
347
+ else:
348
+ out_path: Path = (
349
+ Path(settings.debug.debug_output_path)
350
+ / f"debug_{conv_res.input.file.stem}"
351
+ )
352
+ out_path.mkdir(parents=True, exist_ok=True)
353
+
354
+ out_file = (
355
+ out_path / f"layout_page_{page.page_no:05}.png"
356
+ )
357
+ image.save(str(out_file), format="png")
358
+
359
+ # draw_clusters_and_cells()
360
+
361
+ clusters, page.cells = self.postprocess(
362
+ clusters, page.cells, page.size.height
363
+ )
339
364
 
340
- # draw_clusters_and_cells()
365
+ page.predictions.layout = LayoutPrediction(clusters=clusters)
341
366
 
342
- page.predictions.layout = LayoutPrediction(clusters=clusters)
367
+ if settings.debug.visualize_layout:
368
+ draw_clusters_and_cells()
343
369
 
344
370
  yield page
docling/models/page_assemble_model.py +102 -100
@@ -12,8 +12,10 @@ from docling.datamodel.base_models import (
12
12
  Table,
13
13
  TextElement,
14
14
  )
15
+ from docling.datamodel.document import ConversionResult
15
16
  from docling.models.base_model import BasePageModel
16
17
  from docling.models.layout_model import LayoutModel
18
+ from docling.utils.profiling import TimeRecorder
17
19
 
18
20
  _log = logging.getLogger(__name__)
19
21
 
@@ -51,122 +53,122 @@ class PageAssembleModel(BasePageModel):
51
53
 
52
54
  return sanitized_text.strip() # Strip any leading or trailing whitespace
53
55
 
54
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
56
+ def __call__(
57
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
58
+ ) -> Iterable[Page]:
55
59
  for page in page_batch:
56
60
  assert page._backend is not None
57
61
  if not page._backend.is_valid():
58
62
  yield page
59
63
  else:
60
- assert page.predictions.layout is not None
61
-
62
- # assembles some JSON output page by page.
63
-
64
- elements: List[PageElement] = []
65
- headers: List[PageElement] = []
66
- body: List[PageElement] = []
67
-
68
- for cluster in page.predictions.layout.clusters:
69
- # _log.info("Cluster label seen:", cluster.label)
70
- if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
71
-
72
- textlines = [
73
- cell.text.replace("\x02", "-").strip()
74
- for cell in cluster.cells
75
- if len(cell.text.strip()) > 0
76
- ]
77
- text = self.sanitize_text(textlines)
78
- text_el = TextElement(
79
- label=cluster.label,
80
- id=cluster.id,
81
- text=text,
82
- page_no=page.page_no,
83
- cluster=cluster,
84
- )
85
- elements.append(text_el)
86
-
87
- if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
88
- headers.append(text_el)
89
- else:
90
- body.append(text_el)
91
- elif cluster.label == LayoutModel.TABLE_LABEL:
92
- tbl = None
93
- if page.predictions.tablestructure:
94
- tbl = page.predictions.tablestructure.table_map.get(
95
- cluster.id, None
96
- )
97
- if (
98
- not tbl
99
- ): # fallback: add table without structure, if it isn't present
100
- tbl = Table(
64
+ with TimeRecorder(conv_res, "page_assemble"):
65
+
66
+ assert page.predictions.layout is not None
67
+
68
+ # assembles some JSON output page by page.
69
+
70
+ elements: List[PageElement] = []
71
+ headers: List[PageElement] = []
72
+ body: List[PageElement] = []
73
+
74
+ for cluster in page.predictions.layout.clusters:
75
+ # _log.info("Cluster label seen:", cluster.label)
76
+ if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
77
+
78
+ textlines = [
79
+ cell.text.replace("\x02", "-").strip()
80
+ for cell in cluster.cells
81
+ if len(cell.text.strip()) > 0
82
+ ]
83
+ text = self.sanitize_text(textlines)
84
+ text_el = TextElement(
101
85
  label=cluster.label,
102
86
  id=cluster.id,
103
- text="",
104
- otsl_seq=[],
105
- table_cells=[],
106
- cluster=cluster,
87
+ text=text,
107
88
  page_no=page.page_no,
89
+ cluster=cluster,
108
90
  )
91
+ elements.append(text_el)
92
+
93
+ if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
94
+ headers.append(text_el)
95
+ else:
96
+ body.append(text_el)
97
+ elif cluster.label == LayoutModel.TABLE_LABEL:
98
+ tbl = None
99
+ if page.predictions.tablestructure:
100
+ tbl = page.predictions.tablestructure.table_map.get(
101
+ cluster.id, None
102
+ )
103
+ if (
104
+ not tbl
105
+ ): # fallback: add table without structure, if it isn't present
106
+ tbl = Table(
107
+ label=cluster.label,
108
+ id=cluster.id,
109
+ text="",
110
+ otsl_seq=[],
111
+ table_cells=[],
112
+ cluster=cluster,
113
+ page_no=page.page_no,
114
+ )
109
115
 
110
- elements.append(tbl)
111
- body.append(tbl)
112
- elif cluster.label == LayoutModel.FIGURE_LABEL:
113
- fig = None
114
- if page.predictions.figures_classification:
115
- fig = (
116
- page.predictions.figures_classification.figure_map.get(
116
+ elements.append(tbl)
117
+ body.append(tbl)
118
+ elif cluster.label == LayoutModel.FIGURE_LABEL:
119
+ fig = None
120
+ if page.predictions.figures_classification:
121
+ fig = page.predictions.figures_classification.figure_map.get(
117
122
  cluster.id, None
118
123
  )
119
- )
120
- if (
121
- not fig
122
- ): # fallback: add figure without classification, if it isn't present
123
- fig = FigureElement(
124
- label=cluster.label,
125
- id=cluster.id,
126
- text="",
127
- data=None,
128
- cluster=cluster,
129
- page_no=page.page_no,
130
- )
131
- elements.append(fig)
132
- body.append(fig)
133
- elif cluster.label == LayoutModel.FORMULA_LABEL:
134
- equation = None
135
- if page.predictions.equations_prediction:
136
- equation = (
137
- page.predictions.equations_prediction.equation_map.get(
124
+ if (
125
+ not fig
126
+ ): # fallback: add figure without classification, if it isn't present
127
+ fig = FigureElement(
128
+ label=cluster.label,
129
+ id=cluster.id,
130
+ text="",
131
+ data=None,
132
+ cluster=cluster,
133
+ page_no=page.page_no,
134
+ )
135
+ elements.append(fig)
136
+ body.append(fig)
137
+ elif cluster.label == LayoutModel.FORMULA_LABEL:
138
+ equation = None
139
+ if page.predictions.equations_prediction:
140
+ equation = page.predictions.equations_prediction.equation_map.get(
138
141
  cluster.id, None
139
142
  )
140
- )
141
- if (
142
- not equation
143
- ): # fallback: add empty formula, if it isn't present
144
- text = self.sanitize_text(
145
- [
146
- cell.text.replace("\x02", "-").strip()
147
- for cell in cluster.cells
148
- if len(cell.text.strip()) > 0
149
- ]
150
- )
151
- equation = TextElement(
152
- label=cluster.label,
153
- id=cluster.id,
154
- cluster=cluster,
155
- page_no=page.page_no,
156
- text=text,
157
- )
158
- elements.append(equation)
159
- body.append(equation)
143
+ if (
144
+ not equation
145
+ ): # fallback: add empty formula, if it isn't present
146
+ text = self.sanitize_text(
147
+ [
148
+ cell.text.replace("\x02", "-").strip()
149
+ for cell in cluster.cells
150
+ if len(cell.text.strip()) > 0
151
+ ]
152
+ )
153
+ equation = TextElement(
154
+ label=cluster.label,
155
+ id=cluster.id,
156
+ cluster=cluster,
157
+ page_no=page.page_no,
158
+ text=text,
159
+ )
160
+ elements.append(equation)
161
+ body.append(equation)
160
162
 
161
- page.assembled = AssembledUnit(
162
- elements=elements, headers=headers, body=body
163
- )
163
+ page.assembled = AssembledUnit(
164
+ elements=elements, headers=headers, body=body
165
+ )
164
166
 
165
- # Remove page images (can be disabled)
166
- if not self.options.keep_images:
167
- page._image_cache = {}
167
+ # Remove page images (can be disabled)
168
+ if not self.options.keep_images:
169
+ page._image_cache = {}
168
170
 
169
- # Unload backend
170
- page._backend.unload()
171
+ # Unload backend
172
+ page._backend.unload()
171
173
 
172
174
  yield page
docling/models/page_preprocessing_model.py +25 -7
@@ -1,10 +1,14 @@
1
+ from pathlib import Path
1
2
  from typing import Iterable, Optional
2
3
 
3
4
  from PIL import ImageDraw
4
5
  from pydantic import BaseModel
5
6
 
6
7
  from docling.datamodel.base_models import Page
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.settings import settings
7
10
  from docling.models.base_model import BasePageModel
11
+ from docling.utils.profiling import TimeRecorder
8
12
 
9
13
 
10
14
  class PagePreprocessingOptions(BaseModel):
@@ -15,14 +19,17 @@ class PagePreprocessingModel(BasePageModel):
15
19
  def __init__(self, options: PagePreprocessingOptions):
16
20
  self.options = options
17
21
 
18
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
22
+ def __call__(
23
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
24
+ ) -> Iterable[Page]:
19
25
  for page in page_batch:
20
26
  assert page._backend is not None
21
27
  if not page._backend.is_valid():
22
28
  yield page
23
29
  else:
24
- page = self._populate_page_images(page)
25
- page = self._parse_page_cells(page)
30
+ with TimeRecorder(conv_res, "page_parse"):
31
+ page = self._populate_page_images(page)
32
+ page = self._parse_page_cells(conv_res, page)
26
33
  yield page
27
34
 
28
35
  # Generate the page image and store it in the page object
@@ -43,19 +50,30 @@ class PagePreprocessingModel(BasePageModel):
43
50
  return page
44
51
 
45
52
  # Extract and populate the page cells and store it in the page object
46
- def _parse_page_cells(self, page: Page) -> Page:
53
+ def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
47
54
  assert page._backend is not None
48
55
 
49
56
  page.cells = list(page._backend.get_text_cells())
50
57
 
51
58
  # DEBUG code:
52
- def draw_text_boxes(image, cells):
59
+ def draw_text_boxes(image, cells, show: bool = False):
53
60
  draw = ImageDraw.Draw(image)
54
61
  for c in cells:
55
62
  x0, y0, x1, y1 = c.bbox.as_tuple()
56
63
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
57
- image.show()
64
+ if show:
65
+ image.show()
66
+ else:
67
+ out_path: Path = (
68
+ Path(settings.debug.debug_output_path)
69
+ / f"debug_{conv_res.input.file.stem}"
70
+ )
71
+ out_path.mkdir(parents=True, exist_ok=True)
72
+
73
+ out_file = out_path / f"cells_page_{page.page_no:05}.png"
74
+ image.save(str(out_file), format="png")
58
75
 
59
- # draw_text_boxes(page.get_image(scale=1.0), cells)
76
+ if settings.debug.visualize_cells:
77
+ draw_text_boxes(page.get_image(scale=1.0), page.cells)
60
78
 
61
79
  return page