docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +379 -324
  12. docling/datamodel/pipeline_options.py +16 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +19 -6
  17. docling/models/ds_glm_model.py +220 -22
  18. docling/models/easyocr_model.py +45 -40
  19. docling/models/layout_model.py +130 -114
  20. docling/models/page_assemble_model.py +119 -95
  21. docling/models/page_preprocessing_model.py +61 -0
  22. docling/models/table_structure_model.py +122 -111
  23. docling/models/tesseract_ocr_cli_model.py +65 -58
  24. docling/models/tesseract_ocr_model.py +58 -50
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.1.0.dist-info/METADATA +149 -0
  31. docling-2.1.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.0.dist-info/METADATA +0 -380
  35. docling-1.19.0.dist-info/RECORD +0 -34
  36. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
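The hunks below cover three of the files listed above (docling/models/table_structure_model.py, docling/models/tesseract_ocr_cli_model.py and docling/models/tesseract_ocr_model.py) and show the two changes that recur throughout this release: shared geometry and label types such as BoundingBox, TableCell, CoordOrigin and DocItemLabel now come from docling_core.types.doc instead of docling.datamodel.base_models, and the model classes take explicit typed constructor arguments instead of a plain config dict. A minimal sketch of how constructing the table model changes, using only names visible in the hunks below (the artifacts path value is a placeholder):

    from pathlib import Path

    from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
    from docling.models.table_structure_model import TableStructureModel

    # docling 1.19.0 took a single untyped dict:
    #   TableStructureModel(config={"enabled": ..., "artifacts_path": ..., "do_cell_matching": ..., "mode": ...})
    # docling 2.1.0 takes explicit arguments instead:
    table_model = TableStructureModel(
        enabled=True,
        artifacts_path=Path("./model_artifacts"),  # placeholder; must point at the TableFormer weights
        options=TableStructureOptions(do_cell_matching=True, mode=TableFormerMode.ACCURATE),
    )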
docling/models/table_structure_model.py

@@ -3,29 +3,25 @@ from pathlib import Path
 from typing import Iterable, List
 
 import numpy
+from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 
-from docling.datamodel.base_models import (
-    BoundingBox,
-    Page,
-    TableCell,
-    TableElement,
-    TableStructurePrediction,
-)
-from docling.datamodel.pipeline_options import TableFormerMode
+from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.models.base_model import BasePageModel
 
 
-class TableStructureModel:
-    def __init__(self, config):
-        self.config = config
-        self.do_cell_matching = config["do_cell_matching"]
-        self.mode = config["mode"]
+class TableStructureModel(BasePageModel):
+    def __init__(
+        self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
+    ):
+        self.options = options
+        self.do_cell_matching = self.options.do_cell_matching
+        self.mode = self.options.mode
 
-        self.enabled = config["enabled"]
+        self.enabled = enabled
         if self.enabled:
-            artifacts_path: Path = config["artifacts_path"]
-
             if self.mode == TableFormerMode.ACCURATE:
                 artifacts_path = artifacts_path / "fat"
 
@@ -39,7 +35,9 @@ class TableStructureModel:
         self.tf_predictor = TFPredictor(self.tm_config)
         self.scale = 2.0  # Scale up table input images to 144 dpi
 
-    def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
+    def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
+        assert page._backend is not None
+
         image = (
             page._backend.get_page_image()
         )  # make new image to avoid drawing on the saved ones
@@ -50,17 +48,18 @@ class TableStructureModel:
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")
 
             for tc in table_element.table_cells:
-                x0, y0, x1, y1 = tc.bbox.as_tuple()
-                if tc.column_header:
-                    width = 3
-                else:
-                    width = 1
-                draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
-                draw.text(
-                    (x0 + 3, y0 + 3),
-                    text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
-                    fill="black",
-                )
+                if tc.bbox is not None:
+                    x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    if tc.column_header:
+                        width = 3
+                    else:
+                        width = 1
+                    draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
+                    draw.text(
+                        (x0 + 3, y0 + 3),
+                        text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
+                        fill="black",
+                    )
 
         image.show()
 
@@ -71,90 +70,102 @@ class TableStructureModel:
             return
 
         for page in page_batch:
-
-            page.predictions.tablestructure = TableStructurePrediction()  # dummy
-
-            in_tables = [
-                (
-                    cluster,
-                    [
-                        round(cluster.bbox.l) * self.scale,
-                        round(cluster.bbox.t) * self.scale,
-                        round(cluster.bbox.r) * self.scale,
-                        round(cluster.bbox.b) * self.scale,
-                    ],
-                )
-                for cluster in page.predictions.layout.clusters
-                if cluster.label == "Table"
-            ]
-            if not len(in_tables):
+            assert page._backend is not None
+            if not page._backend.is_valid():
                 yield page
-                continue
-
-            tokens = []
-            for c in page.cells:
-                for cluster, _ in in_tables:
-                    if c.bbox.area() > 0:
-                        if (
-                            c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
-                            > 0.2
-                        ):
-                            # Only allow non empty stings (spaces) into the cells of a table
-                            if len(c.text.strip()) > 0:
-                                new_cell = copy.deepcopy(c)
-                                new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
-
-                                tokens.append(new_cell.model_dump())
-
-            page_input = {
-                "tokens": tokens,
-                "width": page.size.width * self.scale,
-                "height": page.size.height * self.scale,
-            }
-            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
-
-            table_clusters, table_bboxes = zip(*in_tables)
-
-            if len(table_bboxes):
-                tf_output = self.tf_predictor.multi_table_predict(
-                    page_input, table_bboxes, do_matching=self.do_cell_matching
-                )
-
-                for table_cluster, table_out in zip(table_clusters, tf_output):
-                    table_cells = []
-                    for element in table_out["tf_responses"]:
-
-                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(
-                                element["bbox"]
-                            ).scaled(1 / self.scale)
-                            text_piece = page._backend.get_text_in_rect(the_bbox)
-                            element["bbox"]["token"] = text_piece
-
-                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching:
-                            tc.bbox = tc.bbox.scaled(1 / self.scale)
-                        table_cells.append(tc)
-
-                    # Retrieving cols/rows, after post processing:
-                    num_rows = table_out["predict_details"]["num_rows"]
-                    num_cols = table_out["predict_details"]["num_cols"]
-                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-
-                    tbl = TableElement(
-                        otsl_seq=otsl_seq,
-                        table_cells=table_cells,
-                        num_rows=num_rows,
-                        num_cols=num_cols,
-                        id=table_cluster.id,
-                        page_no=page.page_no,
-                        cluster=table_cluster,
-                        label="Table",
+            else:
+
+                assert page.predictions.layout is not None
+                assert page.size is not None
+
+                page.predictions.tablestructure = TableStructurePrediction()  # dummy
+
+                in_tables = [
+                    (
+                        cluster,
+                        [
+                            round(cluster.bbox.l) * self.scale,
+                            round(cluster.bbox.t) * self.scale,
+                            round(cluster.bbox.r) * self.scale,
+                            round(cluster.bbox.b) * self.scale,
+                        ],
+                    )
+                    for cluster in page.predictions.layout.clusters
+                    if cluster.label == DocItemLabel.TABLE
+                ]
+                if not len(in_tables):
+                    yield page
+                    continue
+
+                tokens = []
+                for c in page.cells:
+                    for cluster, _ in in_tables:
+                        if c.bbox.area() > 0:
+                            if (
+                                c.bbox.intersection_area_with(cluster.bbox)
+                                / c.bbox.area()
+                                > 0.2
+                            ):
+                                # Only allow non empty stings (spaces) into the cells of a table
+                                if len(c.text.strip()) > 0:
+                                    new_cell = copy.deepcopy(c)
+                                    new_cell.bbox = new_cell.bbox.scaled(
+                                        scale=self.scale
+                                    )
+
+                                    tokens.append(new_cell.model_dump())
+
+                page_input = {
+                    "tokens": tokens,
+                    "width": page.size.width * self.scale,
+                    "height": page.size.height * self.scale,
+                }
+                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+
+                table_clusters, table_bboxes = zip(*in_tables)
+
+                if len(table_bboxes):
+                    tf_output = self.tf_predictor.multi_table_predict(
+                        page_input, table_bboxes, do_matching=self.do_cell_matching
                     )
 
-                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl
-
-            # For debugging purposes:
-            # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
+                    for table_cluster, table_out in zip(table_clusters, tf_output):
+                        table_cells = []
+                        for element in table_out["tf_responses"]:
+
+                            if not self.do_cell_matching:
+                                the_bbox = BoundingBox.model_validate(
+                                    element["bbox"]
+                                ).scaled(1 / self.scale)
+                                text_piece = page._backend.get_text_in_rect(the_bbox)
+                                element["bbox"]["token"] = text_piece
+
+                            tc = TableCell.model_validate(element)
+                            if self.do_cell_matching and tc.bbox is not None:
+                                tc.bbox = tc.bbox.scaled(1 / self.scale)
+                            table_cells.append(tc)
+
+                        # Retrieving cols/rows, after post processing:
+                        num_rows = table_out["predict_details"]["num_rows"]
+                        num_cols = table_out["predict_details"]["num_cols"]
+                        otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
+
+                        tbl = Table(
+                            otsl_seq=otsl_seq,
+                            table_cells=table_cells,
+                            num_rows=num_rows,
+                            num_cols=num_cols,
+                            id=table_cluster.id,
+                            page_no=page.page_no,
+                            cluster=table_cluster,
+                            label=DocItemLabel.TABLE,
+                        )
+
+                        page.predictions.tablestructure.table_map[table_cluster.id] = (
+                            tbl
+                        )
+
+                # For debugging purposes:
+                # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
 
-            yield page
+                yield page
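All three models in this diff now open their per-page loop with the same guard: assert that the page has a backend, pass invalid pages through untouched, and only do real work in the else branch. A schematic of the pattern in isolation (run_model and the placeholder comment are illustrative, not docling API):

    from typing import Iterable

    from docling.datamodel.base_models import Page


    def run_model(page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page  # invalid pages are forwarded unchanged
            else:
                # ... model-specific work on the valid page would go here ...
                yield page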
docling/models/tesseract_ocr_cli_model.py

@@ -1,12 +1,13 @@
 import io
 import logging
 import tempfile
-from subprocess import PIPE, Popen
-from typing import Iterable, Tuple
+from subprocess import DEVNULL, PIPE, Popen
+from typing import Iterable, Optional, Tuple
 
 import pandas as pd
+from docling_core.types.doc import BoundingBox, CoordOrigin
 
-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
 
@@ -21,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
 
-        self._name = None
-        self._version = None
+        self._name: Optional[str] = None
+        self._version: Optional[str] = None
 
         if self.enabled:
             try:
@@ -39,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
     def _get_name_and_version(self) -> Tuple[str, str]:
 
         if self._name != None and self._version != None:
-            return self._name, self._version
+            return self._name, self._version  # type: ignore
 
         cmd = [self.options.tesseract_cmd, "--version"]
 
@@ -81,7 +82,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd += [ifilename, "stdout", "tsv"]
         _log.info("command: {}".format(" ".join(cmd)))
 
-        proc = Popen(cmd, stdout=PIPE)
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
 
         # _log.info(output)
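The only change in this hunk is stderr=DEVNULL: whatever the tesseract binary writes to stderr is discarded instead of leaking into docling's own output, while stdout is still captured through the pipe. A standalone illustration of the same subprocess pattern (the child command here is a toy stand-in, not the real tesseract call):

    import sys
    from subprocess import DEVNULL, PIPE, Popen

    # Toy child process that writes to both streams, standing in for the tesseract CLI.
    cmd = [sys.executable, "-c", "import sys; print('kept'); print('dropped', file=sys.stderr)"]

    proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)  # same pattern as the hunk above
    output, _ = proc.communicate()
    print(output.decode())  # prints "kept"; the stderr line is discarded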
@@ -108,60 +109,66 @@ class TesseractOcrCliModel(BaseOcrModel):
             return
 
         for page in page_batch:
-            ocr_rects = self.get_ocr_rects(page)
-
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                ocr_rects = self.get_ocr_rects(page)
+
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
+                    )
 
-                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
-                    fname = image_file.name
-                    high_res_image.save(fname)
-
-                    df = self._run_tesseract(fname)
-
-                    # _log.info(df)
-
-                    # Print relevant columns (bounding box and text)
-                    for ix, row in df.iterrows():
-                        text = row["text"]
-                        conf = row["conf"]
-
-                        l = float(row["left"])
-                        b = float(row["top"])
-                        w = float(row["width"])
-                        h = float(row["height"])
-
-                        t = b + h
-                        r = l + w
-
-                        cell = OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=conf / 100.0,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(
-                                    (l / self.scale) + ocr_rect.l,
-                                    (b / self.scale) + ocr_rect.t,
-                                    (r / self.scale) + ocr_rect.l,
-                                    (t / self.scale) + ocr_rect.t,
+                    with tempfile.NamedTemporaryFile(
+                        suffix=".png", mode="w"
+                    ) as image_file:
+                        fname = image_file.name
+                        high_res_image.save(fname)
+
+                        df = self._run_tesseract(fname)
+
+                        # _log.info(df)
+
+                        # Print relevant columns (bounding box and text)
+                        for ix, row in df.iterrows():
+                            text = row["text"]
+                            conf = row["conf"]
+
+                            l = float(row["left"])
+                            b = float(row["top"])
+                            w = float(row["width"])
+                            h = float(row["height"])
+
+                            t = b + h
+                            r = l + w
+
+                            cell = OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=conf / 100.0,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        (l / self.scale) + ocr_rect.l,
+                                        (b / self.scale) + ocr_rect.t,
+                                        (r / self.scale) + ocr_rect.l,
+                                        (t / self.scale) + ocr_rect.t,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
                                 ),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
-                        )
-                        all_ocr_cells.append(cell)
+                            )
+                            all_ocr_cells.append(cell)
 
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
 
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
 
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
 
-            yield page
+                yield page
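The bounding-box arithmetic in the hunk above is unchanged by this release: tesseract reports pixel boxes on a crop rendered at self.scale (3x, roughly 216 dpi), so each value is divided by the scale and offset by the crop origin to land back in page coordinates. A worked toy example with made-up numbers (none of these values come from the diff):

    scale = 3.0                           # as in TesseractOcrCliModel
    ocr_rect_l, ocr_rect_t = 100.0, 50.0  # hypothetical crop origin in page coordinates

    # hypothetical tesseract TSV row, measured on the 3x crop
    left, top, width, height = 30.0, 12.0, 60.0, 24.0
    right, bottom = left + width, top + height

    page_bbox = (
        left / scale + ocr_rect_l,    # l = 110.0
        top / scale + ocr_rect_t,     # t = 54.0
        right / scale + ocr_rect_l,   # r = 130.0
        bottom / scale + ocr_rect_t,  # b = 62.0
    )
    print(page_bbox)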
docling/models/tesseract_ocr_model.py

@@ -1,19 +1,19 @@
 import logging
 from typing import Iterable
 
-import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
 
-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
-from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
 
 _log = logging.getLogger(__name__)
 
 
 class TesseractOcrModel(BaseOcrModel):
-    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
         super().__init__(enabled=enabled, options=options)
-        self.options: TesseractCliOcrOptions
+        self.options: TesseractOcrOptions
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
@@ -68,55 +68,63 @@ class TesseractOcrModel(BaseOcrModel):
             return
 
         for page in page_batch:
-            ocr_rects = self.get_ocr_rects(page)
-
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert self.reader is not None
 
-                # Retrieve text snippets with their bounding boxes
-                self.reader.SetImage(high_res_image)
-                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
-
-                cells = []
-                for ix, (im, box, _, _) in enumerate(boxes):
-                    # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
-
-                    # Extract text within the bounding box
-                    text = self.reader.GetUTF8Text().strip()
-                    confidence = self.reader.MeanTextConf()
-                    left = box["x"] / self.scale
-                    bottom = box["y"] / self.scale
-                    right = (box["x"] + box["w"]) / self.scale
-                    top = (box["y"] + box["h"]) / self.scale
-
-                    cells.append(
-                        OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=confidence,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(left, top, right, bottom),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
-                        )
+                ocr_rects = self.get_ocr_rects(page)
+
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
                     )
 
-                # del high_res_image
-                all_ocr_cells.extend(cells)
+                    # Retrieve text snippets with their bounding boxes
+                    self.reader.SetImage(high_res_image)
+                    boxes = self.reader.GetComponentImages(
+                        self.reader_RIL.TEXTLINE, True
+                    )
+
+                    cells = []
+                    for ix, (im, box, _, _) in enumerate(boxes):
+                        # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                        self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+
+                        # Extract text within the bounding box
+                        text = self.reader.GetUTF8Text().strip()
+                        confidence = self.reader.MeanTextConf()
+                        left = box["x"] / self.scale
+                        bottom = box["y"] / self.scale
+                        right = (box["x"] + box["w"]) / self.scale
+                        top = (box["y"] + box["h"]) / self.scale
+
+                        cells.append(
+                            OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=confidence,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(left, top, right, bottom),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                        )
+
+                    # del high_res_image
+                    all_ocr_cells.extend(cells)
 
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
 
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
 
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
 
-            yield page
+                yield page