docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,79 @@
1
+ from pathlib import Path
2
+ from typing import Iterable, Optional
3
+
4
+ from PIL import ImageDraw
5
+ from pydantic import BaseModel
6
+
7
+ from docling.datamodel.base_models import Page
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.settings import settings
10
+ from docling.models.base_model import BasePageModel
11
+ from docling.utils.profiling import TimeRecorder
12
+
13
+
14
+ class PagePreprocessingOptions(BaseModel):
15
+ images_scale: Optional[float]
16
+
17
+
18
+ class PagePreprocessingModel(BasePageModel):
19
+ def __init__(self, options: PagePreprocessingOptions):
20
+ self.options = options
21
+
22
+ def __call__(
23
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
24
+ ) -> Iterable[Page]:
25
+ for page in page_batch:
26
+ assert page._backend is not None
27
+ if not page._backend.is_valid():
28
+ yield page
29
+ else:
30
+ with TimeRecorder(conv_res, "page_parse"):
31
+ page = self._populate_page_images(page)
32
+ page = self._parse_page_cells(conv_res, page)
33
+ yield page
34
+
35
+ # Generate the page image and store it in the page object
36
+ def _populate_page_images(self, page: Page) -> Page:
37
+ # default scale
38
+ page.get_image(
39
+ scale=1.0
40
+ ) # puts the page image on the image cache at default scale
41
+
42
+ images_scale = self.options.images_scale
43
+ # user requested scales
44
+ if images_scale is not None:
45
+ page._default_image_scale = images_scale
46
+ page.get_image(
47
+ scale=images_scale
48
+ ) # this will trigger storing the image in the internal cache
49
+
50
+ return page
51
+
52
+ # Extract and populate the page cells and store it in the page object
53
+ def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
54
+ assert page._backend is not None
55
+
56
+ page.cells = list(page._backend.get_text_cells())
57
+
58
+ # DEBUG code:
59
+ def draw_text_boxes(image, cells, show: bool = False):
60
+ draw = ImageDraw.Draw(image)
61
+ for c in cells:
62
+ x0, y0, x1, y1 = c.bbox.as_tuple()
63
+ draw.rectangle([(x0, y0), (x1, y1)], outline="red")
64
+ if show:
65
+ image.show()
66
+ else:
67
+ out_path: Path = (
68
+ Path(settings.debug.debug_output_path)
69
+ / f"debug_{conv_res.input.file.stem}"
70
+ )
71
+ out_path.mkdir(parents=True, exist_ok=True)
72
+
73
+ out_file = out_path / f"cells_page_{page.page_no:05}.png"
74
+ image.save(str(out_file), format="png")
75
+
76
+ if settings.debug.visualize_cells:
77
+ draw_text_boxes(page.get_image(scale=1.0), page.cells)
78
+
79
+ return page
@@ -1,31 +1,30 @@
1
1
  import copy
2
2
  from pathlib import Path
3
- from typing import Iterable, List
3
+ from typing import Iterable
4
4
 
5
5
  import numpy
6
+ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
6
7
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
7
8
  from PIL import ImageDraw
8
9
 
9
- from docling.datamodel.base_models import (
10
- BoundingBox,
11
- Page,
12
- TableCell,
13
- TableElement,
14
- TableStructurePrediction,
15
- )
16
- from docling.datamodel.pipeline_options import TableFormerMode
10
+ from docling.datamodel.base_models import Page, Table, TableStructurePrediction
11
+ from docling.datamodel.document import ConversionResult
12
+ from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
13
+ from docling.datamodel.settings import settings
14
+ from docling.models.base_model import BasePageModel
15
+ from docling.utils.profiling import TimeRecorder
17
16
 
18
17
 
19
- class TableStructureModel:
20
- def __init__(self, config):
21
- self.config = config
22
- self.do_cell_matching = config["do_cell_matching"]
23
- self.mode = config["mode"]
18
+ class TableStructureModel(BasePageModel):
19
+ def __init__(
20
+ self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
21
+ ):
22
+ self.options = options
23
+ self.do_cell_matching = self.options.do_cell_matching
24
+ self.mode = self.options.mode
24
25
 
25
- self.enabled = config["enabled"]
26
+ self.enabled = enabled
26
27
  if self.enabled:
27
- artifacts_path: Path = config["artifacts_path"]
28
-
29
28
  if self.mode == TableFormerMode.ACCURATE:
30
29
  artifacts_path = artifacts_path / "fat"
31
30
 
@@ -39,7 +38,15 @@ class TableStructureModel:
39
38
  self.tf_predictor = TFPredictor(self.tm_config)
40
39
  self.scale = 2.0 # Scale up table input images to 144 dpi
41
40
 
42
- def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
41
+ def draw_table_and_cells(
42
+ self,
43
+ conv_res: ConversionResult,
44
+ page: Page,
45
+ tbl_list: Iterable[Table],
46
+ show: bool = False,
47
+ ):
48
+ assert page._backend is not None
49
+
43
50
  image = (
44
51
  page._backend.get_page_image()
45
52
  ) # make new image to avoid drawing on the saved ones
@@ -50,111 +57,150 @@ class TableStructureModel:
50
57
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
51
58
 
52
59
  for tc in table_element.table_cells:
53
- x0, y0, x1, y1 = tc.bbox.as_tuple()
54
- if tc.column_header:
55
- width = 3
56
- else:
57
- width = 1
58
- draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
59
- draw.text(
60
- (x0 + 3, y0 + 3),
61
- text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
62
- fill="black",
63
- )
64
-
65
- image.show()
66
-
67
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
60
+ if tc.bbox is not None:
61
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
62
+ if tc.column_header:
63
+ width = 3
64
+ else:
65
+ width = 1
66
+ draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
67
+ draw.text(
68
+ (x0 + 3, y0 + 3),
69
+ text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
70
+ fill="black",
71
+ )
72
+
73
+ if show:
74
+ image.show()
75
+ else:
76
+ out_path: Path = (
77
+ Path(settings.debug.debug_output_path)
78
+ / f"debug_{conv_res.input.file.stem}"
79
+ )
80
+ out_path.mkdir(parents=True, exist_ok=True)
81
+
82
+ out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
83
+ image.save(str(out_file), format="png")
84
+
85
+ def __call__(
86
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
87
+ ) -> Iterable[Page]:
68
88
 
69
89
  if not self.enabled:
70
90
  yield from page_batch
71
91
  return
72
92
 
73
93
  for page in page_batch:
74
-
75
- page.predictions.tablestructure = TableStructurePrediction() # dummy
76
-
77
- in_tables = [
78
- (
79
- cluster,
80
- [
81
- round(cluster.bbox.l) * self.scale,
82
- round(cluster.bbox.t) * self.scale,
83
- round(cluster.bbox.r) * self.scale,
84
- round(cluster.bbox.b) * self.scale,
85
- ],
86
- )
87
- for cluster in page.predictions.layout.clusters
88
- if cluster.label == "Table"
89
- ]
90
- if not len(in_tables):
94
+ assert page._backend is not None
95
+ if not page._backend.is_valid():
91
96
  yield page
92
- continue
93
-
94
- tokens = []
95
- for c in page.cells:
96
- for cluster, _ in in_tables:
97
- if c.bbox.area() > 0:
98
- if (
99
- c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
100
- > 0.2
101
- ):
102
- # Only allow non empty stings (spaces) into the cells of a table
103
- if len(c.text.strip()) > 0:
104
- new_cell = copy.deepcopy(c)
105
- new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
106
-
107
- tokens.append(new_cell.model_dump())
108
-
109
- page_input = {
110
- "tokens": tokens,
111
- "width": page.size.width * self.scale,
112
- "height": page.size.height * self.scale,
113
- }
114
- page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
115
-
116
- table_clusters, table_bboxes = zip(*in_tables)
117
-
118
- if len(table_bboxes):
119
- tf_output = self.tf_predictor.multi_table_predict(
120
- page_input, table_bboxes, do_matching=self.do_cell_matching
121
- )
122
-
123
- for table_cluster, table_out in zip(table_clusters, tf_output):
124
- table_cells = []
125
- for element in table_out["tf_responses"]:
126
-
127
- if not self.do_cell_matching:
128
- the_bbox = BoundingBox.model_validate(
129
- element["bbox"]
130
- ).scaled(1 / self.scale)
131
- text_piece = page._backend.get_text_in_rect(the_bbox)
132
- element["bbox"]["token"] = text_piece
133
-
134
- tc = TableCell.model_validate(element)
135
- if self.do_cell_matching:
136
- tc.bbox = tc.bbox.scaled(1 / self.scale)
137
- table_cells.append(tc)
138
-
139
- # Retrieving cols/rows, after post processing:
140
- num_rows = table_out["predict_details"]["num_rows"]
141
- num_cols = table_out["predict_details"]["num_cols"]
142
- otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
143
-
144
- tbl = TableElement(
145
- otsl_seq=otsl_seq,
146
- table_cells=table_cells,
147
- num_rows=num_rows,
148
- num_cols=num_cols,
149
- id=table_cluster.id,
150
- page_no=page.page_no,
151
- cluster=table_cluster,
152
- label="Table",
97
+ else:
98
+ with TimeRecorder(conv_res, "table_structure"):
99
+
100
+ assert page.predictions.layout is not None
101
+ assert page.size is not None
102
+
103
+ page.predictions.tablestructure = (
104
+ TableStructurePrediction()
105
+ ) # dummy
106
+
107
+ in_tables = [
108
+ (
109
+ cluster,
110
+ [
111
+ round(cluster.bbox.l) * self.scale,
112
+ round(cluster.bbox.t) * self.scale,
113
+ round(cluster.bbox.r) * self.scale,
114
+ round(cluster.bbox.b) * self.scale,
115
+ ],
116
+ )
117
+ for cluster in page.predictions.layout.clusters
118
+ if cluster.label == DocItemLabel.TABLE
119
+ ]
120
+ if not len(in_tables):
121
+ yield page
122
+ continue
123
+
124
+ tokens = []
125
+ for c in page.cells:
126
+ for cluster, _ in in_tables:
127
+ if c.bbox.area() > 0:
128
+ if (
129
+ c.bbox.intersection_area_with(cluster.bbox)
130
+ / c.bbox.area()
131
+ > 0.2
132
+ ):
133
+ # Only allow non empty stings (spaces) into the cells of a table
134
+ if len(c.text.strip()) > 0:
135
+ new_cell = copy.deepcopy(c)
136
+ new_cell.bbox = new_cell.bbox.scaled(
137
+ scale=self.scale
138
+ )
139
+
140
+ tokens.append(new_cell.model_dump())
141
+
142
+ page_input = {
143
+ "tokens": tokens,
144
+ "width": page.size.width * self.scale,
145
+ "height": page.size.height * self.scale,
146
+ }
147
+ page_input["image"] = numpy.asarray(
148
+ page.get_image(scale=self.scale)
153
149
  )
154
150
 
155
- page.predictions.tablestructure.table_map[table_cluster.id] = tbl
151
+ table_clusters, table_bboxes = zip(*in_tables)
152
+
153
+ if len(table_bboxes):
154
+ tf_output = self.tf_predictor.multi_table_predict(
155
+ page_input, table_bboxes, do_matching=self.do_cell_matching
156
+ )
157
+
158
+ for table_cluster, table_out in zip(table_clusters, tf_output):
159
+ table_cells = []
160
+ for element in table_out["tf_responses"]:
161
+
162
+ if not self.do_cell_matching:
163
+ the_bbox = BoundingBox.model_validate(
164
+ element["bbox"]
165
+ ).scaled(1 / self.scale)
166
+ text_piece = page._backend.get_text_in_rect(
167
+ the_bbox
168
+ )
169
+ element["bbox"]["token"] = text_piece
170
+
171
+ tc = TableCell.model_validate(element)
172
+ if self.do_cell_matching and tc.bbox is not None:
173
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
174
+ table_cells.append(tc)
175
+
176
+ # Retrieving cols/rows, after post processing:
177
+ num_rows = table_out["predict_details"]["num_rows"]
178
+ num_cols = table_out["predict_details"]["num_cols"]
179
+ otsl_seq = table_out["predict_details"]["prediction"][
180
+ "rs_seq"
181
+ ]
182
+
183
+ tbl = Table(
184
+ otsl_seq=otsl_seq,
185
+ table_cells=table_cells,
186
+ num_rows=num_rows,
187
+ num_cols=num_cols,
188
+ id=table_cluster.id,
189
+ page_no=page.page_no,
190
+ cluster=table_cluster,
191
+ label=DocItemLabel.TABLE,
192
+ )
193
+
194
+ page.predictions.tablestructure.table_map[
195
+ table_cluster.id
196
+ ] = tbl
197
+
198
+ # For debugging purposes:
199
+ if settings.debug.visualize_tables:
200
+ self.draw_table_and_cells(
201
+ conv_res,
202
+ page,
203
+ page.predictions.tablestructure.table_map.values(),
204
+ )
156
205
 
157
- # For debugging purposes:
158
- # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
159
-
160
- yield page
206
+ yield page
@@ -2,13 +2,17 @@ import io
2
2
  import logging
3
3
  import tempfile
4
4
  from subprocess import DEVNULL, PIPE, Popen
5
- from typing import Iterable, Tuple
5
+ from typing import Iterable, Optional, Tuple
6
6
 
7
7
  import pandas as pd
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
 
9
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
10
+ from docling.datamodel.base_models import OcrCell, Page
11
+ from docling.datamodel.document import ConversionResult
10
12
  from docling.datamodel.pipeline_options import TesseractCliOcrOptions
13
+ from docling.datamodel.settings import settings
11
14
  from docling.models.base_ocr_model import BaseOcrModel
15
+ from docling.utils.profiling import TimeRecorder
12
16
 
13
17
  _log = logging.getLogger(__name__)
14
18
 
@@ -21,8 +25,8 @@ class TesseractOcrCliModel(BaseOcrModel):
21
25
 
22
26
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
27
 
24
- self._name = None
25
- self._version = None
28
+ self._name: Optional[str] = None
29
+ self._version: Optional[str] = None
26
30
 
27
31
  if self.enabled:
28
32
  try:
@@ -39,7 +43,7 @@ class TesseractOcrCliModel(BaseOcrModel):
39
43
  def _get_name_and_version(self) -> Tuple[str, str]:
40
44
 
41
45
  if self._name != None and self._version != None:
42
- return self._name, self._version
46
+ return self._name, self._version # type: ignore
43
47
 
44
48
  cmd = [self.options.tesseract_cmd, "--version"]
45
49
 
@@ -101,67 +105,80 @@ class TesseractOcrCliModel(BaseOcrModel):
101
105
 
102
106
  return df_filtered
103
107
 
104
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
108
+ def __call__(
109
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
110
+ ) -> Iterable[Page]:
105
111
 
106
112
  if not self.enabled:
107
113
  yield from page_batch
108
114
  return
109
115
 
110
116
  for page in page_batch:
111
- ocr_rects = self.get_ocr_rects(page)
112
-
113
- all_ocr_cells = []
114
- for ocr_rect in ocr_rects:
115
- # Skip zero area boxes
116
- if ocr_rect.area() == 0:
117
- continue
118
- high_res_image = page._backend.get_page_image(
119
- scale=self.scale, cropbox=ocr_rect
120
- )
121
-
122
- with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
123
- fname = image_file.name
124
- high_res_image.save(fname)
125
-
126
- df = self._run_tesseract(fname)
127
-
128
- # _log.info(df)
129
-
130
- # Print relevant columns (bounding box and text)
131
- for ix, row in df.iterrows():
132
- text = row["text"]
133
- conf = row["conf"]
134
-
135
- l = float(row["left"])
136
- b = float(row["top"])
137
- w = float(row["width"])
138
- h = float(row["height"])
139
-
140
- t = b + h
141
- r = l + w
142
-
143
- cell = OcrCell(
144
- id=ix,
145
- text=text,
146
- confidence=conf / 100.0,
147
- bbox=BoundingBox.from_tuple(
148
- coord=(
149
- (l / self.scale) + ocr_rect.l,
150
- (b / self.scale) + ocr_rect.t,
151
- (r / self.scale) + ocr_rect.l,
152
- (t / self.scale) + ocr_rect.t,
153
- ),
154
- origin=CoordOrigin.TOPLEFT,
155
- ),
117
+ assert page._backend is not None
118
+ if not page._backend.is_valid():
119
+ yield page
120
+ else:
121
+ with TimeRecorder(conv_res, "ocr"):
122
+
123
+ ocr_rects = self.get_ocr_rects(page)
124
+
125
+ all_ocr_cells = []
126
+ for ocr_rect in ocr_rects:
127
+ # Skip zero area boxes
128
+ if ocr_rect.area() == 0:
129
+ continue
130
+ high_res_image = page._backend.get_page_image(
131
+ scale=self.scale, cropbox=ocr_rect
132
+ )
133
+
134
+ with tempfile.NamedTemporaryFile(
135
+ suffix=".png", mode="w"
136
+ ) as image_file:
137
+ fname = image_file.name
138
+ high_res_image.save(fname)
139
+
140
+ df = self._run_tesseract(fname)
141
+
142
+ # _log.info(df)
143
+
144
+ # Print relevant columns (bounding box and text)
145
+ for ix, row in df.iterrows():
146
+ text = row["text"]
147
+ conf = row["conf"]
148
+
149
+ l = float(row["left"])
150
+ b = float(row["top"])
151
+ w = float(row["width"])
152
+ h = float(row["height"])
153
+
154
+ t = b + h
155
+ r = l + w
156
+
157
+ cell = OcrCell(
158
+ id=ix,
159
+ text=text,
160
+ confidence=conf / 100.0,
161
+ bbox=BoundingBox.from_tuple(
162
+ coord=(
163
+ (l / self.scale) + ocr_rect.l,
164
+ (b / self.scale) + ocr_rect.t,
165
+ (r / self.scale) + ocr_rect.l,
166
+ (t / self.scale) + ocr_rect.t,
167
+ ),
168
+ origin=CoordOrigin.TOPLEFT,
169
+ ),
170
+ )
171
+ all_ocr_cells.append(cell)
172
+
173
+ ## Remove OCR cells which overlap with programmatic cells.
174
+ filtered_ocr_cells = self.filter_ocr_cells(
175
+ all_ocr_cells, page.cells
156
176
  )
157
- all_ocr_cells.append(cell)
158
-
159
- ## Remove OCR cells which overlap with programmatic cells.
160
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
161
177
 
162
- page.cells.extend(filtered_ocr_cells)
178
+ page.cells.extend(filtered_ocr_cells)
163
179
 
164
- # DEBUG code:
165
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
180
+ # DEBUG code:
181
+ if settings.debug.visualize_ocr:
182
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
166
183
 
167
- yield page
184
+ yield page