docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -1,27 +1,33 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
- from typing import Iterable, List, Tuple
4
+ from pathlib import Path
5
+ from typing import Iterable, List
5
6
 
6
7
  import numpy as np
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
7
9
  from PIL import Image, ImageDraw
8
10
  from rtree import index
9
11
  from scipy.ndimage import find_objects, label
10
12
 
11
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
13
+ from docling.datamodel.base_models import OcrCell, Page
14
+ from docling.datamodel.document import ConversionResult
12
15
  from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.settings import settings
17
+ from docling.models.base_model import BasePageModel
13
18
 
14
19
  _log = logging.getLogger(__name__)
15
20
 
16
21
 
17
- class BaseOcrModel:
22
+ class BaseOcrModel(BasePageModel):
18
23
  def __init__(self, enabled: bool, options: OcrOptions):
19
24
  self.enabled = enabled
20
25
  self.options = options
21
26
 
22
27
  # Computes the optimum amount and coordinates of rectangles to OCR on a given page
23
- def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
28
+ def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
24
29
  BITMAP_COVERAGE_TRESHOLD = 0.75
30
+ assert page.size is not None
25
31
 
26
32
  def find_ocr_rects(size, bitmap_rects):
27
33
  image = Image.new(
@@ -60,11 +66,14 @@ class BaseOcrModel:
60
66
 
61
67
  return (area_frac, bounding_boxes) # fraction covered # boxes
62
68
 
63
- bitmap_rects = page._backend.get_bitmap_rects()
69
+ if page._backend is not None:
70
+ bitmap_rects = page._backend.get_bitmap_rects()
71
+ else:
72
+ bitmap_rects = []
64
73
  coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
65
74
 
66
75
  # return full-page rectangle if sufficiently covered with bitmaps
67
- if coverage > BITMAP_COVERAGE_TRESHOLD:
76
+ if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
68
77
  return [
69
78
  BoundingBox(
70
79
  l=0,
@@ -75,7 +84,15 @@ class BaseOcrModel:
75
84
  )
76
85
  ]
77
86
  # return individual rectangles if the bitmap coverage is smaller
78
- elif coverage < BITMAP_COVERAGE_TRESHOLD:
87
+ else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
88
+
89
+ # skip OCR if the bitmap area on the page is smaller than the options threshold
90
+ ocr_rects = [
91
+ rect
92
+ for rect in ocr_rects
93
+ if rect.area() / (page.size.width * page.size.height)
94
+ > self.options.bitmap_area_threshold
95
+ ]
79
96
  return ocr_rects
80
97
 
81
98
  # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
@@ -100,7 +117,7 @@ class BaseOcrModel:
100
117
  ]
101
118
  return filtered_ocr_cells
102
119
 
103
- def draw_ocr_rects_and_cells(self, page, ocr_rects):
120
+ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
104
121
  image = copy.deepcopy(page.image)
105
122
  draw = ImageDraw.Draw(image, "RGBA")
106
123
 
@@ -117,8 +134,21 @@ class BaseOcrModel:
117
134
  if isinstance(tc, OcrCell):
118
135
  color = "magenta"
119
136
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
120
- image.show()
137
+
138
+ if show:
139
+ image.show()
140
+ else:
141
+ out_path: Path = (
142
+ Path(settings.debug.debug_output_path)
143
+ / f"debug_{conv_res.input.file.stem}"
144
+ )
145
+ out_path.mkdir(parents=True, exist_ok=True)
146
+
147
+ out_file = out_path / f"ocr_page_{page.page_no:05}.png"
148
+ image.save(str(out_file), format="png")
121
149
 
122
150
  @abstractmethod
123
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
151
+ def __call__(
152
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
153
+ ) -> Iterable[Page]:
124
154
  pass
@@ -1,54 +1,256 @@
1
1
  import copy
2
2
  import random
3
+ from pathlib import Path
4
+ from typing import List, Union
3
5
 
4
6
  from deepsearch_glm.nlp_utils import init_nlp_model
5
- from deepsearch_glm.utils.doc_utils import to_legacy_document_format
7
+ from deepsearch_glm.utils.doc_utils import to_docling_document
6
8
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
7
- from docling_core.types import BaseText
8
- from docling_core.types import Document as DsDocument
9
- from docling_core.types import Ref
9
+ from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
10
+ from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
11
+ from docling_core.types.legacy_doc.base import (
12
+ Figure,
13
+ PageDimensions,
14
+ PageReference,
15
+ Prov,
16
+ Ref,
17
+ )
18
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
19
+ from docling_core.types.legacy_doc.base import TableCell
20
+ from docling_core.types.legacy_doc.document import BaseText
21
+ from docling_core.types.legacy_doc.document import (
22
+ CCSDocumentDescription as DsDocumentDescription,
23
+ )
24
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
25
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
10
26
  from PIL import ImageDraw
27
+ from pydantic import BaseModel, ConfigDict
11
28
 
12
- from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
13
- from docling.datamodel.document import ConversionResult
29
+ from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
30
+ from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
+ from docling.datamodel.settings import settings
32
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
33
+ from docling.utils.utils import create_hash
34
+
35
+
36
+ class GlmOptions(BaseModel):
37
+ model_config = ConfigDict(protected_namespaces=())
38
+
39
+ model_names: str = "" # e.g. "language;term;reference"
14
40
 
15
41
 
16
42
  class GlmModel:
17
- def __init__(self, config):
18
- self.config = config
19
- self.model_names = self.config.get(
20
- "model_names", ""
21
- ) # "language;term;reference"
43
+ def __init__(self, options: GlmOptions):
44
+ self.options = options
45
+
22
46
  load_pretrained_nlp_models()
23
- # model = init_nlp_model(model_names="language;term;reference")
24
- model = init_nlp_model(model_names=self.model_names)
25
- self.model = model
47
+ self.model = init_nlp_model(model_names=self.options.model_names)
48
+
49
+ def _to_legacy_document(self, conv_res) -> DsDocument:
50
+ title = ""
51
+ desc: DsDocumentDescription = DsDocumentDescription(logs=[])
52
+
53
+ page_hashes = [
54
+ PageReference(
55
+ hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
56
+ page=p.page_no + 1,
57
+ model="default",
58
+ )
59
+ for p in conv_res.pages
60
+ ]
61
+
62
+ file_info = DsFileInfoObject(
63
+ filename=conv_res.input.file.name,
64
+ document_hash=conv_res.input.document_hash,
65
+ num_pages=conv_res.input.page_count,
66
+ page_hashes=page_hashes,
67
+ )
68
+
69
+ main_text: List[Union[Ref, BaseText]] = []
70
+ tables: List[DsSchemaTable] = []
71
+ figures: List[Figure] = []
72
+
73
+ page_no_to_page = {p.page_no: p for p in conv_res.pages}
74
+
75
+ for element in conv_res.assembled.elements:
76
+ # Convert bboxes to lower-left origin.
77
+ target_bbox = DsBoundingBox(
78
+ element.cluster.bbox.to_bottom_left_origin(
79
+ page_no_to_page[element.page_no].size.height
80
+ ).as_tuple()
81
+ )
82
+
83
+ if isinstance(element, TextElement):
84
+ main_text.append(
85
+ BaseText(
86
+ text=element.text,
87
+ obj_type=layout_label_to_ds_type.get(element.label),
88
+ name=element.label,
89
+ prov=[
90
+ Prov(
91
+ bbox=target_bbox,
92
+ page=element.page_no + 1,
93
+ span=[0, len(element.text)],
94
+ )
95
+ ],
96
+ )
97
+ )
98
+ elif isinstance(element, Table):
99
+ index = len(tables)
100
+ ref_str = f"#/tables/{index}"
101
+ main_text.append(
102
+ Ref(
103
+ name=element.label,
104
+ obj_type=layout_label_to_ds_type.get(element.label),
105
+ ref=ref_str,
106
+ ),
107
+ )
108
+
109
+ # Initialise empty table data grid (only empty cells)
110
+ table_data = [
111
+ [
112
+ TableCell(
113
+ text="",
114
+ # bbox=[0,0,0,0],
115
+ spans=[[i, j]],
116
+ obj_type="body",
117
+ )
118
+ for j in range(element.num_cols)
119
+ ]
120
+ for i in range(element.num_rows)
121
+ ]
26
122
 
27
- def __call__(self, conv_res: ConversionResult) -> DsDocument:
28
- ds_doc = conv_res._to_ds_document()
29
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
123
+ # Overwrite cells in table data for which there is actual cell content.
124
+ for cell in element.table_cells:
125
+ for i in range(
126
+ min(cell.start_row_offset_idx, element.num_rows),
127
+ min(cell.end_row_offset_idx, element.num_rows),
128
+ ):
129
+ for j in range(
130
+ min(cell.start_col_offset_idx, element.num_cols),
131
+ min(cell.end_col_offset_idx, element.num_cols),
132
+ ):
133
+ celltype = "body"
134
+ if cell.column_header:
135
+ celltype = "col_header"
136
+ elif cell.row_header:
137
+ celltype = "row_header"
138
+ elif cell.row_section:
139
+ celltype = "row_section"
30
140
 
31
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
32
- ds_doc_dict = to_legacy_document_format(
33
- glm_doc, ds_doc_dict, update_name_label=True
141
+ def make_spans(cell):
142
+ for rspan in range(
143
+ min(cell.start_row_offset_idx, element.num_rows),
144
+ min(cell.end_row_offset_idx, element.num_rows),
145
+ ):
146
+ for cspan in range(
147
+ min(
148
+ cell.start_col_offset_idx, element.num_cols
149
+ ),
150
+ min(cell.end_col_offset_idx, element.num_cols),
151
+ ):
152
+ yield [rspan, cspan]
153
+
154
+ spans = list(make_spans(cell))
155
+ if cell.bbox is not None:
156
+ bbox = cell.bbox.to_bottom_left_origin(
157
+ page_no_to_page[element.page_no].size.height
158
+ ).as_tuple()
159
+ else:
160
+ bbox = None
161
+
162
+ table_data[i][j] = TableCell(
163
+ text=cell.text,
164
+ bbox=bbox,
165
+ # col=j,
166
+ # row=i,
167
+ spans=spans,
168
+ obj_type=celltype,
169
+ # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
170
+ # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
171
+ )
172
+
173
+ tables.append(
174
+ DsSchemaTable(
175
+ num_cols=element.num_cols,
176
+ num_rows=element.num_rows,
177
+ obj_type=layout_label_to_ds_type.get(element.label),
178
+ data=table_data,
179
+ prov=[
180
+ Prov(
181
+ bbox=target_bbox,
182
+ page=element.page_no + 1,
183
+ span=[0, 0],
184
+ )
185
+ ],
186
+ )
187
+ )
188
+
189
+ elif isinstance(element, FigureElement):
190
+ index = len(figures)
191
+ ref_str = f"#/figures/{index}"
192
+ main_text.append(
193
+ Ref(
194
+ name=element.label,
195
+ obj_type=layout_label_to_ds_type.get(element.label),
196
+ ref=ref_str,
197
+ ),
198
+ )
199
+ figures.append(
200
+ Figure(
201
+ prov=[
202
+ Prov(
203
+ bbox=target_bbox,
204
+ page=element.page_no + 1,
205
+ span=[0, 0],
206
+ )
207
+ ],
208
+ obj_type=layout_label_to_ds_type.get(element.label),
209
+ # data=[[]],
210
+ )
211
+ )
212
+
213
+ page_dimensions = [
214
+ PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
215
+ for p in conv_res.pages
216
+ if p.size is not None
217
+ ]
218
+
219
+ ds_doc: DsDocument = DsDocument(
220
+ name=title,
221
+ description=desc,
222
+ file_info=file_info,
223
+ main_text=main_text,
224
+ tables=tables,
225
+ figures=figures,
226
+ page_dimensions=page_dimensions,
34
227
  )
35
228
 
36
- exported_doc = DsDocument.model_validate(ds_doc_dict)
229
+ return ds_doc
230
+
231
+ def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
232
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
233
+ ds_doc = self._to_legacy_document(conv_res)
234
+ ds_doc_dict = ds_doc.model_dump(by_alias=True)
235
+
236
+ glm_doc = self.model.apply_on_doc(ds_doc_dict)
237
+
238
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
37
239
 
38
240
  # DEBUG code:
39
- def draw_clusters_and_cells(ds_document, page_no):
241
+ def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
40
242
  clusters_to_draw = []
41
243
  image = copy.deepcopy(conv_res.pages[page_no].image)
42
244
  for ix, elem in enumerate(ds_document.main_text):
43
245
  if isinstance(elem, BaseText):
44
- prov = elem.prov[0]
246
+ prov = elem.prov[0] # type: ignore
45
247
  elif isinstance(elem, Ref):
46
248
  _, arr, index = elem.ref.split("/")
47
- index = int(index)
249
+ index = int(index) # type: ignore
48
250
  if arr == "tables":
49
251
  prov = ds_document.tables[index].prov[0]
50
252
  elif arr == "figures":
51
- prov = ds_document.figures[index].prov[0]
253
+ prov = ds_document.pictures[index].prov[0]
52
254
  else:
53
255
  prov = None
54
256
 
@@ -58,7 +260,7 @@ class GlmModel:
58
260
  id=ix,
59
261
  label=elem.name,
60
262
  bbox=BoundingBox.from_tuple(
61
- coord=prov.bbox,
263
+ coord=prov.bbox, # type: ignore
62
264
  origin=CoordOrigin.BOTTOMLEFT,
63
265
  ).to_top_left_origin(conv_res.pages[page_no].size.height),
64
266
  )
@@ -78,9 +280,21 @@ class GlmModel:
78
280
  for tc in c.cells: # [:1]:
79
281
  x0, y0, x1, y1 = tc.bbox.as_tuple()
80
282
  draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
81
- image.show()
82
283
 
83
- # draw_clusters_and_cells(ds_doc, 0)
84
- # draw_clusters_and_cells(exported_doc, 0)
284
+ if show:
285
+ image.show()
286
+ else:
287
+ out_path: Path = (
288
+ Path(settings.debug.debug_output_path)
289
+ / f"debug_{conv_res.input.file.stem}"
290
+ )
291
+ out_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ out_file = out_path / f"doc_page_{page_no:05}.png"
294
+ image.save(str(out_file), format="png")
295
+
296
+ # for item in ds_doc.page_dimensions:
297
+ # page_no = item.page
298
+ # draw_clusters_and_cells(ds_doc, page_no)
85
299
 
86
- return exported_doc
300
+ return docling_doc
@@ -2,10 +2,14 @@ import logging
2
2
  from typing import Iterable
3
3
 
4
4
  import numpy
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
6
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
7
9
  from docling.datamodel.pipeline_options import EasyOcrOptions
10
+ from docling.datamodel.settings import settings
8
11
  from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
9
13
 
10
14
  _log = logging.getLogger(__name__)
11
15
 
@@ -32,54 +36,65 @@ class EasyOcrModel(BaseOcrModel):
32
36
  download_enabled=self.options.download_enabled,
33
37
  )
34
38
 
35
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
39
+ def __call__(
40
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
41
+ ) -> Iterable[Page]:
36
42
 
37
43
  if not self.enabled:
38
44
  yield from page_batch
39
45
  return
40
46
 
41
47
  for page in page_batch:
42
- ocr_rects = self.get_ocr_rects(page)
43
-
44
- all_ocr_cells = []
45
- for ocr_rect in ocr_rects:
46
- # Skip zero area boxes
47
- if ocr_rect.area() == 0:
48
- continue
49
- high_res_image = page._backend.get_page_image(
50
- scale=self.scale, cropbox=ocr_rect
51
- )
52
- im = numpy.array(high_res_image)
53
- result = self.reader.readtext(im)
54
-
55
- del high_res_image
56
- del im
57
-
58
- cells = [
59
- OcrCell(
60
- id=ix,
61
- text=line[1],
62
- confidence=line[2],
63
- bbox=BoundingBox.from_tuple(
64
- coord=(
65
- (line[0][0][0] / self.scale) + ocr_rect.l,
66
- (line[0][0][1] / self.scale) + ocr_rect.t,
67
- (line[0][2][0] / self.scale) + ocr_rect.l,
68
- (line[0][2][1] / self.scale) + ocr_rect.t,
69
- ),
70
- origin=CoordOrigin.TOPLEFT,
71
- ),
72
- )
73
- for ix, line in enumerate(result)
74
- ]
75
- all_ocr_cells.extend(cells)
76
48
 
77
- ## Remove OCR cells which overlap with programmatic cells.
78
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
49
+ assert page._backend is not None
50
+ if not page._backend.is_valid():
51
+ yield page
52
+ else:
53
+ with TimeRecorder(conv_res, "ocr"):
54
+ ocr_rects = self.get_ocr_rects(page)
55
+
56
+ all_ocr_cells = []
57
+ for ocr_rect in ocr_rects:
58
+ # Skip zero area boxes
59
+ if ocr_rect.area() == 0:
60
+ continue
61
+ high_res_image = page._backend.get_page_image(
62
+ scale=self.scale, cropbox=ocr_rect
63
+ )
64
+ im = numpy.array(high_res_image)
65
+ result = self.reader.readtext(im)
66
+
67
+ del high_res_image
68
+ del im
69
+
70
+ cells = [
71
+ OcrCell(
72
+ id=ix,
73
+ text=line[1],
74
+ confidence=line[2],
75
+ bbox=BoundingBox.from_tuple(
76
+ coord=(
77
+ (line[0][0][0] / self.scale) + ocr_rect.l,
78
+ (line[0][0][1] / self.scale) + ocr_rect.t,
79
+ (line[0][2][0] / self.scale) + ocr_rect.l,
80
+ (line[0][2][1] / self.scale) + ocr_rect.t,
81
+ ),
82
+ origin=CoordOrigin.TOPLEFT,
83
+ ),
84
+ )
85
+ for ix, line in enumerate(result)
86
+ ]
87
+ all_ocr_cells.extend(cells)
88
+
89
+ ## Remove OCR cells which overlap with programmatic cells.
90
+ filtered_ocr_cells = self.filter_ocr_cells(
91
+ all_ocr_cells, page.cells
92
+ )
79
93
 
80
- page.cells.extend(filtered_ocr_cells)
94
+ page.cells.extend(filtered_ocr_cells)
81
95
 
82
- # DEBUG code:
83
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
96
+ # DEBUG code:
97
+ if settings.debug.visualize_ocr:
98
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
84
99
 
85
- yield page
100
+ yield page