docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +379 -324
  12. docling/datamodel/pipeline_options.py +16 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +19 -6
  17. docling/models/ds_glm_model.py +220 -22
  18. docling/models/easyocr_model.py +45 -40
  19. docling/models/layout_model.py +130 -114
  20. docling/models/page_assemble_model.py +119 -95
  21. docling/models/page_preprocessing_model.py +61 -0
  22. docling/models/table_structure_model.py +122 -111
  23. docling/models/tesseract_ocr_cli_model.py +65 -58
  24. docling/models/tesseract_ocr_model.py +58 -50
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.1.0.dist-info/METADATA +149 -0
  31. docling-2.1.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.0.dist-info/METADATA +0 -380
  35. docling-1.19.0.dist-info/RECORD +0 -34
  36. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -1,39 +1,237 @@
1
1
  import copy
2
2
  import random
3
+ from typing import List, Union
3
4
 
4
5
  from deepsearch_glm.nlp_utils import init_nlp_model
5
- from deepsearch_glm.utils.doc_utils import to_legacy_document_format
6
+ from deepsearch_glm.utils.doc_utils import to_docling_document
6
7
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
7
- from docling_core.types import BaseText
8
- from docling_core.types import Document as DsDocument
9
- from docling_core.types import Ref
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
9
+ from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
10
+ from docling_core.types.legacy_doc.base import (
11
+ Figure,
12
+ PageDimensions,
13
+ PageReference,
14
+ Prov,
15
+ Ref,
16
+ )
17
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
18
+ from docling_core.types.legacy_doc.base import TableCell
19
+ from docling_core.types.legacy_doc.document import BaseText
20
+ from docling_core.types.legacy_doc.document import (
21
+ CCSDocumentDescription as DsDocumentDescription,
22
+ )
23
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
24
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
10
25
  from PIL import ImageDraw
26
+ from pydantic import BaseModel, ConfigDict
11
27
 
12
- from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
13
- from docling.datamodel.document import ConversionResult
28
+ from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
29
+ from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
30
+ from docling.utils.utils import create_hash
31
+
32
+
33
+ class GlmOptions(BaseModel):
34
+ model_config = ConfigDict(protected_namespaces=())
35
+
36
+ model_names: str = "" # e.g. "language;term;reference"
14
37
 
15
38
 
16
39
  class GlmModel:
17
- def __init__(self, config):
18
- self.config = config
19
- self.model_names = self.config.get(
20
- "model_names", ""
21
- ) # "language;term;reference"
40
+ def __init__(self, options: GlmOptions):
41
+ self.options = options
42
+
22
43
  load_pretrained_nlp_models()
23
- # model = init_nlp_model(model_names="language;term;reference")
24
- model = init_nlp_model(model_names=self.model_names)
25
- self.model = model
44
+ self.model = init_nlp_model(model_names=self.options.model_names)
45
+
46
+ def _to_legacy_document(self, conv_res) -> DsDocument:
47
+ title = ""
48
+ desc: DsDocumentDescription = DsDocumentDescription(logs=[])
49
+
50
+ page_hashes = [
51
+ PageReference(
52
+ hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
53
+ page=p.page_no + 1,
54
+ model="default",
55
+ )
56
+ for p in conv_res.pages
57
+ ]
58
+
59
+ file_info = DsFileInfoObject(
60
+ filename=conv_res.input.file.name,
61
+ document_hash=conv_res.input.document_hash,
62
+ num_pages=conv_res.input.page_count,
63
+ page_hashes=page_hashes,
64
+ )
65
+
66
+ main_text: List[Union[Ref, BaseText]] = []
67
+ tables: List[DsSchemaTable] = []
68
+ figures: List[Figure] = []
69
+
70
+ page_no_to_page = {p.page_no: p for p in conv_res.pages}
71
+
72
+ for element in conv_res.assembled.elements:
73
+ # Convert bboxes to lower-left origin.
74
+ target_bbox = DsBoundingBox(
75
+ element.cluster.bbox.to_bottom_left_origin(
76
+ page_no_to_page[element.page_no].size.height
77
+ ).as_tuple()
78
+ )
79
+
80
+ if isinstance(element, TextElement):
81
+ main_text.append(
82
+ BaseText(
83
+ text=element.text,
84
+ obj_type=layout_label_to_ds_type.get(element.label),
85
+ name=element.label,
86
+ prov=[
87
+ Prov(
88
+ bbox=target_bbox,
89
+ page=element.page_no + 1,
90
+ span=[0, len(element.text)],
91
+ )
92
+ ],
93
+ )
94
+ )
95
+ elif isinstance(element, Table):
96
+ index = len(tables)
97
+ ref_str = f"#/tables/{index}"
98
+ main_text.append(
99
+ Ref(
100
+ name=element.label,
101
+ obj_type=layout_label_to_ds_type.get(element.label),
102
+ ref=ref_str,
103
+ ),
104
+ )
105
+
106
+ # Initialise empty table data grid (only empty cells)
107
+ table_data = [
108
+ [
109
+ TableCell(
110
+ text="",
111
+ # bbox=[0,0,0,0],
112
+ spans=[[i, j]],
113
+ obj_type="body",
114
+ )
115
+ for j in range(element.num_cols)
116
+ ]
117
+ for i in range(element.num_rows)
118
+ ]
26
119
 
27
- def __call__(self, conv_res: ConversionResult) -> DsDocument:
28
- ds_doc = conv_res._to_ds_document()
120
+ # Overwrite cells in table data for which there is actual cell content.
121
+ for cell in element.table_cells:
122
+ for i in range(
123
+ min(cell.start_row_offset_idx, element.num_rows),
124
+ min(cell.end_row_offset_idx, element.num_rows),
125
+ ):
126
+ for j in range(
127
+ min(cell.start_col_offset_idx, element.num_cols),
128
+ min(cell.end_col_offset_idx, element.num_cols),
129
+ ):
130
+ celltype = "body"
131
+ if cell.column_header:
132
+ celltype = "col_header"
133
+ elif cell.row_header:
134
+ celltype = "row_header"
135
+ elif cell.row_section:
136
+ celltype = "row_section"
137
+
138
+ def make_spans(cell):
139
+ for rspan in range(
140
+ min(cell.start_row_offset_idx, element.num_rows),
141
+ min(cell.end_row_offset_idx, element.num_rows),
142
+ ):
143
+ for cspan in range(
144
+ min(
145
+ cell.start_col_offset_idx, element.num_cols
146
+ ),
147
+ min(cell.end_col_offset_idx, element.num_cols),
148
+ ):
149
+ yield [rspan, cspan]
150
+
151
+ spans = list(make_spans(cell))
152
+ if cell.bbox is not None:
153
+ bbox = cell.bbox.to_bottom_left_origin(
154
+ page_no_to_page[element.page_no].size.height
155
+ ).as_tuple()
156
+ else:
157
+ bbox = None
158
+
159
+ table_data[i][j] = TableCell(
160
+ text=cell.text,
161
+ bbox=bbox,
162
+ # col=j,
163
+ # row=i,
164
+ spans=spans,
165
+ obj_type=celltype,
166
+ # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
167
+ # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
168
+ )
169
+
170
+ tables.append(
171
+ DsSchemaTable(
172
+ num_cols=element.num_cols,
173
+ num_rows=element.num_rows,
174
+ obj_type=layout_label_to_ds_type.get(element.label),
175
+ data=table_data,
176
+ prov=[
177
+ Prov(
178
+ bbox=target_bbox,
179
+ page=element.page_no + 1,
180
+ span=[0, 0],
181
+ )
182
+ ],
183
+ )
184
+ )
185
+
186
+ elif isinstance(element, FigureElement):
187
+ index = len(figures)
188
+ ref_str = f"#/figures/{index}"
189
+ main_text.append(
190
+ Ref(
191
+ name=element.label,
192
+ obj_type=layout_label_to_ds_type.get(element.label),
193
+ ref=ref_str,
194
+ ),
195
+ )
196
+ figures.append(
197
+ Figure(
198
+ prov=[
199
+ Prov(
200
+ bbox=target_bbox,
201
+ page=element.page_no + 1,
202
+ span=[0, 0],
203
+ )
204
+ ],
205
+ obj_type=layout_label_to_ds_type.get(element.label),
206
+ # data=[[]],
207
+ )
208
+ )
209
+
210
+ page_dimensions = [
211
+ PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
212
+ for p in conv_res.pages
213
+ if p.size is not None
214
+ ]
215
+
216
+ ds_doc: DsDocument = DsDocument(
217
+ name=title,
218
+ description=desc,
219
+ file_info=file_info,
220
+ main_text=main_text,
221
+ tables=tables,
222
+ figures=figures,
223
+ page_dimensions=page_dimensions,
224
+ )
225
+
226
+ return ds_doc
227
+
228
+ def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
229
+ ds_doc = self._to_legacy_document(conv_res)
29
230
  ds_doc_dict = ds_doc.model_dump(by_alias=True)
30
231
 
31
232
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
32
- ds_doc_dict = to_legacy_document_format(
33
- glm_doc, ds_doc_dict, update_name_label=True
34
- )
35
233
 
36
- exported_doc = DsDocument.model_validate(ds_doc_dict)
234
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
37
235
 
38
236
  # DEBUG code:
39
237
  def draw_clusters_and_cells(ds_document, page_no):
@@ -48,7 +246,7 @@ class GlmModel:
48
246
  if arr == "tables":
49
247
  prov = ds_document.tables[index].prov[0]
50
248
  elif arr == "figures":
51
- prov = ds_document.figures[index].prov[0]
249
+ prov = ds_document.pictures[index].prov[0]
52
250
  else:
53
251
  prov = None
54
252
 
@@ -83,4 +281,4 @@ class GlmModel:
83
281
  # draw_clusters_and_cells(ds_doc, 0)
84
282
  # draw_clusters_and_cells(exported_doc, 0)
85
283
 
86
- return exported_doc
284
+ return docling_doc
@@ -2,8 +2,9 @@ import logging
2
2
  from typing import Iterable
3
3
 
4
4
  import numpy
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
6
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.datamodel.base_models import OcrCell, Page
7
8
  from docling.datamodel.pipeline_options import EasyOcrOptions
8
9
  from docling.models.base_ocr_model import BaseOcrModel
9
10
 
@@ -39,47 +40,51 @@ class EasyOcrModel(BaseOcrModel):
39
40
  return
40
41
 
41
42
  for page in page_batch:
42
- ocr_rects = self.get_ocr_rects(page)
43
-
44
- all_ocr_cells = []
45
- for ocr_rect in ocr_rects:
46
- # Skip zero area boxes
47
- if ocr_rect.area() == 0:
48
- continue
49
- high_res_image = page._backend.get_page_image(
50
- scale=self.scale, cropbox=ocr_rect
51
- )
52
- im = numpy.array(high_res_image)
53
- result = self.reader.readtext(im)
54
-
55
- del high_res_image
56
- del im
57
-
58
- cells = [
59
- OcrCell(
60
- id=ix,
61
- text=line[1],
62
- confidence=line[2],
63
- bbox=BoundingBox.from_tuple(
64
- coord=(
65
- (line[0][0][0] / self.scale) + ocr_rect.l,
66
- (line[0][0][1] / self.scale) + ocr_rect.t,
67
- (line[0][2][0] / self.scale) + ocr_rect.l,
68
- (line[0][2][1] / self.scale) + ocr_rect.t,
69
- ),
70
- origin=CoordOrigin.TOPLEFT,
71
- ),
43
+ assert page._backend is not None
44
+ if not page._backend.is_valid():
45
+ yield page
46
+ else:
47
+ ocr_rects = self.get_ocr_rects(page)
48
+
49
+ all_ocr_cells = []
50
+ for ocr_rect in ocr_rects:
51
+ # Skip zero area boxes
52
+ if ocr_rect.area() == 0:
53
+ continue
54
+ high_res_image = page._backend.get_page_image(
55
+ scale=self.scale, cropbox=ocr_rect
72
56
  )
73
- for ix, line in enumerate(result)
74
- ]
75
- all_ocr_cells.extend(cells)
57
+ im = numpy.array(high_res_image)
58
+ result = self.reader.readtext(im)
59
+
60
+ del high_res_image
61
+ del im
62
+
63
+ cells = [
64
+ OcrCell(
65
+ id=ix,
66
+ text=line[1],
67
+ confidence=line[2],
68
+ bbox=BoundingBox.from_tuple(
69
+ coord=(
70
+ (line[0][0][0] / self.scale) + ocr_rect.l,
71
+ (line[0][0][1] / self.scale) + ocr_rect.t,
72
+ (line[0][2][0] / self.scale) + ocr_rect.l,
73
+ (line[0][2][1] / self.scale) + ocr_rect.t,
74
+ ),
75
+ origin=CoordOrigin.TOPLEFT,
76
+ ),
77
+ )
78
+ for ix, line in enumerate(result)
79
+ ]
80
+ all_ocr_cells.extend(cells)
76
81
 
77
- ## Remove OCR cells which overlap with programmatic cells.
78
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
82
+ ## Remove OCR cells which overlap with programmatic cells.
83
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
79
84
 
80
- page.cells.extend(filtered_ocr_cells)
85
+ page.cells.extend(filtered_ocr_cells)
81
86
 
82
- # DEBUG code:
83
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
87
+ # DEBUG code:
88
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
84
89
 
85
- yield page
90
+ yield page