docling 2.23.0__py3-none-any.whl → 2.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
14
14
  from enum import Enum, unique
15
15
  from io import BytesIO
16
16
  from pathlib import Path
17
- from typing import Any, Final, Optional, Union
17
+ from typing import Final, Optional, Union
18
18
 
19
19
  from bs4 import BeautifulSoup, Tag
20
20
  from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
1406
1406
  http://oasis-open.org/specs/soextblx.dtd
1407
1407
  """
1408
1408
 
1409
+ class ColInfo(TypedDict):
1410
+ ncols: int
1411
+ colinfo: list[dict]
1412
+
1409
1413
  class MinColInfoType(TypedDict):
1410
1414
  offset: list[int]
1411
1415
  colwidth: list[int]
@@ -1425,7 +1429,7 @@ class XmlTable:
1425
1429
  self.empty_text = ""
1426
1430
  self._soup = BeautifulSoup(input, features="xml")
1427
1431
 
1428
- def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
1432
+ def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
1429
1433
  """Create a unified range along the table groups.
1430
1434
 
1431
1435
  Args:
@@ -1532,19 +1536,26 @@ class XmlTable:
1532
1536
  Returns:
1533
1537
  A docling table object.
1534
1538
  """
1535
- tgs_align = []
1536
- tg_secs = table.find_all("tgroup")
1539
+ tgs_align: list[XmlTable.ColInfo] = []
1540
+ tg_secs = table("tgroup")
1537
1541
  if tg_secs:
1538
1542
  for tg_sec in tg_secs:
1539
- ncols = tg_sec.get("cols", None)
1540
- if ncols:
1541
- ncols = int(ncols)
1542
- tg_align = {"ncols": ncols, "colinfo": []}
1543
- cs_secs = tg_sec.find_all("colspec")
1543
+ if not isinstance(tg_sec, Tag):
1544
+ continue
1545
+ col_val = tg_sec.get("cols")
1546
+ ncols = (
1547
+ int(col_val)
1548
+ if isinstance(col_val, str) and col_val.isnumeric()
1549
+ else 1
1550
+ )
1551
+ tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
1552
+ cs_secs = tg_sec("colspec")
1544
1553
  if cs_secs:
1545
1554
  for cs_sec in cs_secs:
1546
- colname = cs_sec.get("colname", None)
1547
- colwidth = cs_sec.get("colwidth", None)
1555
+ if not isinstance(cs_sec, Tag):
1556
+ continue
1557
+ colname = cs_sec.get("colname")
1558
+ colwidth = cs_sec.get("colwidth")
1548
1559
  tg_align["colinfo"].append(
1549
1560
  {"colname": colname, "colwidth": colwidth}
1550
1561
  )
@@ -1565,16 +1576,23 @@ class XmlTable:
1565
1576
  table_data: list[TableCell] = []
1566
1577
  i_row_global = 0
1567
1578
  is_row_empty: bool = True
1568
- tg_secs = table.find_all("tgroup")
1579
+ tg_secs = table("tgroup")
1569
1580
  if tg_secs:
1570
1581
  for itg, tg_sec in enumerate(tg_secs):
1582
+ if not isinstance(tg_sec, Tag):
1583
+ continue
1571
1584
  tg_range = tgs_range[itg]
1572
- row_secs = tg_sec.find_all(["row", "tr"])
1585
+ row_secs = tg_sec(["row", "tr"])
1573
1586
 
1574
1587
  if row_secs:
1575
1588
  for row_sec in row_secs:
1576
- entry_secs = row_sec.find_all(["entry", "td"])
1577
- is_header: bool = row_sec.parent.name in ["thead"]
1589
+ if not isinstance(row_sec, Tag):
1590
+ continue
1591
+ entry_secs = row_sec(["entry", "td"])
1592
+ is_header: bool = (
1593
+ row_sec.parent is not None
1594
+ and row_sec.parent.name == "thead"
1595
+ )
1578
1596
 
1579
1597
  ncols = 0
1580
1598
  local_row: list[TableCell] = []
@@ -1582,23 +1600,26 @@ class XmlTable:
1582
1600
  if entry_secs:
1583
1601
  wrong_nbr_cols = False
1584
1602
  for ientry, entry_sec in enumerate(entry_secs):
1603
+ if not isinstance(entry_sec, Tag):
1604
+ continue
1585
1605
  text = entry_sec.get_text().strip()
1586
1606
 
1587
1607
  # start-end
1588
- namest = entry_sec.attrs.get("namest", None)
1589
- nameend = entry_sec.attrs.get("nameend", None)
1590
- if isinstance(namest, str) and namest.isnumeric():
1591
- namest = int(namest)
1592
- else:
1593
- namest = ientry + 1
1608
+ namest = entry_sec.get("namest")
1609
+ nameend = entry_sec.get("nameend")
1610
+ start = (
1611
+ int(namest)
1612
+ if isinstance(namest, str) and namest.isnumeric()
1613
+ else ientry + 1
1614
+ )
1594
1615
  if isinstance(nameend, str) and nameend.isnumeric():
1595
- nameend = int(nameend)
1616
+ end = int(nameend)
1596
1617
  shift = 0
1597
1618
  else:
1598
- nameend = ientry + 2
1619
+ end = ientry + 2
1599
1620
  shift = 1
1600
1621
 
1601
- if nameend > len(tg_range["cell_offst"]):
1622
+ if end > len(tg_range["cell_offst"]):
1602
1623
  wrong_nbr_cols = True
1603
1624
  self.nbr_messages += 1
1604
1625
  if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ class XmlTable:
1608
1629
  break
1609
1630
 
1610
1631
  range_ = [
1611
- tg_range["cell_offst"][namest - 1],
1612
- tg_range["cell_offst"][nameend - 1] - shift,
1632
+ tg_range["cell_offst"][start - 1],
1633
+ tg_range["cell_offst"][end - 1] - shift,
1613
1634
  ]
1614
1635
 
1615
1636
  # add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ class XmlTable:
1668
1689
  A docling table data.
1669
1690
  """
1670
1691
  section = self._soup.find("table")
1671
- if section is not None:
1692
+ if isinstance(section, Tag):
1672
1693
  table = self._parse_table(section)
1673
1694
  if table.num_rows == 0 or table.num_cols == 0:
1674
1695
  _log.warning("The parsed USPTO table is empty")
@@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
52
52
 
53
53
  sanitized_text = "".join(lines)
54
54
 
55
+ # Text normalization
56
+ sanitized_text = sanitized_text.replace("⁄", "/")
57
+ sanitized_text = sanitized_text.replace("’", "'")
58
+ sanitized_text = sanitized_text.replace("‘", "'")
59
+ sanitized_text = sanitized_text.replace("“", '"')
60
+ sanitized_text = sanitized_text.replace("”", '"')
61
+ sanitized_text = sanitized_text.replace("•", "·")
62
+
55
63
  return sanitized_text.strip() # Strip any leading or trailing whitespace
56
64
 
57
65
  def __call__(
@@ -0,0 +1,389 @@
1
+ import copy
2
+ import random
3
+ from pathlib import Path
4
+ from typing import Dict, List
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
10
+ DocItemLabel,
11
+ DoclingDocument,
12
+ DocumentOrigin,
13
+ GroupLabel,
14
+ NodeItem,
15
+ ProvenanceItem,
16
+ RefItem,
17
+ TableData,
18
+ )
19
+ from docling_core.types.doc.document import ContentLayer
20
+ from docling_core.types.legacy_doc.base import Ref
21
+ from docling_core.types.legacy_doc.document import BaseText
22
+ from docling_ibm_models.reading_order.reading_order_rb import (
23
+ PageElement as ReadingOrderPageElement,
24
+ )
25
+ from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
26
+ from PIL import ImageDraw
27
+ from pydantic import BaseModel, ConfigDict
28
+
29
+ from docling.datamodel.base_models import (
30
+ BasePageElement,
31
+ Cluster,
32
+ ContainerElement,
33
+ FigureElement,
34
+ Table,
35
+ TextElement,
36
+ )
37
+ from docling.datamodel.document import ConversionResult
38
+ from docling.datamodel.settings import settings
39
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
40
+
41
+
42
+ class ReadingOrderOptions(BaseModel):
43
+ model_config = ConfigDict(protected_namespaces=())
44
+
45
+ model_names: str = "" # e.g. "language;term;reference"
46
+
47
+
48
+ class ReadingOrderModel:
49
+ def __init__(self, options: ReadingOrderOptions):
50
+ self.options = options
51
+ self.ro_model = ReadingOrderPredictor()
52
+
53
+ def _assembled_to_readingorder_elements(
54
+ self, conv_res: ConversionResult
55
+ ) -> List[ReadingOrderPageElement]:
56
+
57
+ elements: List[ReadingOrderPageElement] = []
58
+ page_no_to_pages = {p.page_no: p for p in conv_res.pages}
59
+
60
+ for element in conv_res.assembled.elements:
61
+
62
+ page_height = page_no_to_pages[element.page_no].size.height # type: ignore
63
+ bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
64
+ text = element.text or ""
65
+
66
+ elements.append(
67
+ ReadingOrderPageElement(
68
+ cid=len(elements),
69
+ ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
70
+ text=text,
71
+ page_no=element.page_no,
72
+ page_size=page_no_to_pages[element.page_no].size,
73
+ label=element.label,
74
+ l=bbox.l,
75
+ r=bbox.r,
76
+ b=bbox.b,
77
+ t=bbox.t,
78
+ coord_origin=bbox.coord_origin,
79
+ )
80
+ )
81
+
82
+ return elements
83
+
84
+ def _add_child_elements(
85
+ self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
86
+ ):
87
+
88
+ child: Cluster
89
+ for child in element.cluster.children:
90
+ c_label = child.label
91
+ c_bbox = child.bbox.to_bottom_left_origin(
92
+ doc.pages[element.page_no + 1].size.height
93
+ )
94
+ c_text = " ".join(
95
+ [
96
+ cell.text.replace("\x02", "-").strip()
97
+ for cell in child.cells
98
+ if len(cell.text.strip()) > 0
99
+ ]
100
+ )
101
+
102
+ c_prov = ProvenanceItem(
103
+ page_no=element.page_no + 1, charspan=(0, len(c_text)), bbox=c_bbox
104
+ )
105
+ if c_label == DocItemLabel.LIST_ITEM:
106
+ # TODO: Infer if this is a numbered or a bullet list item
107
+ doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
108
+ elif c_label == DocItemLabel.SECTION_HEADER:
109
+ doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
110
+ else:
111
+ doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
112
+
113
+ def _readingorder_elements_to_docling_doc(
114
+ self,
115
+ conv_res: ConversionResult,
116
+ ro_elements: List[ReadingOrderPageElement],
117
+ el_to_captions_mapping: Dict[int, List[int]],
118
+ el_to_footnotes_mapping: Dict[int, List[int]],
119
+ el_merges_mapping: Dict[int, List[int]],
120
+ ) -> DoclingDocument:
121
+
122
+ id_to_elem = {
123
+ RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
124
+ for elem in conv_res.assembled.elements
125
+ }
126
+ cid_to_rels = {rel.cid: rel for rel in ro_elements}
127
+
128
+ origin = DocumentOrigin(
129
+ mimetype="application/pdf",
130
+ filename=conv_res.input.file.name,
131
+ binary_hash=conv_res.input.document_hash,
132
+ )
133
+ doc_name = Path(origin.filename).stem
134
+ out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
135
+
136
+ for page in conv_res.pages:
137
+ page_no = page.page_no + 1
138
+ size = page.size
139
+
140
+ assert size is not None
141
+
142
+ out_doc.add_page(page_no=page_no, size=size)
143
+
144
+ current_list = None
145
+ skippable_cids = {
146
+ cid
147
+ for mapping in (
148
+ el_to_captions_mapping,
149
+ el_to_footnotes_mapping,
150
+ el_merges_mapping,
151
+ )
152
+ for lst in mapping.values()
153
+ for cid in lst
154
+ }
155
+
156
+ page_no_to_pages = {p.page_no: p for p in conv_res.pages}
157
+
158
+ for rel in ro_elements:
159
+ if rel.cid in skippable_cids:
160
+ continue
161
+ element = id_to_elem[rel.ref.cref]
162
+
163
+ page_height = page_no_to_pages[element.page_no].size.height # type: ignore
164
+
165
+ if isinstance(element, TextElement):
166
+ if element.label == DocItemLabel.CODE:
167
+ cap_text = element.text
168
+ prov = ProvenanceItem(
169
+ page_no=element.page_no + 1,
170
+ charspan=(0, len(cap_text)),
171
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
172
+ )
173
+ code_item = out_doc.add_code(text=cap_text, prov=prov)
174
+
175
+ if rel.cid in el_to_captions_mapping.keys():
176
+ for caption_cid in el_to_captions_mapping[rel.cid]:
177
+ caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
178
+ new_cap_item = self._add_caption_or_footnote(
179
+ caption_elem, out_doc, code_item, page_height
180
+ )
181
+
182
+ code_item.captions.append(new_cap_item.get_ref())
183
+
184
+ if rel.cid in el_to_footnotes_mapping.keys():
185
+ for footnote_cid in el_to_footnotes_mapping[rel.cid]:
186
+ footnote_elem = id_to_elem[
187
+ cid_to_rels[footnote_cid].ref.cref
188
+ ]
189
+ new_footnote_item = self._add_caption_or_footnote(
190
+ footnote_elem, out_doc, code_item, page_height
191
+ )
192
+
193
+ code_item.footnotes.append(new_footnote_item.get_ref())
194
+ else:
195
+
196
+ new_item, current_list = self._handle_text_element(
197
+ element, out_doc, current_list, page_height
198
+ )
199
+
200
+ if rel.cid in el_merges_mapping.keys():
201
+ for merged_cid in el_merges_mapping[rel.cid]:
202
+ merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
203
+
204
+ self._merge_elements(
205
+ element, merged_elem, new_item, page_height
206
+ )
207
+
208
+ elif isinstance(element, Table):
209
+
210
+ tbl_data = TableData(
211
+ num_rows=element.num_rows,
212
+ num_cols=element.num_cols,
213
+ table_cells=element.table_cells,
214
+ )
215
+
216
+ prov = ProvenanceItem(
217
+ page_no=element.page_no + 1,
218
+ charspan=(0, 0),
219
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
220
+ )
221
+
222
+ tbl = out_doc.add_table(
223
+ data=tbl_data, prov=prov, label=element.cluster.label
224
+ )
225
+
226
+ if rel.cid in el_to_captions_mapping.keys():
227
+ for caption_cid in el_to_captions_mapping[rel.cid]:
228
+ caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
229
+ new_cap_item = self._add_caption_or_footnote(
230
+ caption_elem, out_doc, tbl, page_height
231
+ )
232
+
233
+ tbl.captions.append(new_cap_item.get_ref())
234
+
235
+ if rel.cid in el_to_footnotes_mapping.keys():
236
+ for footnote_cid in el_to_footnotes_mapping[rel.cid]:
237
+ footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
238
+ new_footnote_item = self._add_caption_or_footnote(
239
+ footnote_elem, out_doc, tbl, page_height
240
+ )
241
+
242
+ tbl.footnotes.append(new_footnote_item.get_ref())
243
+
244
+ # TODO: Consider adding children of Table.
245
+
246
+ elif isinstance(element, FigureElement):
247
+ cap_text = ""
248
+ prov = ProvenanceItem(
249
+ page_no=element.page_no + 1,
250
+ charspan=(0, len(cap_text)),
251
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
252
+ )
253
+ pic = out_doc.add_picture(prov=prov)
254
+
255
+ if rel.cid in el_to_captions_mapping.keys():
256
+ for caption_cid in el_to_captions_mapping[rel.cid]:
257
+ caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
258
+ new_cap_item = self._add_caption_or_footnote(
259
+ caption_elem, out_doc, pic, page_height
260
+ )
261
+
262
+ pic.captions.append(new_cap_item.get_ref())
263
+
264
+ if rel.cid in el_to_footnotes_mapping.keys():
265
+ for footnote_cid in el_to_footnotes_mapping[rel.cid]:
266
+ footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
267
+ new_footnote_item = self._add_caption_or_footnote(
268
+ footnote_elem, out_doc, pic, page_height
269
+ )
270
+
271
+ pic.footnotes.append(new_footnote_item.get_ref())
272
+
273
+ self._add_child_elements(element, pic, out_doc)
274
+
275
+ elif isinstance(element, ContainerElement): # Form, KV region
276
+ label = element.label
277
+ group_label = GroupLabel.UNSPECIFIED
278
+ if label == DocItemLabel.FORM:
279
+ group_label = GroupLabel.FORM_AREA
280
+ elif label == DocItemLabel.KEY_VALUE_REGION:
281
+ group_label = GroupLabel.KEY_VALUE_AREA
282
+
283
+ container_el = out_doc.add_group(label=group_label)
284
+
285
+ self._add_child_elements(element, container_el, out_doc)
286
+
287
+ return out_doc
288
+
289
+ def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
290
+ assert isinstance(elem, TextElement)
291
+ text = elem.text
292
+ prov = ProvenanceItem(
293
+ page_no=elem.page_no + 1,
294
+ charspan=(0, len(text)),
295
+ bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
296
+ )
297
+ new_item = out_doc.add_text(
298
+ label=elem.label, text=text, prov=prov, parent=parent
299
+ )
300
+ return new_item
301
+
302
+ def _handle_text_element(self, element, out_doc, current_list, page_height):
303
+ cap_text = element.text
304
+
305
+ prov = ProvenanceItem(
306
+ page_no=element.page_no + 1,
307
+ charspan=(0, len(cap_text)),
308
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
309
+ )
310
+ label = element.label
311
+ if label == DocItemLabel.LIST_ITEM:
312
+ if current_list is None:
313
+ current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")
314
+
315
+ # TODO: Infer if this is a numbered or a bullet list item
316
+ new_item = out_doc.add_list_item(
317
+ text=cap_text, enumerated=False, prov=prov, parent=current_list
318
+ )
319
+ elif label == DocItemLabel.SECTION_HEADER:
320
+ current_list = None
321
+
322
+ new_item = out_doc.add_heading(text=cap_text, prov=prov)
323
+ elif label == DocItemLabel.FORMULA:
324
+ current_list = None
325
+
326
+ new_item = out_doc.add_text(
327
+ label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
328
+ )
329
+ else:
330
+ current_list = None
331
+
332
+ content_layer = ContentLayer.BODY
333
+ if element.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
334
+ content_layer = ContentLayer.FURNITURE
335
+
336
+ new_item = out_doc.add_text(
337
+ label=element.label,
338
+ text=cap_text,
339
+ prov=prov,
340
+ content_layer=content_layer,
341
+ )
342
+ return new_item, current_list
343
+
344
+ def _merge_elements(self, element, merged_elem, new_item, page_height):
345
+ assert isinstance(
346
+ merged_elem, type(element)
347
+ ), "Merged element must be of same type as element."
348
+ assert (
349
+ merged_elem.label == new_item.label
350
+ ), "Labels of merged elements must match."
351
+ prov = ProvenanceItem(
352
+ page_no=element.page_no + 1,
353
+ charspan=(
354
+ len(new_item.text) + 1,
355
+ len(new_item.text) + 1 + len(merged_elem.text),
356
+ ),
357
+ bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
358
+ )
359
+ new_item.text += f" {merged_elem.text}"
360
+ new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
361
+ new_item.prov.append(prov)
362
+
363
+ def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
364
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
365
+ page_elements = self._assembled_to_readingorder_elements(conv_res)
366
+
367
+ # Apply reading order
368
+ sorted_elements = self.ro_model.predict_reading_order(
369
+ page_elements=page_elements
370
+ )
371
+ el_to_captions_mapping = self.ro_model.predict_to_captions(
372
+ sorted_elements=sorted_elements
373
+ )
374
+ el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
375
+ sorted_elements=sorted_elements
376
+ )
377
+ el_merges_mapping = self.ro_model.predict_merges(
378
+ sorted_elements=sorted_elements
379
+ )
380
+
381
+ docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
382
+ conv_res,
383
+ sorted_elements,
384
+ el_to_captions_mapping,
385
+ el_to_footnotes_mapping,
386
+ el_merges_mapping,
387
+ )
388
+
389
+ return docling_doc
@@ -114,7 +114,9 @@ class TesseractOcrCliModel(BaseOcrModel):
114
114
  # _log.info("df: ", df.head())
115
115
 
116
116
  # Filter rows that contain actual text (ignore header or empty rows)
117
- df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
117
+ df_filtered = df[
118
+ df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
119
+ ]
118
120
 
119
121
  return df_filtered
120
122
 
@@ -27,7 +27,6 @@ from docling.models.document_picture_classifier import (
27
27
  DocumentPictureClassifier,
28
28
  DocumentPictureClassifierOptions,
29
29
  )
30
- from docling.models.ds_glm_model import GlmModel, GlmOptions
31
30
  from docling.models.easyocr_model import EasyOcrModel
32
31
  from docling.models.layout_model import LayoutModel
33
32
  from docling.models.ocr_mac_model import OcrMacModel
@@ -40,6 +39,7 @@ from docling.models.picture_description_api_model import PictureDescriptionApiMo
40
39
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
41
40
  from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
42
41
  from docling.models.rapid_ocr_model import RapidOcrModel
42
+ from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
43
43
  from docling.models.table_structure_model import TableStructureModel
44
44
  from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
45
45
  from docling.models.tesseract_ocr_model import TesseractOcrModel
@@ -76,7 +76,7 @@ class StandardPdfPipeline(PaginatedPipeline):
76
76
  or self.pipeline_options.generate_table_images
77
77
  )
78
78
 
79
- self.glm_model = GlmModel(options=GlmOptions())
79
+ self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
80
80
 
81
81
  if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
82
82
  raise RuntimeError(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.23.0
3
+ Version: 2.24.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,11 +25,10 @@ Provides-Extra: ocrmac
25
25
  Provides-Extra: rapidocr
26
26
  Provides-Extra: tesserocr
27
27
  Provides-Extra: vlm
28
- Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
28
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
29
29
  Requires-Dist: certifi (>=2024.7.4)
30
- Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
31
30
  Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
32
- Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
+ Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
32
  Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
34
33
  Requires-Dist: easyocr (>=1.7,<2.0)
35
34
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
@@ -5,7 +5,7 @@ docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQ
5
5
  docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
6
6
  docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
7
7
  docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
8
- docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
8
+ docling/backend/html_backend.py,sha256=BxYvYmgcio6IqROMFKgyYyoankcNUccalCeYlmTE4fk,16094
9
9
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
11
11
  docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
@@ -15,8 +15,8 @@ docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4y
15
15
  docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
16
16
  docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
17
17
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling/backend/xml/jats_backend.py,sha256=JI1iibmrob9Gv9y7zoFncavQ0oJaGWnQoLkozAIiTQU,27513
19
- docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
18
+ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqafCvdr7s,24945
19
+ docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDPcrI1IbQ,71040
20
20
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
21
21
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
@@ -34,23 +34,23 @@ docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,
34
34
  docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
35
35
  docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
36
36
  docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
37
- docling/models/ds_glm_model.py,sha256=1jLEM-B_oHFevKq23zDQpdifE3eJL7qiLr5YLpEf1kQ,15217
38
37
  docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
39
38
  docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
40
39
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
41
- docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
40
+ docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
42
41
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
43
42
  docling/models/picture_description_api_model.py,sha256=SKNoHpqzbfM8iO-DJJ4ccyNVqO0B2d9neLBnXqt50FY,3186
44
43
  docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0ZxwO7yubhh2RkaC_8e8,1910
45
44
  docling/models/picture_description_vlm_model.py,sha256=a2vYUdlcA0--_8neY0tTiU8reCf29NCbVMKwWdMy2QQ,3653
46
45
  docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
46
+ docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
47
47
  docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
48
- docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
48
+ docling/models/tesseract_ocr_cli_model.py,sha256=F5EhS4NDEmLkPq-a0P7o2LrzjmJgACzlYXTDvtD3NtY,9343
49
49
  docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
50
50
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
52
52
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
53
- docling/pipeline/standard_pdf_pipeline.py,sha256=Zoe8GGPujha16_TGYBAxcPriEwgYPaJPkp3BwG5XowU,12862
53
+ docling/pipeline/standard_pdf_pipeline.py,sha256=IQHktVYvueTrYnIgLonaMvfYKKsU3L-hC9dqrR-Lw8g,12904
54
54
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
55
55
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
@@ -62,8 +62,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
62
62
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
63
63
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
64
64
  docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
65
- docling-2.23.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
66
- docling-2.23.0.dist-info/METADATA,sha256=O4EJYC_yjLCFfKnhnzgSW4qGLOHaatDWDXsQS2EJDjU,8720
67
- docling-2.23.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
68
- docling-2.23.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
69
- docling-2.23.0.dist-info/RECORD,,
65
+ docling-2.24.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
66
+ docling-2.24.0.dist-info/METADATA,sha256=0MJ5mBt0GwsZotaSpHnAWzdzWcu_BQFGqGzNR3gRpG4,8672
67
+ docling-2.24.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
68
+ docling-2.24.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
69
+ docling-2.24.0.dist-info/RECORD,,