docling-core 2.8.0__tar.gz → 2.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (60) hide show
  1. {docling_core-2.8.0 → docling_core-2.9.0}/PKG-INFO +1 -1
  2. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/doc/document.py +42 -20
  3. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/legacy_doc/base.py +1 -0
  4. docling_core-2.9.0/docling_core/utils/legacy.py +346 -0
  5. {docling_core-2.8.0 → docling_core-2.9.0}/pyproject.toml +1 -1
  6. {docling_core-2.8.0 → docling_core-2.9.0}/LICENSE +0 -0
  7. {docling_core-2.8.0 → docling_core-2.9.0}/README.md +0 -0
  8. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/__init__.py +0 -0
  9. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/py.typed +0 -0
  10. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  11. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  12. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  13. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  14. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  15. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  16. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  17. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  18. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/search/__init__.py +0 -0
  19. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  20. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/search/mapping.py +0 -0
  21. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/search/meta.py +0 -0
  22. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/search/package.py +0 -0
  23. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/transforms/__init__.py +0 -0
  24. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/transforms/chunker/__init__.py +0 -0
  25. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/transforms/chunker/base.py +0 -0
  26. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  27. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  28. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/base.py +0 -0
  30. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/doc/__init__.py +0 -0
  31. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/doc/base.py +0 -0
  32. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/doc/labels.py +0 -0
  33. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  40. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  41. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  42. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/legacy_doc/document.py +0 -0
  43. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  44. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/nlp/__init__.py +0 -0
  45. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/nlp/qa.py +0 -0
  46. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/nlp/qa_labels.py +0 -0
  47. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/rec/__init__.py +0 -0
  48. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/rec/attribute.py +0 -0
  49. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/rec/base.py +0 -0
  50. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/rec/predicate.py +0 -0
  51. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/rec/record.py +0 -0
  52. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/rec/statement.py +0 -0
  53. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/types/rec/subject.py +0 -0
  54. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/utils/__init__.py +0 -0
  55. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/utils/alias.py +0 -0
  56. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/utils/file.py +0 -0
  57. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/utils/generate_docs.py +0 -0
  58. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/utils/generate_jsonschema.py +0 -0
  59. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/utils/validate.py +0 -0
  60. {docling_core-2.8.0 → docling_core-2.9.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.8.0
3
+ Version: 2.9.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -380,6 +380,7 @@ class DocumentOrigin(BaseModel):
380
380
  "application/vnd.openxmlformats-officedocument.presentationml.template",
381
381
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
382
382
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
383
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
383
384
  "text/asciidoc",
384
385
  "text/markdown",
385
386
  ]
@@ -445,7 +446,7 @@ class ImageRef(BaseModel):
445
446
  mimetype: str
446
447
  dpi: int
447
448
  size: Size
448
- uri: Union[AnyUrl, Path]
449
+ uri: Union[AnyUrl, Path] = Field(union_mode="left_to_right")
449
450
  _pil: Optional[PILImage.Image] = None
450
451
 
451
452
  @property
@@ -1668,7 +1669,7 @@ class DoclingDocument(BaseModel):
1668
1669
  self,
1669
1670
  root: Optional[NodeItem] = None,
1670
1671
  with_groups: bool = False,
1671
- traverse_pictures: bool = True,
1672
+ traverse_pictures: bool = False,
1672
1673
  page_no: Optional[int] = None,
1673
1674
  _level: int = 0, # fixed parameter, carries through the node nesting level
1674
1675
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
@@ -1685,30 +1686,31 @@ class DoclingDocument(BaseModel):
1685
1686
  if not root:
1686
1687
  root = self.body
1687
1688
 
1689
+ # Yield non-group items or group items when with_groups=True
1688
1690
  if not isinstance(root, GroupItem) or with_groups:
1689
1691
  if isinstance(root, DocItem):
1690
- if page_no is not None:
1691
- for prov in root.prov:
1692
- if prov.page_no == page_no:
1693
- yield root, _level
1694
- else:
1692
+ if page_no is None or any(
1693
+ prov.page_no == page_no for prov in root.prov
1694
+ ):
1695
1695
  yield root, _level
1696
1696
  else:
1697
1697
  yield root, _level
1698
1698
 
1699
+ # Handle picture traversal - only traverse children if requested
1700
+ if isinstance(root, PictureItem) and not traverse_pictures:
1701
+ return
1702
+
1699
1703
  # Traverse children
1700
1704
  for child_ref in root.children:
1701
1705
  child = child_ref.resolve(self)
1702
-
1703
1706
  if isinstance(child, NodeItem):
1704
- # If the child is a NodeItem, recursively traverse it
1705
- if not isinstance(child, PictureItem) or traverse_pictures:
1706
- yield from self.iterate_items(
1707
- child,
1708
- _level=_level + 1,
1709
- with_groups=with_groups,
1710
- page_no=page_no,
1711
- )
1707
+ yield from self.iterate_items(
1708
+ child,
1709
+ with_groups=with_groups,
1710
+ traverse_pictures=traverse_pictures,
1711
+ page_no=page_no,
1712
+ _level=_level + 1,
1713
+ )
1712
1714
 
1713
1715
  def _clear_picture_pil_cache(self):
1714
1716
  """Clear cache storage of all images."""
@@ -1864,7 +1866,7 @@ class DoclingDocument(BaseModel):
1864
1866
 
1865
1867
  """
1866
1868
  with open(filename, "r") as f:
1867
- return cls.model_validate(json.loads(f.read()))
1869
+ return cls.model_validate_json(f.read())
1868
1870
 
1869
1871
  def save_as_yaml(
1870
1872
  self,
@@ -2115,10 +2117,30 @@ class DoclingDocument(BaseModel):
2115
2117
  # Bold, Italic, or Bold-Italic
2116
2118
  # Hence, any underscore that we print into Markdown is coming from document text
2117
2119
  # That means we need to escape it, to properly reflect content in the markdown
2120
+ # However, we need to preserve underscores in image URLs
2121
+ # to maintain their validity
2122
+ # For example: ![image](path/to_image.png) should remain unchanged
2118
2123
  def escape_underscores(text):
2119
- # Replace "_" with "\_" only if it's not already escaped
2120
- escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
2121
- return escaped_text
2124
+ """Escape underscores but leave them intact in the URL.."""
2125
+ # Firstly, identify all the URL patterns.
2126
+ url_pattern = r"!\[.*?\]\((.*?)\)"
2127
+ parts = []
2128
+ last_end = 0
2129
+
2130
+ for match in re.finditer(url_pattern, text):
2131
+ # Text to add before the URL (needs to be escaped)
2132
+ before_url = text[last_end : match.start()]
2133
+ parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2134
+
2135
+ # Add the full URL part (do not escape)
2136
+ parts.append(match.group(0))
2137
+ last_end = match.end()
2138
+
2139
+ # Add the final part of the text (which needs to be escaped)
2140
+ if last_end < len(text):
2141
+ parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2142
+
2143
+ return "".join(parts)
2122
2144
 
2123
2145
  mdtext = escape_underscores(mdtext)
2124
2146
 
@@ -140,6 +140,7 @@ class BaseCell(AliasModel):
140
140
  obj_type: str = Field(
141
141
  alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
142
142
  )
143
+ payload: Optional[dict] = None
143
144
 
144
145
  def get_location_tokens(
145
146
  self,
@@ -0,0 +1,346 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Utilities for converting between legacy and new document format."""
7
+
8
+ import hashlib
9
+ import uuid
10
+ from typing import Union
11
+
12
+ from docling_core.types.doc import (
13
+ DocItem,
14
+ DocItemLabel,
15
+ DoclingDocument,
16
+ PictureItem,
17
+ SectionHeaderItem,
18
+ TableCell,
19
+ TableItem,
20
+ TextItem,
21
+ )
22
+ from docling_core.types.doc.document import ListItem
23
+ from docling_core.types.legacy_doc.base import (
24
+ BaseCell,
25
+ BaseText,
26
+ Figure,
27
+ GlmTableCell,
28
+ PageDimensions,
29
+ PageReference,
30
+ Prov,
31
+ Ref,
32
+ )
33
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
34
+ from docling_core.types.legacy_doc.base import TableCell as DsTableCell
35
+ from docling_core.types.legacy_doc.document import (
36
+ CCSDocumentDescription as DsDocumentDescription,
37
+ )
38
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
39
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
40
+
41
+
42
+ def _create_hash(string: str):
43
+ hasher = hashlib.sha256()
44
+ hasher.update(string.encode("utf-8"))
45
+
46
+ return hasher.hexdigest()
47
+
48
+
49
def doc_item_label_to_legacy_type(label: DocItemLabel):
    """Map a DocItemLabel to the type string used by the legacy format.

    Labels without an explicit entry in the table below are returned as their
    raw enum value.
    """
    legacy_types = {
        # Structural headings
        DocItemLabel.TITLE: "title",
        DocItemLabel.SECTION_HEADER: "subtitle-level-1",
        DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
        # Page furniture and annotations
        DocItemLabel.PAGE_HEADER: "page-header",
        DocItemLabel.PAGE_FOOTER: "page-footer",
        DocItemLabel.FOOTNOTE: "footnote",
        DocItemLabel.CAPTION: "caption",
        # Floating objects and special content
        DocItemLabel.TABLE: "table",
        DocItemLabel.PICTURE: "figure",
        DocItemLabel.FORMULA: "equation",
        DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
        DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
        # Everything text-like collapses to a plain paragraph
        DocItemLabel.TEXT: "paragraph",
        DocItemLabel.PARAGRAPH: "paragraph",
        DocItemLabel.LIST_ITEM: "paragraph",
        DocItemLabel.CODE: "paragraph",
    }
    legacy_type = legacy_types.get(label)
    # All table values are non-empty strings, so None means "no entry".
    if legacy_type is None:
        return label.value
    return legacy_type
72
+
73
+
74
def doc_item_label_to_legacy_name(label: DocItemLabel):
    """Map a DocItemLabel to the display name used by the legacy format.

    Labels without an explicit entry in the table below are returned as their
    raw enum value.
    """
    legacy_names = {
        DocItemLabel.TITLE: "Title",
        DocItemLabel.SECTION_HEADER: "Section-header",
        DocItemLabel.DOCUMENT_INDEX: "Document Index",
        DocItemLabel.TEXT: "Text",
        DocItemLabel.PARAGRAPH: "paragraph",
        DocItemLabel.LIST_ITEM: "List-item",
        DocItemLabel.CODE: "Code",
        DocItemLabel.FORMULA: "Formula",
        DocItemLabel.CAPTION: "Caption",
        DocItemLabel.FOOTNOTE: "Footnote",
        DocItemLabel.PAGE_HEADER: "Page-header",
        DocItemLabel.PAGE_FOOTER: "Page-footer",
        DocItemLabel.TABLE: "Table",
        DocItemLabel.PICTURE: "Picture",
        DocItemLabel.CHECKBOX_SELECTED: "Checkbox-Selected",
        DocItemLabel.CHECKBOX_UNSELECTED: "Checkbox-Unselected",
        DocItemLabel.FORM: "Form",
        DocItemLabel.KEY_VALUE_REGION: "Key-Value Region",
    }
    legacy_name = legacy_names.get(label)
    # All table values are non-empty strings, so None means "no entry".
    if legacy_name is None:
        return label.value
    return legacy_name
99
+
100
+
101
def docling_document_to_legacy(
    doc: DoclingDocument, fallback_filaname: str = "file"
) -> DsDocument:
    """Convert a DoclingDocument to the legacy format.

    Text-like items become ``BaseText`` entries in ``main_text``; tables and
    pictures are stored in ``tables`` / ``figures`` and referenced from
    ``main_text`` through ``Ref`` entries (``#/tables/<n>``, ``#/figures/<n>``).

    Args:
        doc: The document to convert.
        fallback_filaname: File name recorded in the legacy file info when
            ``doc.origin`` is None. The (misspelled) parameter name is kept
            unchanged so existing keyword callers keep working.

    Returns:
        The converted legacy document. Its ``name`` is always the empty string
        and ``equations``, ``footnotes``, ``page_headers`` and ``page_footers``
        are always empty lists, because nothing in this function fills them.
    """
    title = ""
    desc: DsDocumentDescription = DsDocumentDescription(logs=[])

    if doc.origin is not None:
        document_hash = _create_hash(str(doc.origin.binary_hash))
        filename = doc.origin.filename
    else:
        # Without an origin the hash is derived from a random UUID, so it
        # differs between calls.
        document_hash = _create_hash(str(uuid.uuid4()))
        filename = fallback_filaname

    # One hash per page, built from the document hash and (page_no - 1).
    page_hashes = [
        PageReference(
            hash=_create_hash(document_hash + ":" + str(p.page_no - 1)),
            page=p.page_no,
            model="default",
        )
        for p in doc.pages.values()
    ]

    file_info = DsFileInfoObject(
        filename=filename,
        document_hash=document_hash,
        num_pages=len(doc.pages),
        page_hashes=page_hashes,
    )

    main_text: list[Union[Ref, BaseText]] = []
    tables: list[DsSchemaTable] = []
    figures: list[Figure] = []
    equations: list[BaseCell] = []
    footnotes: list[BaseText] = []
    page_headers: list[BaseText] = []
    page_footers: list[BaseText] = []

    # TODO: populate page_headers page_footers from doc.furniture

    # First pass: collect the caption text attached to tables and pictures.
    embedded_captions = set()
    for item, _level in doc.iterate_items(doc.body):
        if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
            caption = item.caption_text(doc)
            if caption:
                embedded_captions.add(caption)

    # Second pass: emit the legacy structures in iteration order.
    for item, _level in doc.iterate_items():
        if not isinstance(item, DocItem):
            continue

        item_type = item.label

        if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
            if isinstance(item, ListItem) and item.marker:
                text = f"{item.marker} {item.text}"
            else:
                text = item.text

            # Can be empty.
            # NOTE(review): the span length is taken from item.text, so for
            # list items it does not cover the marker prefixed to `text`.
            prov = [
                Prov(
                    bbox=p.bbox.as_tuple(),
                    page=p.page_no,
                    span=[0, len(item.text)],
                )
                for p in item.prov
            ]
            main_text.append(
                BaseText(
                    text=text,
                    obj_type=doc_item_label_to_legacy_type(item.label),
                    name=doc_item_label_to_legacy_name(item.label),
                    prov=prov,
                )
            )

            # Skip captions if they are embedded in the actual floating object.
            # NOTE(review): this check runs after the append above, so it does
            # not remove the caption from main_text. Confirm whether dropping
            # embedded captions was intended before moving it.
            if item_type == DocItemLabel.CAPTION and text in embedded_captions:
                continue

        elif isinstance(item, TableItem) and item.data:
            num_rows = item.data.num_rows
            num_cols = item.data.num_cols

            main_text.append(
                Ref(
                    name=doc_item_label_to_legacy_name(item.label),
                    obj_type=doc_item_label_to_legacy_type(item.label),
                    ref=f"#/tables/{len(tables)}",
                ),
            )

            # Initialise empty table data grid (only empty cells)
            table_data = [
                [
                    DsTableCell(
                        text="",
                        # bbox=[0,0,0,0],
                        spans=[[i, j]],
                        obj_type="body",
                    )
                    for j in range(num_cols)
                ]
                for i in range(num_rows)
            ]

            # Overwrite cells in table data for which there is actual cell content.
            for cell in item.data.table_cells:
                # Cell type and covered ranges depend only on the cell, so they
                # are computed once rather than for every grid position.
                celltype = _legacy_cell_type(cell)
                row_range, col_range = _clamped_cell_ranges(cell, num_rows, num_cols)

                for i in row_range:
                    for j in col_range:
                        table_data[i][j] = GlmTableCell(
                            text=cell.text,
                            bbox=(
                                cell.bbox.as_tuple() if cell.bbox is not None else None
                            ),  # check if this is bottom-left
                            # Built fresh per position so that grid cells never
                            # share one spans list.
                            spans=[[r, c] for r in row_range for c in col_range],
                            obj_type=celltype,
                            col=j,
                            row=i,
                            row_header=cell.row_header,
                            row_section=cell.row_section,
                            col_header=cell.column_header,
                            # Unlike `spans`, these keep the unclamped offsets.
                            row_span=[
                                cell.start_row_offset_idx,
                                cell.end_row_offset_idx,
                            ],
                            col_span=[
                                cell.start_col_offset_idx,
                                cell.end_col_offset_idx,
                            ],
                        )

            # Compute the caption
            caption = item.caption_text(doc)

            tables.append(
                DsSchemaTable(
                    text=caption,
                    num_cols=num_cols,
                    num_rows=num_rows,
                    obj_type=doc_item_label_to_legacy_type(item.label),
                    data=table_data,
                    prov=[
                        Prov(
                            bbox=p.bbox.as_tuple(),
                            page=p.page_no,
                            span=[0, 0],
                        )
                        for p in item.prov
                    ],
                )
            )

        elif isinstance(item, PictureItem):
            main_text.append(
                Ref(
                    name=doc_item_label_to_legacy_name(item.label),
                    obj_type=doc_item_label_to_legacy_type(item.label),
                    ref=f"#/figures/{len(figures)}",
                ),
            )

            # Compute the caption
            caption = item.caption_text(doc)

            figures.append(
                Figure(
                    prov=[
                        Prov(
                            bbox=p.bbox.as_tuple(),
                            page=p.page_no,
                            span=[0, len(caption)],
                        )
                        for p in item.prov
                    ],
                    obj_type=doc_item_label_to_legacy_type(item.label),
                    text=caption,
                    # data=[[]],
                )
            )

    page_dimensions = [
        PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
        for p in doc.pages.values()
    ]

    legacy_doc: DsDocument = DsDocument(
        name=title,
        description=desc,
        file_info=file_info,
        main_text=main_text,
        equations=equations,
        footnotes=footnotes,
        page_headers=page_headers,
        page_footers=page_footers,
        tables=tables,
        figures=figures,
        page_dimensions=page_dimensions,
    )

    return legacy_doc


def _legacy_cell_type(cell: TableCell) -> str:
    """Return the legacy cell type; column header wins over row header/section."""
    if cell.column_header:
        return "col_header"
    if cell.row_header:
        return "row_header"
    if cell.row_section:
        return "row_section"
    return "body"


def _clamped_cell_ranges(
    cell: TableCell, num_rows: int, num_cols: int
) -> tuple[range, range]:
    """Return the row and column ranges a cell covers, capped at the grid size."""
    row_range = range(
        min(cell.start_row_offset_idx, num_rows),
        min(cell.end_row_offset_idx, num_rows),
    )
    col_range = range(
        min(cell.start_col_offset_idx, num_cols),
        min(cell.end_col_offset_idx, num_cols),
    )
    return row_range, col_range
343
+
344
+
345
+ # def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:
346
+ # """Convert a legacy document to DoclingDocument."""
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.8.0"
3
+ version = "2.9.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes