docling-core 2.0.1__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (55) hide show
  1. {docling_core-2.0.1 → docling_core-2.1.0}/PKG-INFO +2 -2
  2. {docling_core-2.0.1 → docling_core-2.1.0}/README.md +1 -1
  3. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/__init__.py +4 -1
  4. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/base.py +1 -1
  5. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/hierarchical_chunker.py +58 -6
  6. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/__init__.py +1 -1
  7. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/base.py +7 -0
  8. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/document.py +245 -111
  9. {docling_core-2.0.1 → docling_core-2.1.0}/pyproject.toml +1 -1
  10. {docling_core-2.0.1 → docling_core-2.1.0}/LICENSE +0 -0
  11. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/__init__.py +0 -0
  12. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/py.typed +0 -0
  13. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  14. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  15. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  16. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  17. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  18. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  19. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  20. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  21. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/__init__.py +0 -0
  22. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  23. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/mapping.py +0 -0
  24. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/meta.py +0 -0
  25. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/package.py +0 -0
  26. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/__init__.py +0 -0
  27. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/__init__.py +0 -0
  28. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/base.py +0 -0
  29. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/labels.py +0 -0
  30. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/gen/__init__.py +0 -0
  31. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/gen/generic.py +0 -0
  32. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  33. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/base.py +0 -0
  34. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  35. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  36. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  37. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/document.py +0 -0
  38. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  39. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/nlp/__init__.py +0 -0
  40. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/nlp/qa.py +0 -0
  41. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/nlp/qa_labels.py +0 -0
  42. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/__init__.py +0 -0
  43. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/attribute.py +0 -0
  44. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/base.py +0 -0
  45. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/predicate.py +0 -0
  46. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/record.py +0 -0
  47. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/statement.py +0 -0
  48. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/subject.py +0 -0
  49. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/__init__.py +0 -0
  50. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/alias.py +0 -0
  51. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/file.py +0 -0
  52. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/generate_docs.py +0 -0
  53. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/generate_jsonschema.py +0 -0
  54. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/validate.py +0 -0
  55. {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.0.1
3
+ Version: 2.1.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -95,7 +95,7 @@ poetry run pytest test
95
95
 
96
96
  Docling Core contains 3 top-level data types:
97
97
 
98
- - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
98
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
99
99
  The DoclingDocument type also models the metadata that may be attached to the converted document.
100
100
  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
101
101
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
@@ -59,7 +59,7 @@ poetry run pytest test
59
59
 
60
60
  Docling Core contains 3 top-level data types:
61
61
 
62
- - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
62
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
63
63
  The DoclingDocument type also models the metadata that may be attached to the converted document.
64
64
  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
65
65
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
@@ -6,4 +6,7 @@
6
6
  """Define the chunker types."""
7
7
 
8
8
  from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9
- from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
9
+ from docling_core.transforms.chunker.hierarchical_chunker import (
10
+ DocMeta,
11
+ HierarchicalChunker,
12
+ )
@@ -13,7 +13,7 @@ from docling_core.types.doc import DoclingDocument as DLDocument
13
13
 
14
14
 
15
15
  class BaseMeta(BaseModel):
16
- """Metadata base class."""
16
+ """Chunk metadata base class."""
17
17
 
18
18
  excluded_embed: ClassVar[list[str]] = []
19
19
  excluded_llm: ClassVar[list[str]] = []
@@ -8,15 +8,19 @@
8
8
  from __future__ import annotations
9
9
 
10
10
  import logging
11
- from typing import Any, ClassVar, Iterator, Optional
11
+ import re
12
+ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
12
13
 
13
14
  from pandas import DataFrame
14
- from pydantic import Field
15
+ from pydantic import Field, StringConstraints, field_validator
16
+ from typing_extensions import Annotated
15
17
 
18
+ from docling_core.search.package import VERSION_PATTERN
16
19
  from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
17
- from docling_core.types.doc import DoclingDocument as DLDocument
20
+ from docling_core.types import DoclingDocument as DLDocument
18
21
  from docling_core.types.doc.document import (
19
22
  DocItem,
23
+ DocumentOrigin,
20
24
  LevelNumber,
21
25
  ListItem,
22
26
  SectionHeaderItem,
@@ -25,16 +29,31 @@ from docling_core.types.doc.document import (
25
29
  )
26
30
  from docling_core.types.doc.labels import DocItemLabel
27
31
 
32
+ _VERSION: Final = "1.0.0"
33
+
34
+ _KEY_SCHEMA_NAME = "schema_name"
35
+ _KEY_VERSION = "version"
28
36
  _KEY_DOC_ITEMS = "doc_items"
29
37
  _KEY_HEADINGS = "headings"
30
38
  _KEY_CAPTIONS = "captions"
39
+ _KEY_ORIGIN = "origin"
31
40
 
32
41
  _logger = logging.getLogger(__name__)
33
42
 
34
43
 
35
44
  class DocMeta(BaseMeta):
36
- """Data model for Hierarchical Chunker metadata."""
45
+ """Data model for Hierarchical Chunker chunk metadata."""
37
46
 
47
+ schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
48
+ default="docling_core.transforms.chunker.DocMeta",
49
+ alias=_KEY_SCHEMA_NAME,
50
+ )
51
+ version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
52
+ Field(
53
+ default=_VERSION,
54
+ alias=_KEY_VERSION,
55
+ )
56
+ )
38
57
  doc_items: list[DocItem] = Field(
39
58
  alias=_KEY_DOC_ITEMS,
40
59
  min_length=1,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
49
68
  alias=_KEY_CAPTIONS,
50
69
  min_length=1,
51
70
  )
71
+ origin: Optional[DocumentOrigin] = Field(
72
+ default=None,
73
+ alias=_KEY_ORIGIN,
74
+ )
52
75
 
53
- excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
54
- excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
76
+ excluded_embed: ClassVar[list[str]] = [
77
+ _KEY_SCHEMA_NAME,
78
+ _KEY_VERSION,
79
+ _KEY_DOC_ITEMS,
80
+ _KEY_ORIGIN,
81
+ ]
82
+ excluded_llm: ClassVar[list[str]] = [
83
+ _KEY_SCHEMA_NAME,
84
+ _KEY_VERSION,
85
+ _KEY_DOC_ITEMS,
86
+ _KEY_ORIGIN,
87
+ ]
88
+
89
@field_validator(_KEY_VERSION)
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this meta item version is compatible with current version.

    A version is accepted when it matches ``VERSION_PATTERN``, has the same
    major component as ``_VERSION`` and a minor component that is not newer
    than the one of ``_VERSION``.

    :param v: The version string found on the incoming metadata.
    :returns: ``_VERSION``; accepted metadata is normalised to the current
        schema version rather than keeping the incoming value.
    :raises ValueError: If ``v`` does not match the pattern or is incompatible.
    """
    current_match = re.match(VERSION_PATTERN, _VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        or doc_match["major"] != current_match["major"]
        # Compare numerically: as strings "10" would sort before "9".
        # Assumes the "minor" group only captures decimal digits -- confirm
        # against VERSION_PATTERN. A non-numeric capture makes int() raise
        # ValueError, which still rejects the value.
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
    return _VERSION
55
104
 
56
105
 
57
106
  class DocChunk(BaseChunk):
@@ -129,6 +178,7 @@ class HierarchicalChunker(BaseChunker):
129
178
  for k in sorted(heading_by_level)
130
179
  ]
131
180
  or None,
181
+ origin=dl_doc.origin,
132
182
  ),
133
183
  )
134
184
  list_items = [] # reset
@@ -171,6 +221,7 @@ class HierarchicalChunker(BaseChunker):
171
221
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
172
222
  or None,
173
223
  captions=captions,
224
+ origin=dl_doc.origin,
174
225
  ),
175
226
  )
176
227
  yield c
@@ -182,5 +233,6 @@ class HierarchicalChunker(BaseChunker):
182
233
  doc_items=list_items,
183
234
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
184
235
  or None,
236
+ origin=dl_doc.origin,
185
237
  ),
186
238
  )
@@ -5,7 +5,7 @@
5
5
 
6
6
  """Package for models defined by the Document type."""
7
7
 
8
- from .base import BoundingBox, CoordOrigin, Size
8
+ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
10
  DocItem,
11
11
  DoclingDocument,
@@ -7,6 +7,13 @@ from typing import Tuple
7
7
  from pydantic import BaseModel
8
8
 
9
9
 
10
class ImageRefMode(str, Enum):
    """How an image is referenced when a document is exported."""

    # A placeholder string is written in place of the image.
    PLACEHOLDER = "placeholder"

    # The image is referenced inline through its URI.
    EMBEDDED = "embedded"
15
+
16
+
10
17
  class CoordOrigin(str, Enum):
11
18
  """CoordOrigin."""
12
19
 
@@ -3,6 +3,7 @@
3
3
  import base64
4
4
  import mimetypes
5
5
  import re
6
+ import sys
6
7
  import typing
7
8
  from io import BytesIO
8
9
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -25,6 +26,7 @@ from typing_extensions import Annotated, Self
25
26
  from docling_core.search.package import VERSION_PATTERN
26
27
  from docling_core.types.base import _JSON_POINTER_REGEX
27
28
  from docling_core.types.doc import BoundingBox, Size
29
+ from docling_core.types.doc.base import ImageRefMode
28
30
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
29
31
  from docling_core.types.legacy_doc.tokens import DocumentToken
30
32
 
@@ -215,6 +217,7 @@ class DocumentOrigin(BaseModel):
215
217
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
216
218
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
217
219
  "text/asciidoc",
220
+ "text/markdown",
218
221
  ]
219
222
 
220
223
  @field_validator("binary_hash", mode="before")
@@ -1108,12 +1111,14 @@ class DoclingDocument(BaseModel):
1108
1111
 
1109
1112
  def export_to_markdown( # noqa: C901
1110
1113
  self,
1111
- delim: str = "\n\n",
1114
+ delim: str = "\n",
1112
1115
  from_element: int = 0,
1113
- to_element: Optional[int] = None,
1116
+ to_element: int = sys.maxsize,
1114
1117
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1115
1118
  strict_text: bool = False,
1116
1119
  image_placeholder: str = "<!-- image -->",
1120
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1121
+ indent: int = 4,
1117
1122
  ) -> str:
1118
1123
  r"""Serialize to Markdown.
1119
1124
 
@@ -1143,136 +1148,150 @@ class DoclingDocument(BaseModel):
1143
1148
  :param strict_text: bool: (Default value = False)
1144
1149
  :param image_placeholder str: (Default value = "<!-- image -->")
1145
1150
  the placeholder to include to position images in the markdown.
1151
+ :param indent: int (default=4): indent of the nested lists
1146
1152
  :returns: The exported Markdown representation.
1147
1153
  :rtype: str
1148
1154
  """
1149
- has_title = False
1150
- prev_text = ""
1151
- md_texts: list[str] = []
1155
+ mdtexts: list[str] = []
1156
+ list_nesting_level = 0 # Track the current list nesting level
1157
+ previous_level = 0 # Track the previous item's level
1158
+ in_list = False # Track if we're currently processing list items
1152
1159
 
1153
- # collect all captions embedded in table and figure objects
1154
- # to avoid repeating them
1155
- embedded_captions = set()
1156
- skip_count = 0
1157
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1158
- if skip_count < from_element:
1159
- skip_count += 1
1160
- continue # skip as many items as you want
1161
-
1162
- if to_element and ix >= to_element:
1163
- break
1160
+ for ix, (item, level) in enumerate(
1161
+ self.iterate_items(self.body, with_groups=True)
1162
+ ):
1163
+ # If we've moved to a lower level, we're exiting one or more groups
1164
+ if level < previous_level:
1165
+ # Calculate how many levels we've exited
1166
+ level_difference = previous_level - level
1167
+ # Decrement list_nesting_level for each list group we've exited
1168
+ list_nesting_level = max(0, list_nesting_level - level_difference)
1164
1169
 
1165
- if (
1166
- isinstance(item, (TableItem, PictureItem))
1167
- and len(item.captions) > 0
1168
- and item.label in labels
1169
- ):
1170
- caption = item.caption_text(self)
1171
- if caption:
1172
- embedded_captions.add(caption)
1170
+ previous_level = level # Update previous_level for next iteration
1173
1171
 
1174
- skip_count = 0
1175
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1176
- if skip_count < from_element:
1177
- skip_count += 1
1172
+ if ix < from_element and to_element <= ix:
1178
1173
  continue # skip as many items as you want
1179
1174
 
1180
- if to_element and ix >= to_element:
1181
- break
1182
-
1183
- markdown_text = ""
1184
-
1185
- if isinstance(item, DocItem):
1186
- item_type = item.label
1187
-
1188
- if isinstance(item, TextItem) and item_type in labels:
1189
- text = item.text
1190
-
1191
- # skip captions of they are embedded in the actual
1192
- # floating object
1193
- if item_type == DocItemLabel.CAPTION and text in embedded_captions:
1194
- continue
1195
-
1196
- # ignore repeated text
1197
- if prev_text == text or text is None:
1198
- continue
1199
- else:
1200
- prev_text = text
1201
-
1202
- # first title match
1203
- if item_type == DocItemLabel.TITLE and not has_title:
1204
- if strict_text:
1205
- markdown_text = f"{text}"
1206
- else:
1207
- markdown_text = f"# {text}"
1208
- has_title = True
1209
-
1210
- # secondary titles
1211
- elif item_type in {
1212
- DocItemLabel.TITLE,
1213
- DocItemLabel.SECTION_HEADER,
1214
- } or (has_title and item_type == DocItemLabel.TITLE):
1215
- if strict_text:
1216
- markdown_text = f"{text}"
1217
- else:
1218
- markdown_text = f"## {text}"
1219
-
1220
- # secondary titles
1221
- elif isinstance(item, ListItem):
1222
- if item.enumerated:
1223
- marker = item.marker
1224
- else:
1225
- marker = "-"
1226
-
1227
- markdown_text = f"{marker} {text}"
1228
-
1229
- # normal text
1230
- else:
1231
- markdown_text = text
1232
-
1233
- elif isinstance(item, TableItem) and item.data and item_type in labels:
1234
- parts = []
1235
-
1236
- # Compute the caption
1237
- if caption := item.caption_text(self):
1238
- parts.append(caption)
1239
- parts.append("\n")
1175
+ # Handle newlines between different types of content
1176
+ if (
1177
+ len(mdtexts) > 0
1178
+ and not isinstance(item, (ListItem, GroupItem))
1179
+ and in_list
1180
+ ):
1181
+ mdtexts[-1] += "\n"
1182
+ in_list = False
1240
1183
 
1241
- # Rendered the item
1242
- if not strict_text:
1243
- md_table = item.export_to_markdown()
1244
- if md_table:
1245
- parts.append(item.export_to_markdown())
1184
+ if isinstance(item, GroupItem) and item.label in [
1185
+ GroupLabel.LIST,
1186
+ GroupLabel.ORDERED_LIST,
1187
+ ]:
1246
1188
 
1247
- # Combine parts
1248
- markdown_text = "\n".join(parts)
1189
+ if list_nesting_level == 0: # Check if we're on the top level.
1190
+ # In that case a new list starts directly after another list.
1191
+ mdtexts.append("\n") # Add a blank line
1249
1192
 
1250
- elif isinstance(item, PictureItem) and item_type in labels:
1251
- parts = []
1193
+ # Increment list nesting level when entering a new list
1194
+ list_nesting_level += 1
1195
+ in_list = True
1196
+ continue
1252
1197
 
1253
- # Compute the caption
1254
- if caption := item.caption_text(self):
1255
- parts.append(caption)
1256
- parts.append("\n")
1198
+ elif isinstance(item, GroupItem):
1199
+ continue
1257
1200
 
1258
- # Rendered the item
1259
- if not strict_text:
1260
- parts.append(f"{image_placeholder}")
1201
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
1202
+ in_list = False
1203
+ marker = "" if strict_text else "#"
1204
+ text = f"{marker} {item.text}\n"
1205
+ mdtexts.append(text.strip())
1206
+
1207
+ elif (
1208
+ isinstance(item, TextItem)
1209
+ and item.label in [DocItemLabel.SECTION_HEADER]
1210
+ ) or isinstance(item, SectionHeaderItem):
1211
+ in_list = False
1212
+ marker = ""
1213
+ if not strict_text:
1214
+ marker = "#" * level
1215
+ if len(marker) < 2:
1216
+ marker = "##"
1217
+ text = f"{marker} {item.text}\n"
1218
+ mdtexts.append(text.strip() + "\n")
1219
+
1220
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
1221
+ in_list = False
1222
+ text = f"```\n{item.text}\n```\n"
1223
+ mdtexts.append(text)
1224
+
1225
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
1226
+ # captions are printed in picture and table ... skipping for now
1227
+ continue
1261
1228
 
1262
- # Combine parts
1263
- markdown_text = "\n".join(parts)
1229
+ elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
1230
+ in_list = True
1231
+ # Calculate indent based on list_nesting_level
1232
+ # -1 because level 1 needs no indent
1233
+ list_indent = " " * (indent * (list_nesting_level - 1))
1234
+
1235
+ marker = ""
1236
+ if strict_text:
1237
+ marker = ""
1238
+ elif item.enumerated:
1239
+ marker = item.marker
1240
+ else:
1241
+ marker = "-" # Markdown needs only dash as item marker.
1242
+
1243
+ text = f"{list_indent}{marker} {item.text}"
1244
+ mdtexts.append(text)
1245
+
1246
+ elif isinstance(item, TextItem) and item.label in labels:
1247
+ in_list = False
1248
+ if len(item.text):
1249
+ text = f"{item.text}\n"
1250
+ mdtexts.append(text)
1251
+
1252
+ elif isinstance(item, TableItem) and not strict_text:
1253
+ in_list = False
1254
+ mdtexts.append(item.caption_text(self))
1255
+ md_table = item.export_to_markdown()
1256
+ mdtexts.append("\n" + md_table + "\n")
1257
+
1258
+ elif isinstance(item, PictureItem) and not strict_text:
1259
+ in_list = False
1260
+ mdtexts.append(item.caption_text(self))
1261
+
1262
+ if image_mode == ImageRefMode.PLACEHOLDER:
1263
+ mdtexts.append("\n" + image_placeholder + "\n")
1264
+ elif image_mode == ImageRefMode.EMBEDDED and isinstance(
1265
+ item.image, ImageRef
1266
+ ):
1267
+ text = f"![Local Image]({item.image.uri})\n"
1268
+ mdtexts.append(text)
1269
+ elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
1270
+ item.image, ImageRef
1271
+ ):
1272
+ text = (
1273
+ "<!-- 🖼️❌ Image not available. "
1274
+ "Please use `PdfPipelineOptions(generate_picture_images=True)`"
1275
+ " --> "
1276
+ )
1277
+ mdtexts.append(text)
1264
1278
 
1265
- if markdown_text:
1266
- md_texts.append(markdown_text)
1279
+ elif isinstance(item, DocItem) and item.label in labels:
1280
+ in_list = False
1281
+ text = "<missing-text>"
1282
+ mdtexts.append(text)
1267
1283
 
1268
- result = delim.join(md_texts)
1269
- return result
1284
+ mdtext = (delim.join(mdtexts)).strip()
1285
+ mdtext = re.sub(
1286
+ r"\n\n\n+", "\n\n", mdtext
1287
+ ) # remove cases of double or more empty lines.
1288
+ return mdtext
1270
1289
 
1271
1290
  def export_to_text( # noqa: C901
1272
1291
  self,
1273
1292
  delim: str = "\n\n",
1274
1293
  from_element: int = 0,
1275
- to_element: Optional[int] = None,
1294
+ to_element: int = 1000000,
1276
1295
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1277
1296
  ) -> str:
1278
1297
  """export_to_text."""
@@ -1399,6 +1418,121 @@ class DoclingDocument(BaseModel):
1399
1418
 
1400
1419
  return doctags
1401
1420
 
1421
def _export_to_indented_text(
    self, indent: str = " ", max_text_len: int = -1, explicit_tables: bool = False
) -> str:
    """Export the document to indented text to expose hierarchy.

    Every rendered item becomes one line of the form
    ``item-<index> at level <level>: <label>: <text>``, prefixed with
    ``indent`` repeated once per level. Captions of tables and pictures are
    listed one level below their owner; stand-alone caption items are
    skipped.

    :param indent: String repeated once per nesting level.
    :param max_text_len: Length budget for item texts; longer texts are
        shortened to their beginning and end around " ... ". Any negative
        value disables shortening.
    :param explicit_tables: If True, each table is followed by a tabulated
        dump of its cells (first 10 columns, cell text shortened to 16).
    :returns: The lines joined with newlines.
    """
    middle = " ... "
    result: list[str] = []

    def get_text(text: str, max_text_len: int) -> str:
        """Shorten ``text`` to its head and tail around ``middle``."""
        if max_text_len < 0 or len(text) < max_text_len + len(middle):
            return text
        # Clamp the head length at zero: for budgets smaller than
        # len(middle) the unclamped value is negative, and a negative
        # slice bound would keep almost the entire text.
        head = max(0, (max_text_len - len(middle)) // 2)
        tail = max_text_len - head
        # Slice from an explicit start so that tail == 0 yields "" rather
        # than the whole text (text[-0:] is text[0:]).
        return text[:head] + middle + text[len(text) - tail :]

    def text_line(i: int, item, level: int) -> str:
        """Render the standard one-line description of a text item."""
        text = get_text(text=item.text, max_text_len=max_text_len)
        return indent * level + f"item-{i} at level {level}: {item.label}: {text}"

    def caption_lines(i: int, item, level: int) -> list[str]:
        """Render the captions of ``item`` one level below it."""
        lines = []
        for ref in item.captions:
            caption = ref.resolve(self)
            lines.append(
                indent * (level + 1)
                + f"item-{i} at level {level + 1}: {caption.label}: "
                + f"{caption.text}"
            )
        return lines

    for i, (item, level) in enumerate(self.iterate_items(with_groups=True)):
        is_text = isinstance(item, TextItem)

        if isinstance(item, GroupItem):
            result.append(
                indent * level
                + f"item-{i} at level {level}: {item.label}: group {item.name}"
            )

        elif isinstance(item, SectionHeaderItem) or (
            is_text and item.label in [DocItemLabel.TITLE, DocItemLabel.CODE]
        ):
            # Titles, section headers and code are always listed, even if
            # they carry the caption label checked below.
            result.append(text_line(i, item, level))

        elif is_text and item.label in [DocItemLabel.CAPTION]:
            # captions are printed in picture and table ... skipping for now
            continue

        elif is_text or (
            isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]
        ):
            result.append(text_line(i, item, level))

        elif isinstance(item, TableItem):
            result.append(
                indent * level
                + f"item-{i} at level {level}: {item.label} with "
                + f"[{item.data.num_rows}x{item.data.num_cols}]"
            )
            result.extend(caption_lines(i, item, level))

            if explicit_tables:
                # Only the first 10 columns of each row are dumped. The
                # comprehension variables stay local, so the item index
                # ``i`` of the outer loop is no longer overwritten.
                grid: list[list[str]] = [
                    [
                        get_text(text=cell.text, max_text_len=16)
                        for j, cell in enumerate(row)
                        if j < 10
                    ]
                    for row in item.data.grid
                ]
                result.append("\n" + tabulate(grid) + "\n")

        elif isinstance(item, PictureItem):
            result.append(indent * level + f"item-{i} at level {level}: {item.label}")
            result.extend(caption_lines(i, item, level))

        elif isinstance(item, DocItem):
            # NOTE(review): indented one level deeper than the level that
            # is printed, unlike every other branch -- kept as is; confirm
            # whether this is intended.
            result.append(
                indent * (level + 1)
                + f"item-{i} at level {level}: {item.label}: ignored"
            )

    return "\n".join(result)
1535
+
1402
1536
  def add_page(
1403
1537
  self, page_no: int, size: Size, image: Optional[ImageRef] = None
1404
1538
  ) -> PageItem:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.0.1"
3
+ version = "2.1.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes