docling-core 2.0.1__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic; see the registry's advisory details for more information.

Files changed (55)
  1. {docling_core-2.0.1 → docling_core-2.2.0}/PKG-INFO +2 -2
  2. {docling_core-2.0.1 → docling_core-2.2.0}/README.md +1 -1
  3. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/chunker/__init__.py +4 -1
  4. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/chunker/base.py +1 -1
  5. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/chunker/hierarchical_chunker.py +58 -6
  6. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/__init__.py +1 -1
  7. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/base.py +7 -0
  8. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/document.py +252 -112
  9. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/file.py +17 -4
  10. {docling_core-2.0.1 → docling_core-2.2.0}/pyproject.toml +1 -1
  11. {docling_core-2.0.1 → docling_core-2.2.0}/LICENSE +0 -0
  12. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/__init__.py +0 -0
  13. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/py.typed +0 -0
  14. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  15. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  16. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  17. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  18. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  19. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  20. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  21. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  22. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/__init__.py +0 -0
  23. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  24. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/mapping.py +0 -0
  25. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/meta.py +0 -0
  26. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/package.py +0 -0
  27. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/__init__.py +0 -0
  28. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/base.py +0 -0
  30. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/labels.py +0 -0
  31. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/gen/__init__.py +0 -0
  32. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/gen/generic.py +0 -0
  33. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  34. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/base.py +0 -0
  35. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  36. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  37. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  38. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/document.py +0 -0
  39. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  40. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/nlp/__init__.py +0 -0
  41. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/nlp/qa.py +0 -0
  42. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/nlp/qa_labels.py +0 -0
  43. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/__init__.py +0 -0
  44. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/attribute.py +0 -0
  45. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/base.py +0 -0
  46. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/predicate.py +0 -0
  47. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/record.py +0 -0
  48. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/statement.py +0 -0
  49. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/subject.py +0 -0
  50. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/__init__.py +0 -0
  51. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/alias.py +0 -0
  52. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/generate_docs.py +0 -0
  53. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/generate_jsonschema.py +0 -0
  54. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/validate.py +0 -0
  55. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.0.1
3
+ Version: 2.2.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -95,7 +95,7 @@ poetry run pytest test
95
95
 
96
96
  Docling Core contains 3 top-level data types:
97
97
 
98
- - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
98
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
99
99
  The DoclingDocument type also models the metadata that may be attached to the converted document.
100
100
  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
101
101
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
@@ -59,7 +59,7 @@ poetry run pytest test
59
59
 
60
60
  Docling Core contains 3 top-level data types:
61
61
 
62
- - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
62
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
63
63
  The DoclingDocument type also models the metadata that may be attached to the converted document.
64
64
  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
65
65
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
@@ -6,4 +6,7 @@
6
6
  """Define the chunker types."""
7
7
 
8
8
  from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9
- from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
9
+ from docling_core.transforms.chunker.hierarchical_chunker import (
10
+ DocMeta,
11
+ HierarchicalChunker,
12
+ )
@@ -13,7 +13,7 @@ from docling_core.types.doc import DoclingDocument as DLDocument
13
13
 
14
14
 
15
15
  class BaseMeta(BaseModel):
16
- """Metadata base class."""
16
+ """Chunk metadata base class."""
17
17
 
18
18
  excluded_embed: ClassVar[list[str]] = []
19
19
  excluded_llm: ClassVar[list[str]] = []
@@ -8,15 +8,19 @@
8
8
  from __future__ import annotations
9
9
 
10
10
  import logging
11
- from typing import Any, ClassVar, Iterator, Optional
11
+ import re
12
+ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
12
13
 
13
14
  from pandas import DataFrame
14
- from pydantic import Field
15
+ from pydantic import Field, StringConstraints, field_validator
16
+ from typing_extensions import Annotated
15
17
 
18
+ from docling_core.search.package import VERSION_PATTERN
16
19
  from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
17
- from docling_core.types.doc import DoclingDocument as DLDocument
20
+ from docling_core.types import DoclingDocument as DLDocument
18
21
  from docling_core.types.doc.document import (
19
22
  DocItem,
23
+ DocumentOrigin,
20
24
  LevelNumber,
21
25
  ListItem,
22
26
  SectionHeaderItem,
@@ -25,16 +29,31 @@ from docling_core.types.doc.document import (
25
29
  )
26
30
  from docling_core.types.doc.labels import DocItemLabel
27
31
 
32
+ _VERSION: Final = "1.0.0"
33
+
34
+ _KEY_SCHEMA_NAME = "schema_name"
35
+ _KEY_VERSION = "version"
28
36
  _KEY_DOC_ITEMS = "doc_items"
29
37
  _KEY_HEADINGS = "headings"
30
38
  _KEY_CAPTIONS = "captions"
39
+ _KEY_ORIGIN = "origin"
31
40
 
32
41
  _logger = logging.getLogger(__name__)
33
42
 
34
43
 
35
44
  class DocMeta(BaseMeta):
36
- """Data model for Hierarchical Chunker metadata."""
45
+ """Data model for Hierarchical Chunker chunk metadata."""
37
46
 
47
+ schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
48
+ default="docling_core.transforms.chunker.DocMeta",
49
+ alias=_KEY_SCHEMA_NAME,
50
+ )
51
+ version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
52
+ Field(
53
+ default=_VERSION,
54
+ alias=_KEY_VERSION,
55
+ )
56
+ )
38
57
  doc_items: list[DocItem] = Field(
39
58
  alias=_KEY_DOC_ITEMS,
40
59
  min_length=1,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
49
68
  alias=_KEY_CAPTIONS,
50
69
  min_length=1,
51
70
  )
71
+ origin: Optional[DocumentOrigin] = Field(
72
+ default=None,
73
+ alias=_KEY_ORIGIN,
74
+ )
52
75
 
53
- excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
54
- excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
76
+ excluded_embed: ClassVar[list[str]] = [
77
+ _KEY_SCHEMA_NAME,
78
+ _KEY_VERSION,
79
+ _KEY_DOC_ITEMS,
80
+ _KEY_ORIGIN,
81
+ ]
82
+ excluded_llm: ClassVar[list[str]] = [
83
+ _KEY_SCHEMA_NAME,
84
+ _KEY_VERSION,
85
+ _KEY_DOC_ITEMS,
86
+ _KEY_ORIGIN,
87
+ ]
88
+
89
+ @field_validator(_KEY_VERSION)
90
+ @classmethod
91
+ def check_version_is_compatible(cls, v: str) -> str:
92
+ """Check if this meta item version is compatible with current version."""
93
+ current_match = re.match(VERSION_PATTERN, _VERSION)
94
+ doc_match = re.match(VERSION_PATTERN, v)
95
+ if (
96
+ doc_match is None
97
+ or current_match is None
98
+ or doc_match["major"] != current_match["major"]
99
+ or doc_match["minor"] > current_match["minor"]
100
+ ):
101
+ raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
102
+ else:
103
+ return _VERSION
55
104
 
56
105
 
57
106
  class DocChunk(BaseChunk):
@@ -129,6 +178,7 @@ class HierarchicalChunker(BaseChunker):
129
178
  for k in sorted(heading_by_level)
130
179
  ]
131
180
  or None,
181
+ origin=dl_doc.origin,
132
182
  ),
133
183
  )
134
184
  list_items = [] # reset
@@ -171,6 +221,7 @@ class HierarchicalChunker(BaseChunker):
171
221
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
172
222
  or None,
173
223
  captions=captions,
224
+ origin=dl_doc.origin,
174
225
  ),
175
226
  )
176
227
  yield c
@@ -182,5 +233,6 @@ class HierarchicalChunker(BaseChunker):
182
233
  doc_items=list_items,
183
234
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
184
235
  or None,
236
+ origin=dl_doc.origin,
185
237
  ),
186
238
  )
@@ -5,7 +5,7 @@
5
5
 
6
6
  """Package for models defined by the Document type."""
7
7
 
8
- from .base import BoundingBox, CoordOrigin, Size
8
+ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
10
  DocItem,
11
11
  DoclingDocument,
@@ -7,6 +7,13 @@ from typing import Tuple
7
7
  from pydantic import BaseModel
8
8
 
9
9
 
10
+ class ImageRefMode(str, Enum):
11
+ """ImageRefMode."""
12
+
13
+ PLACEHOLDER = "placeholder"
14
+ EMBEDDED = "embedded"
15
+
16
+
10
17
  class CoordOrigin(str, Enum):
11
18
  """CoordOrigin."""
12
19
 
@@ -3,6 +3,7 @@
3
3
  import base64
4
4
  import mimetypes
5
5
  import re
6
+ import sys
6
7
  import typing
7
8
  from io import BytesIO
8
9
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -25,6 +26,7 @@ from typing_extensions import Annotated, Self
25
26
  from docling_core.search.package import VERSION_PATTERN
26
27
  from docling_core.types.base import _JSON_POINTER_REGEX
27
28
  from docling_core.types.doc import BoundingBox, Size
29
+ from docling_core.types.doc.base import ImageRefMode
28
30
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
29
31
  from docling_core.types.legacy_doc.tokens import DocumentToken
30
32
 
@@ -215,6 +217,7 @@ class DocumentOrigin(BaseModel):
215
217
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
216
218
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
217
219
  "text/asciidoc",
220
+ "text/markdown",
218
221
  ]
219
222
 
220
223
  @field_validator("binary_hash", mode="before")
@@ -588,7 +591,13 @@ class TableItem(FloatingItem):
588
591
  for row in self.data.grid:
589
592
  tmp = []
590
593
  for col in row:
591
- tmp.append(col.text)
594
+
595
+ # make sure that md tables are not broken
596
+ # due to newline chars in the text
597
+ text = col.text
598
+ text = text.replace("\n", " ")
599
+ tmp.append(text)
600
+
592
601
  table.append(tmp)
593
602
 
594
603
  md_table = ""
@@ -1108,12 +1117,14 @@ class DoclingDocument(BaseModel):
1108
1117
 
1109
1118
  def export_to_markdown( # noqa: C901
1110
1119
  self,
1111
- delim: str = "\n\n",
1120
+ delim: str = "\n",
1112
1121
  from_element: int = 0,
1113
- to_element: Optional[int] = None,
1122
+ to_element: int = sys.maxsize,
1114
1123
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1115
1124
  strict_text: bool = False,
1116
1125
  image_placeholder: str = "<!-- image -->",
1126
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1127
+ indent: int = 4,
1117
1128
  ) -> str:
1118
1129
  r"""Serialize to Markdown.
1119
1130
 
@@ -1143,136 +1154,150 @@ class DoclingDocument(BaseModel):
1143
1154
  :param strict_text: bool: (Default value = False)
1144
1155
  :param image_placeholder str: (Default value = "<!-- image -->")
1145
1156
  the placeholder to include to position images in the markdown.
1157
+ :param indent: int (default=4): indent of the nested lists
1146
1158
  :returns: The exported Markdown representation.
1147
1159
  :rtype: str
1148
1160
  """
1149
- has_title = False
1150
- prev_text = ""
1151
- md_texts: list[str] = []
1161
+ mdtexts: list[str] = []
1162
+ list_nesting_level = 0 # Track the current list nesting level
1163
+ previous_level = 0 # Track the previous item's level
1164
+ in_list = False # Track if we're currently processing list items
1152
1165
 
1153
- # collect all captions embedded in table and figure objects
1154
- # to avoid repeating them
1155
- embedded_captions = set()
1156
- skip_count = 0
1157
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1158
- if skip_count < from_element:
1159
- skip_count += 1
1160
- continue # skip as many items as you want
1161
-
1162
- if to_element and ix >= to_element:
1163
- break
1166
+ for ix, (item, level) in enumerate(
1167
+ self.iterate_items(self.body, with_groups=True)
1168
+ ):
1169
+ # If we've moved to a lower level, we're exiting one or more groups
1170
+ if level < previous_level:
1171
+ # Calculate how many levels we've exited
1172
+ level_difference = previous_level - level
1173
+ # Decrement list_nesting_level for each list group we've exited
1174
+ list_nesting_level = max(0, list_nesting_level - level_difference)
1164
1175
 
1165
- if (
1166
- isinstance(item, (TableItem, PictureItem))
1167
- and len(item.captions) > 0
1168
- and item.label in labels
1169
- ):
1170
- caption = item.caption_text(self)
1171
- if caption:
1172
- embedded_captions.add(caption)
1176
+ previous_level = level # Update previous_level for next iteration
1173
1177
 
1174
- skip_count = 0
1175
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1176
- if skip_count < from_element:
1177
- skip_count += 1
1178
+ if ix < from_element and to_element <= ix:
1178
1179
  continue # skip as many items as you want
1179
1180
 
1180
- if to_element and ix >= to_element:
1181
- break
1182
-
1183
- markdown_text = ""
1184
-
1185
- if isinstance(item, DocItem):
1186
- item_type = item.label
1187
-
1188
- if isinstance(item, TextItem) and item_type in labels:
1189
- text = item.text
1190
-
1191
- # skip captions of they are embedded in the actual
1192
- # floating object
1193
- if item_type == DocItemLabel.CAPTION and text in embedded_captions:
1194
- continue
1195
-
1196
- # ignore repeated text
1197
- if prev_text == text or text is None:
1198
- continue
1199
- else:
1200
- prev_text = text
1201
-
1202
- # first title match
1203
- if item_type == DocItemLabel.TITLE and not has_title:
1204
- if strict_text:
1205
- markdown_text = f"{text}"
1206
- else:
1207
- markdown_text = f"# {text}"
1208
- has_title = True
1209
-
1210
- # secondary titles
1211
- elif item_type in {
1212
- DocItemLabel.TITLE,
1213
- DocItemLabel.SECTION_HEADER,
1214
- } or (has_title and item_type == DocItemLabel.TITLE):
1215
- if strict_text:
1216
- markdown_text = f"{text}"
1217
- else:
1218
- markdown_text = f"## {text}"
1219
-
1220
- # secondary titles
1221
- elif isinstance(item, ListItem):
1222
- if item.enumerated:
1223
- marker = item.marker
1224
- else:
1225
- marker = "-"
1226
-
1227
- markdown_text = f"{marker} {text}"
1228
-
1229
- # normal text
1230
- else:
1231
- markdown_text = text
1232
-
1233
- elif isinstance(item, TableItem) and item.data and item_type in labels:
1234
- parts = []
1235
-
1236
- # Compute the caption
1237
- if caption := item.caption_text(self):
1238
- parts.append(caption)
1239
- parts.append("\n")
1181
+ # Handle newlines between different types of content
1182
+ if (
1183
+ len(mdtexts) > 0
1184
+ and not isinstance(item, (ListItem, GroupItem))
1185
+ and in_list
1186
+ ):
1187
+ mdtexts[-1] += "\n"
1188
+ in_list = False
1240
1189
 
1241
- # Rendered the item
1242
- if not strict_text:
1243
- md_table = item.export_to_markdown()
1244
- if md_table:
1245
- parts.append(item.export_to_markdown())
1190
+ if isinstance(item, GroupItem) and item.label in [
1191
+ GroupLabel.LIST,
1192
+ GroupLabel.ORDERED_LIST,
1193
+ ]:
1246
1194
 
1247
- # Combine parts
1248
- markdown_text = "\n".join(parts)
1195
+ if list_nesting_level == 0: # Check if we're on the top level.
1196
+ # In that case a new list starts directly after another list.
1197
+ mdtexts.append("\n") # Add a blank line
1249
1198
 
1250
- elif isinstance(item, PictureItem) and item_type in labels:
1251
- parts = []
1199
+ # Increment list nesting level when entering a new list
1200
+ list_nesting_level += 1
1201
+ in_list = True
1202
+ continue
1252
1203
 
1253
- # Compute the caption
1254
- if caption := item.caption_text(self):
1255
- parts.append(caption)
1256
- parts.append("\n")
1204
+ elif isinstance(item, GroupItem):
1205
+ continue
1257
1206
 
1258
- # Rendered the item
1259
- if not strict_text:
1260
- parts.append(f"{image_placeholder}")
1207
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
1208
+ in_list = False
1209
+ marker = "" if strict_text else "#"
1210
+ text = f"{marker} {item.text}\n"
1211
+ mdtexts.append(text.strip())
1212
+
1213
+ elif (
1214
+ isinstance(item, TextItem)
1215
+ and item.label in [DocItemLabel.SECTION_HEADER]
1216
+ ) or isinstance(item, SectionHeaderItem):
1217
+ in_list = False
1218
+ marker = ""
1219
+ if not strict_text:
1220
+ marker = "#" * level
1221
+ if len(marker) < 2:
1222
+ marker = "##"
1223
+ text = f"{marker} {item.text}\n"
1224
+ mdtexts.append(text.strip() + "\n")
1225
+
1226
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
1227
+ in_list = False
1228
+ text = f"```\n{item.text}\n```\n"
1229
+ mdtexts.append(text)
1230
+
1231
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
1232
+ # captions are printed in picture and table ... skipping for now
1233
+ continue
1261
1234
 
1262
- # Combine parts
1263
- markdown_text = "\n".join(parts)
1235
+ elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
1236
+ in_list = True
1237
+ # Calculate indent based on list_nesting_level
1238
+ # -1 because level 1 needs no indent
1239
+ list_indent = " " * (indent * (list_nesting_level - 1))
1240
+
1241
+ marker = ""
1242
+ if strict_text:
1243
+ marker = ""
1244
+ elif item.enumerated:
1245
+ marker = item.marker
1246
+ else:
1247
+ marker = "-" # Markdown needs only dash as item marker.
1248
+
1249
+ text = f"{list_indent}{marker} {item.text}"
1250
+ mdtexts.append(text)
1251
+
1252
+ elif isinstance(item, TextItem) and item.label in labels:
1253
+ in_list = False
1254
+ if len(item.text):
1255
+ text = f"{item.text}\n"
1256
+ mdtexts.append(text)
1257
+
1258
+ elif isinstance(item, TableItem) and not strict_text:
1259
+ in_list = False
1260
+ mdtexts.append(item.caption_text(self))
1261
+ md_table = item.export_to_markdown()
1262
+ mdtexts.append("\n" + md_table + "\n")
1263
+
1264
+ elif isinstance(item, PictureItem) and not strict_text:
1265
+ in_list = False
1266
+ mdtexts.append(item.caption_text(self))
1267
+
1268
+ if image_mode == ImageRefMode.PLACEHOLDER:
1269
+ mdtexts.append("\n" + image_placeholder + "\n")
1270
+ elif image_mode == ImageRefMode.EMBEDDED and isinstance(
1271
+ item.image, ImageRef
1272
+ ):
1273
+ text = f"![Local Image]({item.image.uri})\n"
1274
+ mdtexts.append(text)
1275
+ elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
1276
+ item.image, ImageRef
1277
+ ):
1278
+ text = (
1279
+ "<!-- 🖼️❌ Image not available. "
1280
+ "Please use `PdfPipelineOptions(generate_picture_images=True)`"
1281
+ " --> "
1282
+ )
1283
+ mdtexts.append(text)
1264
1284
 
1265
- if markdown_text:
1266
- md_texts.append(markdown_text)
1285
+ elif isinstance(item, DocItem) and item.label in labels:
1286
+ in_list = False
1287
+ text = "<missing-text>"
1288
+ mdtexts.append(text)
1267
1289
 
1268
- result = delim.join(md_texts)
1269
- return result
1290
+ mdtext = (delim.join(mdtexts)).strip()
1291
+ mdtext = re.sub(
1292
+ r"\n\n\n+", "\n\n", mdtext
1293
+ ) # remove cases of double or more empty lines.
1294
+ return mdtext
1270
1295
 
1271
1296
  def export_to_text( # noqa: C901
1272
1297
  self,
1273
1298
  delim: str = "\n\n",
1274
1299
  from_element: int = 0,
1275
- to_element: Optional[int] = None,
1300
+ to_element: int = 1000000,
1276
1301
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1277
1302
  ) -> str:
1278
1303
  """export_to_text."""
@@ -1399,6 +1424,121 @@ class DoclingDocument(BaseModel):
1399
1424
 
1400
1425
  return doctags
1401
1426
 
1427
+ def _export_to_indented_text(
1428
+ self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
1429
+ ):
1430
+ """Export the document to indented text to expose hierarchy."""
1431
+ result = []
1432
+
1433
+ def get_text(text: str, max_text_len: int):
1434
+
1435
+ middle = " ... "
1436
+
1437
+ if max_text_len == -1:
1438
+ return text
1439
+ elif len(text) < max_text_len + len(middle):
1440
+ return text
1441
+ else:
1442
+ tbeg = int((max_text_len - len(middle)) / 2)
1443
+ tend = int(max_text_len - tbeg)
1444
+
1445
+ return text[0:tbeg] + middle + text[-tend:]
1446
+
1447
+ for i, (item, level) in enumerate(self.iterate_items(with_groups=True)):
1448
+ if isinstance(item, GroupItem):
1449
+ result.append(
1450
+ indent * level
1451
+ + f"item-{i} at level {level}: {item.label}: group {item.name}"
1452
+ )
1453
+
1454
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
1455
+ text = get_text(text=item.text, max_text_len=max_text_len)
1456
+
1457
+ result.append(
1458
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1459
+ )
1460
+
1461
+ elif isinstance(item, SectionHeaderItem):
1462
+ text = get_text(text=item.text, max_text_len=max_text_len)
1463
+
1464
+ result.append(
1465
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1466
+ )
1467
+
1468
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
1469
+ text = get_text(text=item.text, max_text_len=max_text_len)
1470
+
1471
+ result.append(
1472
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1473
+ )
1474
+
1475
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
1476
+ # captions are printed in picture and table ... skipping for now
1477
+ continue
1478
+
1479
+ elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
1480
+ text = get_text(text=item.text, max_text_len=max_text_len)
1481
+
1482
+ result.append(
1483
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1484
+ )
1485
+
1486
+ elif isinstance(item, TextItem):
1487
+ text = get_text(text=item.text, max_text_len=max_text_len)
1488
+
1489
+ result.append(
1490
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1491
+ )
1492
+
1493
+ elif isinstance(item, TableItem):
1494
+
1495
+ result.append(
1496
+ indent * level
1497
+ + f"item-{i} at level {level}: {item.label} with "
1498
+ + f"[{item.data.num_rows}x{item.data.num_cols}]"
1499
+ )
1500
+
1501
+ for _ in item.captions:
1502
+ caption = _.resolve(self)
1503
+ result.append(
1504
+ indent * (level + 1)
1505
+ + f"item-{i} at level {level + 1}: {caption.label}: "
1506
+ + f"{caption.text}"
1507
+ )
1508
+
1509
+ if explicit_tables:
1510
+ grid: list[list[str]] = []
1511
+ for i, row in enumerate(item.data.grid):
1512
+ grid.append([])
1513
+ for j, cell in enumerate(row):
1514
+ if j < 10:
1515
+ text = get_text(text=cell.text, max_text_len=16)
1516
+ grid[-1].append(text)
1517
+
1518
+ result.append("\n" + tabulate(grid) + "\n")
1519
+
1520
+ elif isinstance(item, PictureItem):
1521
+
1522
+ result.append(
1523
+ indent * level + f"item-{i} at level {level}: {item.label}"
1524
+ )
1525
+
1526
+ for _ in item.captions:
1527
+ caption = _.resolve(self)
1528
+ result.append(
1529
+ indent * (level + 1)
1530
+ + f"item-{i} at level {level + 1}: {caption.label}: "
1531
+ + f"{caption.text}"
1532
+ )
1533
+
1534
+ elif isinstance(item, DocItem):
1535
+ result.append(
1536
+ indent * (level + 1)
1537
+ + f"item-{i} at level {level}: {item.label}: ignored"
1538
+ )
1539
+
1540
+ return "\n".join(result)
1541
+
1402
1542
  def add_page(
1403
1543
  self, page_no: int, size: Size, image: Optional[ImageRef] = None
1404
1544
  ) -> PageItem:
@@ -5,15 +5,18 @@
5
5
 
6
6
  """File-related utilities."""
7
7
 
8
+ import importlib
8
9
  import tempfile
9
10
  from pathlib import Path
10
- from typing import Union
11
+ from typing import Dict, Optional, Union
11
12
 
12
13
  import requests
13
14
  from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
15
 
15
16
 
16
- def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
17
+ def resolve_file_source(
18
+ source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
19
+ ) -> Path:
17
20
  """Resolves the source (URL, path) of a file to a local file path.
18
21
 
19
22
  If a URL is provided, the content is first downloaded to a temporary local file.
@@ -29,7 +32,17 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
29
32
  """
30
33
  try:
31
34
  http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
32
- res = requests.get(http_url, stream=True)
35
+
36
+ # make all header keys lower case
37
+ _headers = headers or {}
38
+ req_headers = {k.lower(): v for k, v in _headers.items()}
39
+ # add user-agent is not set
40
+ if "user-agent" not in req_headers:
41
+ agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
42
+ req_headers["user-agent"] = agent_name
43
+
44
+ # fetch the page
45
+ res = requests.get(http_url, stream=True, headers=req_headers)
33
46
  res.raise_for_status()
34
47
  fname = None
35
48
  # try to get filename from response header
@@ -41,7 +54,7 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
41
54
  break
42
55
  # otherwise, use name from URL:
43
56
  if fname is None:
44
- fname = Path(http_url.path or "file").name
57
+ fname = Path(http_url.path or "").name or "file"
45
58
  local_path = Path(tempfile.mkdtemp()) / fname
46
59
  with open(local_path, "wb") as f:
47
60
  for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.0.1"
3
+ version = "2.2.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes