docling-core 2.0.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (56) hide show
  1. {docling_core-2.0.0 → docling_core-2.1.0}/PKG-INFO +12 -12
  2. {docling_core-2.0.0 → docling_core-2.1.0}/README.md +11 -11
  3. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/__init__.py +4 -1
  4. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/base.py +1 -1
  5. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/hierarchical_chunker.py +58 -6
  6. docling_core-2.1.0/docling_core/types/__init__.py +10 -0
  7. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/__init__.py +1 -1
  8. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/base.py +7 -0
  9. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/document.py +246 -111
  10. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/generate_docs.py +1 -1
  11. {docling_core-2.0.0 → docling_core-2.1.0}/pyproject.toml +1 -1
  12. docling_core-2.0.0/docling_core/types/__init__.py +0 -29
  13. {docling_core-2.0.0 → docling_core-2.1.0}/LICENSE +0 -0
  14. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/__init__.py +0 -0
  15. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/py.typed +0 -0
  16. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  17. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  18. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  19. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  20. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  21. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  22. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  23. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  24. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/__init__.py +0 -0
  25. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  26. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/mapping.py +0 -0
  27. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/meta.py +0 -0
  28. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/package.py +0 -0
  29. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/__init__.py +0 -0
  30. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/base.py +0 -0
  31. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/labels.py +0 -0
  32. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/gen/__init__.py +0 -0
  33. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/gen/generic.py +0 -0
  34. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  35. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/base.py +0 -0
  36. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  37. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  38. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  39. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/document.py +0 -0
  40. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  41. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/nlp/__init__.py +0 -0
  42. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/nlp/qa.py +0 -0
  43. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/nlp/qa_labels.py +0 -0
  44. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/__init__.py +0 -0
  45. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/attribute.py +0 -0
  46. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/base.py +0 -0
  47. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/predicate.py +0 -0
  48. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/record.py +0 -0
  49. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/statement.py +0 -0
  50. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/subject.py +0 -0
  51. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/__init__.py +0 -0
  52. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/alias.py +0 -0
  53. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/file.py +0 -0
  54. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/generate_jsonschema.py +0 -0
  55. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/validate.py +0 -0
  56. {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.0.0
3
+ Version: 2.1.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -72,20 +72,20 @@ poetry run pytest test
72
72
  - You can validate your JSON objects using the pydantic class definition.
73
73
 
74
74
  ```py
75
- from docling_core.types import Document
75
+ from docling_core.types import DoclingDocument
76
76
 
77
77
  data_dict = {...} # here the object you want to validate, as a dictionary
78
- Document.model_validate(data_dict)
78
+ DoclingDocument.model_validate(data_dict)
79
79
 
80
80
  data_str = {...} # here the object as a JSON string
81
- Document.model_validate_json(data_str)
81
+ DoclingDocument.model_validate_json(data_str)
82
82
  ```
83
83
 
84
84
  - You can generate the JSON schema of a model with the script `generate_jsonschema`.
85
85
 
86
86
  ```py
87
- # for the `Document` type
88
- generate_jsonschema Document
87
+ # for the `DoclingDocument` type
88
+ generate_jsonschema DoclingDocument
89
89
 
90
90
  # for the `Record` type
91
91
  generate_jsonschema Record
@@ -93,16 +93,16 @@ poetry run pytest test
93
93
 
94
94
  ## Documentation
95
95
 
96
- Docling supports 3 main data types:
96
+ Docling Core contains 3 top-level data types:
97
97
 
98
- - **Document** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
99
- The Document type also models the metadata that may be attached to the converted document.
100
- Check [Document](docs/Document.json) for the full JSON schema.
98
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
99
+ The DoclingDocument type also models the metadata that may be attached to the converted document.
100
+ Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
101
101
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
102
102
  Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
103
- Check [Record](docs/Record.json) for the full JSON schema.
103
+ Check [Record](docs/Record.json) for the full JSON schema.
104
104
  - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
105
- Check [Generic](docs/Generic.json) for the full JSON schema.
105
+ Check [Generic](docs/Generic.json) for the full JSON schema.
106
106
 
107
107
  The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
108
108
 
@@ -36,20 +36,20 @@ poetry run pytest test
36
36
  - You can validate your JSON objects using the pydantic class definition.
37
37
 
38
38
  ```py
39
- from docling_core.types import Document
39
+ from docling_core.types import DoclingDocument
40
40
 
41
41
  data_dict = {...} # here the object you want to validate, as a dictionary
42
- Document.model_validate(data_dict)
42
+ DoclingDocument.model_validate(data_dict)
43
43
 
44
44
  data_str = {...} # here the object as a JSON string
45
- Document.model_validate_json(data_str)
45
+ DoclingDocument.model_validate_json(data_str)
46
46
  ```
47
47
 
48
48
  - You can generate the JSON schema of a model with the script `generate_jsonschema`.
49
49
 
50
50
  ```py
51
- # for the `Document` type
52
- generate_jsonschema Document
51
+ # for the `DoclingDocument` type
52
+ generate_jsonschema DoclingDocument
53
53
 
54
54
  # for the `Record` type
55
55
  generate_jsonschema Record
@@ -57,16 +57,16 @@ poetry run pytest test
57
57
 
58
58
  ## Documentation
59
59
 
60
- Docling supports 3 main data types:
60
+ Docling Core contains 3 top-level data types:
61
61
 
62
- - **Document** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
63
- The Document type also models the metadata that may be attached to the converted document.
64
- Check [Document](docs/Document.json) for the full JSON schema.
62
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
63
+ The DoclingDocument type also models the metadata that may be attached to the converted document.
64
+ Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
65
65
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
66
66
  Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
67
- Check [Record](docs/Record.json) for the full JSON schema.
67
+ Check [Record](docs/Record.json) for the full JSON schema.
68
68
  - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
69
- Check [Generic](docs/Generic.json) for the full JSON schema.
69
+ Check [Generic](docs/Generic.json) for the full JSON schema.
70
70
 
71
71
  The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
72
72
 
@@ -6,4 +6,7 @@
6
6
  """Define the chunker types."""
7
7
 
8
8
  from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9
- from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
9
+ from docling_core.transforms.chunker.hierarchical_chunker import (
10
+ DocMeta,
11
+ HierarchicalChunker,
12
+ )
@@ -13,7 +13,7 @@ from docling_core.types.doc import DoclingDocument as DLDocument
13
13
 
14
14
 
15
15
  class BaseMeta(BaseModel):
16
- """Metadata base class."""
16
+ """Chunk metadata base class."""
17
17
 
18
18
  excluded_embed: ClassVar[list[str]] = []
19
19
  excluded_llm: ClassVar[list[str]] = []
@@ -8,15 +8,19 @@
8
8
  from __future__ import annotations
9
9
 
10
10
  import logging
11
- from typing import Any, ClassVar, Iterator, Optional
11
+ import re
12
+ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
12
13
 
13
14
  from pandas import DataFrame
14
- from pydantic import Field
15
+ from pydantic import Field, StringConstraints, field_validator
16
+ from typing_extensions import Annotated
15
17
 
18
+ from docling_core.search.package import VERSION_PATTERN
16
19
  from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
17
- from docling_core.types.doc import DoclingDocument as DLDocument
20
+ from docling_core.types import DoclingDocument as DLDocument
18
21
  from docling_core.types.doc.document import (
19
22
  DocItem,
23
+ DocumentOrigin,
20
24
  LevelNumber,
21
25
  ListItem,
22
26
  SectionHeaderItem,
@@ -25,16 +29,31 @@ from docling_core.types.doc.document import (
25
29
  )
26
30
  from docling_core.types.doc.labels import DocItemLabel
27
31
 
32
+ _VERSION: Final = "1.0.0"
33
+
34
+ _KEY_SCHEMA_NAME = "schema_name"
35
+ _KEY_VERSION = "version"
28
36
  _KEY_DOC_ITEMS = "doc_items"
29
37
  _KEY_HEADINGS = "headings"
30
38
  _KEY_CAPTIONS = "captions"
39
+ _KEY_ORIGIN = "origin"
31
40
 
32
41
  _logger = logging.getLogger(__name__)
33
42
 
34
43
 
35
44
  class DocMeta(BaseMeta):
36
- """Data model for Hierarchical Chunker metadata."""
45
+ """Data model for Hierarchical Chunker chunk metadata."""
37
46
 
47
+ schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
48
+ default="docling_core.transforms.chunker.DocMeta",
49
+ alias=_KEY_SCHEMA_NAME,
50
+ )
51
+ version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
52
+ Field(
53
+ default=_VERSION,
54
+ alias=_KEY_VERSION,
55
+ )
56
+ )
38
57
  doc_items: list[DocItem] = Field(
39
58
  alias=_KEY_DOC_ITEMS,
40
59
  min_length=1,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
49
68
  alias=_KEY_CAPTIONS,
50
69
  min_length=1,
51
70
  )
71
+ origin: Optional[DocumentOrigin] = Field(
72
+ default=None,
73
+ alias=_KEY_ORIGIN,
74
+ )
52
75
 
53
- excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
54
- excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
76
+ excluded_embed: ClassVar[list[str]] = [
77
+ _KEY_SCHEMA_NAME,
78
+ _KEY_VERSION,
79
+ _KEY_DOC_ITEMS,
80
+ _KEY_ORIGIN,
81
+ ]
82
+ excluded_llm: ClassVar[list[str]] = [
83
+ _KEY_SCHEMA_NAME,
84
+ _KEY_VERSION,
85
+ _KEY_DOC_ITEMS,
86
+ _KEY_ORIGIN,
87
+ ]
88
+
89
@field_validator(_KEY_VERSION)
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this meta item version is compatible with current version.

    A version is accepted when it matches ``VERSION_PATTERN``, has the same
    major component as ``_VERSION`` and a minor component that is not newer
    than the one of ``_VERSION``.

    :param v: the version string carried by the metadata being validated.
    :returns: ``_VERSION``; an accepted value is replaced by the current
        schema version.
    :raises ValueError: if ``v`` does not match ``VERSION_PATTERN`` or is
        incompatible with ``_VERSION``.
    """
    current_match = re.match(VERSION_PATTERN, _VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        or doc_match["major"] != current_match["major"]
        # Regex groups are strings; compare numerically, otherwise the order
        # is lexicographic and e.g. "10" > "9" evaluates to False.
        # NOTE(review): assumes the "minor" group captures decimal digits
        # only -- confirm against VERSION_PATTERN.
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
    return _VERSION
55
104
 
56
105
 
57
106
  class DocChunk(BaseChunk):
@@ -129,6 +178,7 @@ class HierarchicalChunker(BaseChunker):
129
178
  for k in sorted(heading_by_level)
130
179
  ]
131
180
  or None,
181
+ origin=dl_doc.origin,
132
182
  ),
133
183
  )
134
184
  list_items = [] # reset
@@ -171,6 +221,7 @@ class HierarchicalChunker(BaseChunker):
171
221
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
172
222
  or None,
173
223
  captions=captions,
224
+ origin=dl_doc.origin,
174
225
  ),
175
226
  )
176
227
  yield c
@@ -182,5 +233,6 @@ class HierarchicalChunker(BaseChunker):
182
233
  doc_items=list_items,
183
234
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
184
235
  or None,
236
+ origin=dl_doc.origin,
185
237
  ),
186
238
  )
@@ -0,0 +1,10 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the main types."""
7
+
8
+ from docling_core.types.doc.document import DoclingDocument
9
+ from docling_core.types.gen.generic import Generic
10
+ from docling_core.types.rec.record import Record
@@ -5,7 +5,7 @@
5
5
 
6
6
  """Package for models defined by the Document type."""
7
7
 
8
- from .base import BoundingBox, CoordOrigin, Size
8
+ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
10
  DocItem,
11
11
  DoclingDocument,
@@ -7,6 +7,13 @@ from typing import Tuple
7
7
  from pydantic import BaseModel
8
8
 
9
9
 
10
class ImageRefMode(str, Enum):
    """Enumerate the ways an image can be referenced in an export."""

    # A placeholder string stands in for the image.
    PLACEHOLDER = "placeholder"
    # The image itself is referenced from the exported text.
    EMBEDDED = "embedded"
15
+
16
+
10
17
  class CoordOrigin(str, Enum):
11
18
  """CoordOrigin."""
12
19
 
@@ -3,6 +3,7 @@
3
3
  import base64
4
4
  import mimetypes
5
5
  import re
6
+ import sys
6
7
  import typing
7
8
  from io import BytesIO
8
9
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -25,6 +26,7 @@ from typing_extensions import Annotated, Self
25
26
  from docling_core.search.package import VERSION_PATTERN
26
27
  from docling_core.types.base import _JSON_POINTER_REGEX
27
28
  from docling_core.types.doc import BoundingBox, Size
29
+ from docling_core.types.doc.base import ImageRefMode
28
30
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
29
31
  from docling_core.types.legacy_doc.tokens import DocumentToken
30
32
 
@@ -214,6 +216,8 @@ class DocumentOrigin(BaseModel):
214
216
  "application/vnd.openxmlformats-officedocument.presentationml.template",
215
217
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
216
218
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
219
+ "text/asciidoc",
220
+ "text/markdown",
217
221
  ]
218
222
 
219
223
  @field_validator("binary_hash", mode="before")
@@ -1107,12 +1111,14 @@ class DoclingDocument(BaseModel):
1107
1111
 
1108
1112
  def export_to_markdown( # noqa: C901
1109
1113
  self,
1110
- delim: str = "\n\n",
1114
+ delim: str = "\n",
1111
1115
  from_element: int = 0,
1112
- to_element: Optional[int] = None,
1116
+ to_element: int = sys.maxsize,
1113
1117
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1114
1118
  strict_text: bool = False,
1115
1119
  image_placeholder: str = "<!-- image -->",
1120
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1121
+ indent: int = 4,
1116
1122
  ) -> str:
1117
1123
  r"""Serialize to Markdown.
1118
1124
 
@@ -1142,136 +1148,150 @@ class DoclingDocument(BaseModel):
1142
1148
  :param strict_text: bool: (Default value = False)
1143
1149
  :param image_placeholder str: (Default value = "<!-- image -->")
1144
1150
  the placeholder to include to position images in the markdown.
1151
+ :param indent: int (default=4): indent of the nested lists
1145
1152
  :returns: The exported Markdown representation.
1146
1153
  :rtype: str
1147
1154
  """
1148
- has_title = False
1149
- prev_text = ""
1150
- md_texts: list[str] = []
1155
+ mdtexts: list[str] = []
1156
+ list_nesting_level = 0 # Track the current list nesting level
1157
+ previous_level = 0 # Track the previous item's level
1158
+ in_list = False # Track if we're currently processing list items
1151
1159
 
1152
- # collect all captions embedded in table and figure objects
1153
- # to avoid repeating them
1154
- embedded_captions = set()
1155
- skip_count = 0
1156
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1157
- if skip_count < from_element:
1158
- skip_count += 1
1159
- continue # skip as many items as you want
1160
-
1161
- if to_element and ix >= to_element:
1162
- break
1160
+ for ix, (item, level) in enumerate(
1161
+ self.iterate_items(self.body, with_groups=True)
1162
+ ):
1163
+ # If we've moved to a lower level, we're exiting one or more groups
1164
+ if level < previous_level:
1165
+ # Calculate how many levels we've exited
1166
+ level_difference = previous_level - level
1167
+ # Decrement list_nesting_level for each list group we've exited
1168
+ list_nesting_level = max(0, list_nesting_level - level_difference)
1163
1169
 
1164
- if (
1165
- isinstance(item, (TableItem, PictureItem))
1166
- and len(item.captions) > 0
1167
- and item.label in labels
1168
- ):
1169
- caption = item.caption_text(self)
1170
- if caption:
1171
- embedded_captions.add(caption)
1170
+ previous_level = level # Update previous_level for next iteration
1172
1171
 
1173
- skip_count = 0
1174
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1175
- if skip_count < from_element:
1176
- skip_count += 1
1172
+ if ix < from_element and to_element <= ix:
1177
1173
  continue # skip as many items as you want
1178
1174
 
1179
- if to_element and ix >= to_element:
1180
- break
1181
-
1182
- markdown_text = ""
1183
-
1184
- if isinstance(item, DocItem):
1185
- item_type = item.label
1186
-
1187
- if isinstance(item, TextItem) and item_type in labels:
1188
- text = item.text
1189
-
1190
- # skip captions of they are embedded in the actual
1191
- # floating object
1192
- if item_type == DocItemLabel.CAPTION and text in embedded_captions:
1193
- continue
1194
-
1195
- # ignore repeated text
1196
- if prev_text == text or text is None:
1197
- continue
1198
- else:
1199
- prev_text = text
1200
-
1201
- # first title match
1202
- if item_type == DocItemLabel.TITLE and not has_title:
1203
- if strict_text:
1204
- markdown_text = f"{text}"
1205
- else:
1206
- markdown_text = f"# {text}"
1207
- has_title = True
1208
-
1209
- # secondary titles
1210
- elif item_type in {
1211
- DocItemLabel.TITLE,
1212
- DocItemLabel.SECTION_HEADER,
1213
- } or (has_title and item_type == DocItemLabel.TITLE):
1214
- if strict_text:
1215
- markdown_text = f"{text}"
1216
- else:
1217
- markdown_text = f"## {text}"
1218
-
1219
- # secondary titles
1220
- elif isinstance(item, ListItem):
1221
- if item.enumerated:
1222
- marker = item.marker
1223
- else:
1224
- marker = "-"
1225
-
1226
- markdown_text = f"{marker} {text}"
1227
-
1228
- # normal text
1229
- else:
1230
- markdown_text = text
1231
-
1232
- elif isinstance(item, TableItem) and item.data and item_type in labels:
1233
- parts = []
1234
-
1235
- # Compute the caption
1236
- if caption := item.caption_text(self):
1237
- parts.append(caption)
1238
- parts.append("\n")
1175
+ # Handle newlines between different types of content
1176
+ if (
1177
+ len(mdtexts) > 0
1178
+ and not isinstance(item, (ListItem, GroupItem))
1179
+ and in_list
1180
+ ):
1181
+ mdtexts[-1] += "\n"
1182
+ in_list = False
1239
1183
 
1240
- # Rendered the item
1241
- if not strict_text:
1242
- md_table = item.export_to_markdown()
1243
- if md_table:
1244
- parts.append(item.export_to_markdown())
1184
+ if isinstance(item, GroupItem) and item.label in [
1185
+ GroupLabel.LIST,
1186
+ GroupLabel.ORDERED_LIST,
1187
+ ]:
1245
1188
 
1246
- # Combine parts
1247
- markdown_text = "\n".join(parts)
1189
+ if list_nesting_level == 0: # Check if we're on the top level.
1190
+ # In that case a new list starts directly after another list.
1191
+ mdtexts.append("\n") # Add a blank line
1248
1192
 
1249
- elif isinstance(item, PictureItem) and item_type in labels:
1250
- parts = []
1193
+ # Increment list nesting level when entering a new list
1194
+ list_nesting_level += 1
1195
+ in_list = True
1196
+ continue
1251
1197
 
1252
- # Compute the caption
1253
- if caption := item.caption_text(self):
1254
- parts.append(caption)
1255
- parts.append("\n")
1198
+ elif isinstance(item, GroupItem):
1199
+ continue
1256
1200
 
1257
- # Rendered the item
1258
- if not strict_text:
1259
- parts.append(f"{image_placeholder}")
1201
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
1202
+ in_list = False
1203
+ marker = "" if strict_text else "#"
1204
+ text = f"{marker} {item.text}\n"
1205
+ mdtexts.append(text.strip())
1206
+
1207
+ elif (
1208
+ isinstance(item, TextItem)
1209
+ and item.label in [DocItemLabel.SECTION_HEADER]
1210
+ ) or isinstance(item, SectionHeaderItem):
1211
+ in_list = False
1212
+ marker = ""
1213
+ if not strict_text:
1214
+ marker = "#" * level
1215
+ if len(marker) < 2:
1216
+ marker = "##"
1217
+ text = f"{marker} {item.text}\n"
1218
+ mdtexts.append(text.strip() + "\n")
1219
+
1220
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
1221
+ in_list = False
1222
+ text = f"```\n{item.text}\n```\n"
1223
+ mdtexts.append(text)
1224
+
1225
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
1226
+ # captions are printed in picture and table ... skipping for now
1227
+ continue
1260
1228
 
1261
- # Combine parts
1262
- markdown_text = "\n".join(parts)
1229
+ elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
1230
+ in_list = True
1231
+ # Calculate indent based on list_nesting_level
1232
+ # -1 because level 1 needs no indent
1233
+ list_indent = " " * (indent * (list_nesting_level - 1))
1234
+
1235
+ marker = ""
1236
+ if strict_text:
1237
+ marker = ""
1238
+ elif item.enumerated:
1239
+ marker = item.marker
1240
+ else:
1241
+ marker = "-" # Markdown needs only dash as item marker.
1242
+
1243
+ text = f"{list_indent}{marker} {item.text}"
1244
+ mdtexts.append(text)
1245
+
1246
+ elif isinstance(item, TextItem) and item.label in labels:
1247
+ in_list = False
1248
+ if len(item.text):
1249
+ text = f"{item.text}\n"
1250
+ mdtexts.append(text)
1251
+
1252
+ elif isinstance(item, TableItem) and not strict_text:
1253
+ in_list = False
1254
+ mdtexts.append(item.caption_text(self))
1255
+ md_table = item.export_to_markdown()
1256
+ mdtexts.append("\n" + md_table + "\n")
1257
+
1258
+ elif isinstance(item, PictureItem) and not strict_text:
1259
+ in_list = False
1260
+ mdtexts.append(item.caption_text(self))
1261
+
1262
+ if image_mode == ImageRefMode.PLACEHOLDER:
1263
+ mdtexts.append("\n" + image_placeholder + "\n")
1264
+ elif image_mode == ImageRefMode.EMBEDDED and isinstance(
1265
+ item.image, ImageRef
1266
+ ):
1267
+ text = f"![Local Image]({item.image.uri})\n"
1268
+ mdtexts.append(text)
1269
+ elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
1270
+ item.image, ImageRef
1271
+ ):
1272
+ text = (
1273
+ "<!-- 🖼️❌ Image not available. "
1274
+ "Please use `PdfPipelineOptions(generate_picture_images=True)`"
1275
+ " --> "
1276
+ )
1277
+ mdtexts.append(text)
1263
1278
 
1264
- if markdown_text:
1265
- md_texts.append(markdown_text)
1279
+ elif isinstance(item, DocItem) and item.label in labels:
1280
+ in_list = False
1281
+ text = "<missing-text>"
1282
+ mdtexts.append(text)
1266
1283
 
1267
- result = delim.join(md_texts)
1268
- return result
1284
+ mdtext = (delim.join(mdtexts)).strip()
1285
+ mdtext = re.sub(
1286
+ r"\n\n\n+", "\n\n", mdtext
1287
+ ) # remove cases of double or more empty lines.
1288
+ return mdtext
1269
1289
 
1270
1290
  def export_to_text( # noqa: C901
1271
1291
  self,
1272
1292
  delim: str = "\n\n",
1273
1293
  from_element: int = 0,
1274
- to_element: Optional[int] = None,
1294
+ to_element: int = 1000000,
1275
1295
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1276
1296
  ) -> str:
1277
1297
  """export_to_text."""
@@ -1398,6 +1418,121 @@ class DoclingDocument(BaseModel):
1398
1418
 
1399
1419
  return doctags
1400
1420
 
1421
def _export_to_indented_text(
    self, indent: str = " ", max_text_len: int = -1, explicit_tables: bool = False
) -> str:
    """Export the document to indented text to expose hierarchy.

    Every item yielded by ``iterate_items(with_groups=True)`` is turned into
    a line of the form ``item-<index> at level <level>: <label>: ...`` that
    is prefixed with ``indent`` repeated per level. Text items labelled as
    captions are skipped in the main listing; the captions referenced by a
    table or picture are listed one level below that item.

    :param indent: string repeated per hierarchy level at the line start.
    :param max_text_len: ``-1`` keeps texts unchanged; otherwise texts of at
        least ``max_text_len + len(" ... ")`` characters are shortened to
        their beginning and end, joined by ``" ... "``.
    :param explicit_tables: if True, each table grid is also rendered with
        ``tabulate`` (first ten columns only, cell texts shortened with a
        limit of 16).
    :returns: the generated lines joined by newlines.
    """
    result = []

    def shorten(text: str, limit: int) -> str:
        """Return ``text`` unchanged or with its middle replaced."""
        middle = " ... "

        if limit == -1 or len(text) < limit + len(middle):
            return text

        tbeg = int((limit - len(middle)) / 2)
        tend = int(limit - tbeg)
        return text[0:tbeg] + middle + text[-tend:]

    def caption_lines(owner, position: int, owner_level: int) -> list[str]:
        """Build one line per caption referenced by ``owner``."""
        lines = []
        for caption_ref in owner.captions:
            caption = caption_ref.resolve(self)
            lines.append(
                indent * (owner_level + 1)
                + f"item-{position} at level {owner_level + 1}: {caption.label}: "
                + f"{caption.text}"
            )
        return lines

    for i, (item, level) in enumerate(self.iterate_items(with_groups=True)):
        head = f"item-{i} at level {level}: "

        if isinstance(item, GroupItem):
            result.append(
                indent * level + head + f"{item.label}: group {item.name}"
            )

        elif (
            (isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE])
            or isinstance(item, SectionHeaderItem)
            or (isinstance(item, TextItem) and item.label in [DocItemLabel.CODE])
        ):
            text = shorten(item.text, max_text_len)
            result.append(indent * level + head + f"{item.label}: {text}")

        elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
            # captions are printed in picture and table ... skipping for now
            continue

        elif (
            isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]
        ) or isinstance(item, TextItem):
            text = shorten(item.text, max_text_len)
            result.append(indent * level + head + f"{item.label}: {text}")

        elif isinstance(item, TableItem):
            result.append(
                indent * level
                + head
                + f"{item.label} with "
                + f"[{item.data.num_rows}x{item.data.num_cols}]"
            )
            result.extend(caption_lines(item, i, level))

            if explicit_tables:
                # The row loop must not reuse `i`: that name is the index of
                # the current document item.
                grid: list[list[str]] = [
                    [
                        shorten(cell.text, 16)
                        for col, cell in enumerate(row)
                        if col < 10
                    ]
                    for row in item.data.grid
                ]
                result.append("\n" + tabulate(grid) + "\n")

        elif isinstance(item, PictureItem):
            result.append(indent * level + head + f"{item.label}")
            result.extend(caption_lines(item, i, level))

        elif isinstance(item, DocItem):
            # Items without a dedicated rendering are only mentioned; the
            # line is indented one step deeper than the reported level.
            result.append(indent * (level + 1) + head + f"{item.label}: ignored")

    return "\n".join(result)
1535
+
1401
1536
  def add_page(
1402
1537
  self, page_no: int, size: Size, image: Optional[ImageRef] = None
1403
1538
  ) -> PageItem:
@@ -18,7 +18,7 @@ from typing import Final
18
18
 
19
19
  from docling_core.utils.generate_jsonschema import generate_json_schema
20
20
 
21
- MODELS: Final = ["Document", "Record", "Generic"]
21
+ MODELS: Final = ["DoclingDocument", "Record", "Generic"]
22
22
 
23
23
 
24
24
  def _prepare_directory(folder: str, clean: bool = False) -> None:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.0.0"
3
+ version = "2.1.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -1,29 +0,0 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
- """Define the main types."""
7
-
8
- from docling_core.types.gen.generic import Generic # noqa
9
- from docling_core.types.legacy_doc.base import BoundingBox # noqa
10
- from docling_core.types.legacy_doc.base import Table # noqa
11
- from docling_core.types.legacy_doc.base import TableCell # noqa
12
- from docling_core.types.legacy_doc.base import ( # noqa
13
- BaseCell,
14
- BaseText,
15
- PageDimensions,
16
- PageReference,
17
- Prov,
18
- Ref,
19
- )
20
- from docling_core.types.legacy_doc.document import ( # noqa
21
- CCSDocumentDescription as DocumentDescription,
22
- )
23
- from docling_core.types.legacy_doc.document import ( # noqa
24
- CCSFileInfoObject as FileInfoObject,
25
- )
26
- from docling_core.types.legacy_doc.document import ( # noqa
27
- ExportedCCSDocument as Document,
28
- )
29
- from docling_core.types.rec.record import Record # noqa
File without changes