docling-core 2.7.1.tar.gz → 2.9.0.tar.gz

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of docling-core has been flagged as potentially problematic.

Files changed (60)
  1. {docling_core-2.7.1 → docling_core-2.9.0}/PKG-INFO +4 -1
  2. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/transforms/chunker/__init__.py +1 -0
  3. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/transforms/chunker/base.py +34 -0
  4. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/transforms/chunker/hierarchical_chunker.py +1 -2
  5. docling_core-2.9.0/docling_core/transforms/chunker/hybrid_chunker.py +272 -0
  6. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/doc/document.py +42 -20
  7. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/base.py +1 -0
  8. docling_core-2.9.0/docling_core/utils/legacy.py +346 -0
  9. {docling_core-2.7.1 → docling_core-2.9.0}/pyproject.toml +7 -1
  10. {docling_core-2.7.1 → docling_core-2.9.0}/LICENSE +0 -0
  11. {docling_core-2.7.1 → docling_core-2.9.0}/README.md +0 -0
  12. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/__init__.py +0 -0
  13. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/py.typed +0 -0
  14. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  15. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  16. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  17. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  18. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  19. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  20. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  21. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  22. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/search/__init__.py +0 -0
  23. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  24. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/search/mapping.py +0 -0
  25. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/search/meta.py +0 -0
  26. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/search/package.py +0 -0
  27. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/transforms/__init__.py +0 -0
  28. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/base.py +0 -0
  30. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/doc/__init__.py +0 -0
  31. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/doc/base.py +0 -0
  32. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/doc/labels.py +0 -0
  33. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  40. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  41. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  42. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/document.py +0 -0
  43. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  44. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/nlp/__init__.py +0 -0
  45. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/nlp/qa.py +0 -0
  46. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/nlp/qa_labels.py +0 -0
  47. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/rec/__init__.py +0 -0
  48. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/rec/attribute.py +0 -0
  49. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/rec/base.py +0 -0
  50. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/rec/predicate.py +0 -0
  51. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/rec/record.py +0 -0
  52. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/rec/statement.py +0 -0
  53. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/rec/subject.py +0 -0
  54. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/utils/__init__.py +0 -0
  55. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/utils/alias.py +0 -0
  56. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/utils/file.py +0 -0
  57. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/utils/generate_docs.py +0 -0
  58. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/utils/generate_jsonschema.py +0 -0
  59. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/utils/validate.py +0 -0
  60. {docling_core-2.7.1 → docling_core-2.9.0}/docling_core/utils/validators.py +0 -0

{docling_core-2.7.1 → docling_core-2.9.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.7.1
+Version: 2.9.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -25,13 +25,16 @@ Classifier: Topic :: Database
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Typing :: Typed
+Provides-Extra: chunking
 Requires-Dist: jsonref (>=1.1.0,<2.0.0)
 Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pillow (>=10.3.0,<11.0.0)
 Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
 Requires-Dist: pyyaml (>=5.1,<7.0.0)
+Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
 Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
 Project-URL: Repository, https://github.com/DS4SD/docling-core
 Description-Content-Type: text/markdown

{docling_core-2.7.1 → docling_core-2.9.0}/docling_core/transforms/chunker/__init__.py
@@ -7,6 +7,7 @@
 
 from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
 from docling_core.transforms.chunker.hierarchical_chunker import (
+    DocChunk,
     DocMeta,
     HierarchicalChunker,
 )

{docling_core-2.7.1 → docling_core-2.9.0}/docling_core/transforms/chunker/base.py
@@ -4,6 +4,7 @@
 #
 
 """Define base classes for chunking."""
+import json
 from abc import ABC, abstractmethod
 from typing import Any, ClassVar, Iterator
 
@@ -11,6 +12,8 @@ from pydantic import BaseModel
 
 from docling_core.types.doc import DoclingDocument as DLDocument
 
+DFLT_DELIM = "\n"
+
 
 class BaseMeta(BaseModel):
     """Chunk metadata base class."""
@@ -45,6 +48,8 @@ class BaseChunk(BaseModel):
 class BaseChunker(BaseModel, ABC):
     """Chunker base class."""
 
+    delim: str = DFLT_DELIM
+
     @abstractmethod
     def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
         """Chunk the provided document.
@@ -59,3 +64,32 @@ class BaseChunker(BaseModel, ABC):
             Iterator[BaseChunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    def serialize(self, chunk: BaseChunk) -> str:
+        """Serialize the given chunk. This base implementation is embedding-targeted.
+
+        Args:
+            chunk: chunk to serialize
+
+        Returns:
+            str: the serialized form of the chunk
+        """
+        meta = chunk.meta.export_json_dict()
+
+        items = []
+        for k in meta:
+            if k not in chunk.meta.excluded_embed:
+                if isinstance(meta[k], list):
+                    items.append(
+                        self.delim.join(
+                            [
+                                d if isinstance(d, str) else json.dumps(d)
+                                for d in meta[k]
+                            ]
+                        )
+                    )
+                else:
+                    items.append(json.dumps(meta[k]))
+        items.append(chunk.text)
+
+        return self.delim.join(items)
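
The new serialize() joins every metadata value not listed in the meta's excluded_embed, followed by the chunk text, using the chunker's delim. A minimal behavior sketch (DemoMeta and DemoChunker are hypothetical names, not part of the release, and it assumes export_json_dict() omits unset fields):

    from typing import Iterator, Optional

    from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
    from docling_core.types.doc import DoclingDocument


    class DemoMeta(BaseMeta):  # hypothetical metadata carrying headings
        headings: Optional[list[str]] = None


    class DemoChunker(BaseChunker):  # hypothetical single-chunk chunker
        def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:
            yield BaseChunk(meta=DemoMeta(headings=["Intro"]), text="Some text.")


    chunker = DemoChunker()
    chunk = next(chunker.chunk(dl_doc=DoclingDocument(name="demo")))
    # expected (headings joined first, then the chunk text): "Intro\nSome text."
    print(chunker.serialize(chunk=chunk))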

{docling_core-2.7.1 → docling_core-2.9.0}/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -104,7 +104,7 @@ class DocMeta(BaseMeta):
 
 
 class DocChunk(BaseChunk):
-    """Data model for Hierarchical Chunker chunks."""
+    """Data model for document chunks."""
 
     meta: DocMeta
 
@@ -119,7 +119,6 @@ class HierarchicalChunker(BaseChunker):
     """
 
     merge_list_items: bool = True
-    delim: str = "\n"
 
     @classmethod
     def _triplet_serialize(cls, table_df: DataFrame) -> str:

docling_core-2.9.0/docling_core/transforms/chunker/hybrid_chunker.py (new file)
@@ -0,0 +1,272 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Hybrid chunker implementation leveraging both doc structure & token awareness."""
+
+import warnings
+from typing import Iterable, Iterator, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
+from typing_extensions import Self
+
+try:
+    import semchunk
+    from transformers import AutoTokenizer, PreTrainedTokenizerBase
+except ImportError:
+    raise RuntimeError(
+        "Module requires 'chunking' extra; to install, run: "
+        "`pip install 'docling-core[chunking]'`"
+    )
+
+from docling_core.transforms.chunker import (
+    BaseChunk,
+    BaseChunker,
+    DocChunk,
+    DocMeta,
+    HierarchicalChunker,
+)
+from docling_core.types import DoclingDocument
+from docling_core.types.doc.document import TextItem
+
+
+class HybridChunker(BaseChunker):
+    r"""Chunker doing tokenization-aware refinements on top of document layout chunking.
+
+    Args:
+        tokenizer: The tokenizer to use; either instantiated object or name or path of
+            respective pretrained model
+        max_tokens: The maximum number of tokens per chunk. If not set, limit is
+            resolved from the tokenizer
+        merge_peers: Whether to merge undersized chunks sharing same relevant metadata
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    tokenizer: Union[PreTrainedTokenizerBase, str]
+    max_tokens: int = None  # type: ignore[assignment]
+    merge_peers: bool = True
+
+    _inner_chunker: HierarchicalChunker = HierarchicalChunker()
+
+    @model_validator(mode="after")
+    def _patch_tokenizer_and_max_tokens(self) -> Self:
+        self._tokenizer = (
+            self.tokenizer
+            if isinstance(self.tokenizer, PreTrainedTokenizerBase)
+            else AutoTokenizer.from_pretrained(self.tokenizer)
+        )
+        if self.max_tokens is None:
+            self.max_tokens = TypeAdapter(PositiveInt).validate_python(
+                self._tokenizer.model_max_length
+            )
+        return self
+
+    def _count_tokens(self, text: Optional[Union[str, list[str]]]):
+        if text is None:
+            return 0
+        elif isinstance(text, list):
+            total = 0
+            for t in text:
+                total += self._count_tokens(t)
+            return total
+        return len(self._tokenizer.tokenize(text, max_length=None))
+
+    class _ChunkLengthInfo(BaseModel):
+        total_len: int
+        text_len: int
+        other_len: int
+
+    def _doc_chunk_length(self, doc_chunk: DocChunk):
+        text_length = self._count_tokens(doc_chunk.text)
+        headings_length = self._count_tokens(doc_chunk.meta.headings)
+        captions_length = self._count_tokens(doc_chunk.meta.captions)
+        total = text_length + headings_length + captions_length
+        return self._ChunkLengthInfo(
+            total_len=total,
+            text_len=text_length,
+            other_len=total - text_length,
+        )
+
+    def _make_chunk_from_doc_items(
+        self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int
+    ):
+        meta = DocMeta(
+            doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
+            headings=doc_chunk.meta.headings,
+            captions=doc_chunk.meta.captions,
+        )
+        new_chunk = DocChunk(text=window_text, meta=meta)
+        return new_chunk
+
+    def _merge_text(self, t1, t2):
+        if t1 == "":
+            return t2
+        elif t2 == "":
+            return t1
+        else:
+            return f"{t1}{self.delim}{t2}"
+
+    def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
+        if doc_chunk.meta.doc_items is None or len(doc_chunk.meta.doc_items) <= 1:
+            return [doc_chunk]
+        length = self._doc_chunk_length(doc_chunk)
+        if length.total_len <= self.max_tokens:
+            return [doc_chunk]
+        else:
+            chunks = []
+            window_start = 0
+            window_end = 0
+            window_text = ""
+            window_text_length = 0
+            other_length = length.other_len
+            num_items = len(doc_chunk.meta.doc_items)
+            while window_end < num_items:
+                doc_item = doc_chunk.meta.doc_items[window_end]
+                if isinstance(doc_item, TextItem):
+                    text = doc_item.text
+                else:
+                    raise RuntimeError("Non-TextItem split not implemented yet")
+                text_length = self._count_tokens(text)
+                if (
+                    text_length + window_text_length + other_length < self.max_tokens
+                    and window_end < num_items - 1
+                ):
+                    # Still room left to add more to this chunk AND still at least one
+                    # item left
+                    window_end += 1
+                    window_text_length += text_length
+                    window_text = self._merge_text(window_text, text)
+                elif text_length + window_text_length + other_length < self.max_tokens:
+                    # All the items in the window fit into the chunk and there are no
+                    # other items left
+                    window_text = self._merge_text(window_text, text)
+                    new_chunk = self._make_chunk_from_doc_items(
+                        doc_chunk, window_text, window_start, window_end
+                    )
+                    chunks.append(new_chunk)
+                    window_end = num_items
+                elif window_start == window_end:
+                    # Only one item in the window and it doesn't fit into the chunk. So
+                    # we'll just make it a chunk for now and it will get split in the
+                    # plain text splitter.
+                    window_text = self._merge_text(window_text, text)
+                    new_chunk = self._make_chunk_from_doc_items(
+                        doc_chunk, window_text, window_start, window_end
+                    )
+                    chunks.append(new_chunk)
+                    window_start = window_end + 1
+                    window_end = window_start
+                    window_text = ""
+                    window_text_length = 0
+                else:
+                    # Multiple items in the window but they don't fit into the chunk.
+                    # However, the existing items must have fit or we wouldn't have
+                    # gotten here. So we put everything but the last item into the chunk
+                    # and then start a new window INCLUDING the current window end.
+                    new_chunk = self._make_chunk_from_doc_items(
+                        doc_chunk, window_text, window_start, window_end - 1
+                    )
+                    chunks.append(new_chunk)
+                    window_start = window_end
+                    window_text = ""
+                    window_text_length = 0
+            return chunks
+
+    def _split_using_plain_text(
+        self,
+        doc_chunk: DocChunk,
+    ) -> list[DocChunk]:
+        lengths = self._doc_chunk_length(doc_chunk)
+        if lengths.total_len <= self.max_tokens:
+            return [DocChunk(**doc_chunk.export_json_dict())]
+        else:
+            # How much room is there for text after subtracting out the headers and
+            # captions:
+            available_length = self.max_tokens - lengths.other_len
+            sem_chunker = semchunk.chunkerify(
+                self._tokenizer, chunk_size=available_length
+            )
+            if available_length <= 0:
+                warnings.warn(
+                    f"Headers and captions for this chunk are longer than the total amount of size for the chunk, chunk will be ignored: {doc_chunk.text=}"  # noqa
+                )
+                return []
+            text = doc_chunk.text
+            segments = sem_chunker.chunk(text)
+            chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
+            return chunks
+
+    def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
+        output_chunks = []
+        window_start = 0
+        window_end = 0
+        num_chunks = len(chunks)
+        while window_end < num_chunks:
+            chunk = chunks[window_end]
+            lengths = self._doc_chunk_length(chunk)
+            headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
+            ready_to_append = False
+            if window_start == window_end:
+                # starting a new block of chunks to potentially merge
+                current_headings_and_captions = headings_and_captions
+                window_text = chunk.text
+                window_other_length = lengths.other_len
+                window_text_length = lengths.text_len
+                window_items = chunk.meta.doc_items
+                window_end += 1
+                first_chunk_of_window = chunk
+            elif (
+                headings_and_captions == current_headings_and_captions
+                and window_text_length + window_other_length + lengths.text_len
+                <= self.max_tokens
+            ):
+                # there is room to include the new chunk so add it to the window and
+                # continue
+                window_text = self._merge_text(window_text, chunk.text)
+                window_text_length += lengths.text_len
+                window_items = window_items + chunk.meta.doc_items
+                window_end += 1
+            else:
+                ready_to_append = True
+
+            if ready_to_append or window_end == num_chunks:
+                # no more room OR the start of new metadata. Either way, end the block
+                # and use the current window_end as the start of a new block
+                if window_start + 1 == window_end:
+                    # just one chunk so use it as is
+                    output_chunks.append(first_chunk_of_window)
+                else:
+                    new_meta = DocMeta(
+                        doc_items=window_items,
+                        headings=current_headings_and_captions[0],
+                        captions=current_headings_and_captions[1],
+                    )
+                    new_chunk = DocChunk(
+                        text=window_text,
+                        meta=new_meta,
+                    )
+                    output_chunks.append(new_chunk)
+                # no need to reset window_text, etc. because that will be reset in the
+                # next iteration in the if window_start == window_end block
+                window_start = window_end
+
+        return output_chunks
+
+    def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:
+        r"""Chunk the provided document.
+
+        Args:
+            dl_doc (DLDocument): document to chunk
+
+        Yields:
+            Iterator[Chunk]: iterator over extracted chunks
+        """
+        res: Iterable[DocChunk]
+        res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs)  # type: ignore
+        res = [x for c in res for x in self._split_by_doc_items(c)]
+        res = [x for c in res for x in self._split_using_plain_text(c)]
+        if self.merge_peers:
+            res = self._merge_chunks_with_matching_metadata(res)
+        return iter(res)
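
Taken together with the new extra, a minimal usage sketch might look as follows (the tokenizer model, token budget, and input file are assumptions; the import requires pip install 'docling-core[chunking]', as enforced by the guarded import above):

    from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
    from docling_core.types import DoclingDocument

    # assumption: "doc.json" holds a previously serialized DoclingDocument
    doc = DoclingDocument.load_from_json("doc.json")

    chunker = HybridChunker(
        tokenizer="sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
        max_tokens=256,  # optional; otherwise resolved from the tokenizer
        merge_peers=True,  # default: merge undersized chunks with matching metadata
    )

    for chunk in chunker.chunk(dl_doc=doc):
        # serialize() (inherited from BaseChunker) yields the embedding-targeted form
        embeddable_text = chunker.serialize(chunk=chunk)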

{docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/doc/document.py
@@ -380,6 +380,7 @@ class DocumentOrigin(BaseModel):
         "application/vnd.openxmlformats-officedocument.presentationml.template",
         "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
         "text/asciidoc",
         "text/markdown",
     ]
@@ -445,7 +446,7 @@ class ImageRef(BaseModel):
     mimetype: str
     dpi: int
     size: Size
-    uri: Union[AnyUrl, Path]
+    uri: Union[AnyUrl, Path] = Field(union_mode="left_to_right")
     _pil: Optional[PILImage.Image] = None
 
     @property
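
The union_mode="left_to_right" annotation makes pydantic try AnyUrl before Path, so string URIs keep validating as URLs instead of falling through to Path. A small sketch of the intended effect (field values are illustrative):

    from pydantic import AnyUrl

    from docling_core.types.doc.document import ImageRef

    ref = ImageRef.model_validate(
        {
            "mimetype": "image/png",
            "dpi": 72,
            "size": {"width": 100, "height": 100},
            "uri": "https://example.com/img.png",
        }
    )
    assert isinstance(ref.uri, AnyUrl)  # resolved as a URL, not a Path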
@@ -1668,7 +1669,7 @@ class DoclingDocument(BaseModel):
         self,
         root: Optional[NodeItem] = None,
         with_groups: bool = False,
-        traverse_pictures: bool = True,
+        traverse_pictures: bool = False,
         page_no: Optional[int] = None,
         _level: int = 0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
@@ -1685,30 +1686,31 @@
         if not root:
             root = self.body
 
+        # Yield non-group items or group items when with_groups=True
         if not isinstance(root, GroupItem) or with_groups:
             if isinstance(root, DocItem):
-                if page_no is not None:
-                    for prov in root.prov:
-                        if prov.page_no == page_no:
-                            yield root, _level
-                else:
+                if page_no is None or any(
+                    prov.page_no == page_no for prov in root.prov
+                ):
                     yield root, _level
             else:
                 yield root, _level
 
+        # Handle picture traversal - only traverse children if requested
+        if isinstance(root, PictureItem) and not traverse_pictures:
+            return
+
         # Traverse children
         for child_ref in root.children:
             child = child_ref.resolve(self)
-
             if isinstance(child, NodeItem):
-                # If the child is a NodeItem, recursively traverse it
-                if not isinstance(child, PictureItem) or traverse_pictures:
-                    yield from self.iterate_items(
-                        child,
-                        _level=_level + 1,
-                        with_groups=with_groups,
-                        page_no=page_no,
-                    )
+                yield from self.iterate_items(
+                    child,
+                    with_groups=with_groups,
+                    traverse_pictures=traverse_pictures,
+                    page_no=page_no,
+                    _level=_level + 1,
+                )
 
     def _clear_picture_pil_cache(self):
         """Clear cache storage of all images."""
@@ -1864,7 +1866,7 @@ class DoclingDocument(BaseModel):
 
         """
         with open(filename, "r") as f:
-            return cls.model_validate(json.loads(f.read()))
+            return cls.model_validate_json(f.read())
 
     def save_as_yaml(
         self,
@@ -2115,10 +2117,30 @@ class DoclingDocument(BaseModel):
         # Bold, Italic, or Bold-Italic
         # Hence, any underscore that we print into Markdown is coming from document text
         # That means we need to escape it, to properly reflect content in the markdown
+        # However, we need to preserve underscores in image URLs
+        # to maintain their validity
+        # For example: ![image](path/to_image.png) should remain unchanged
         def escape_underscores(text):
-            # Replace "_" with "\_" only if it's not already escaped
-            escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
-            return escaped_text
+            """Escape underscores but leave them intact in the URL.."""
+            # Firstly, identify all the URL patterns.
+            url_pattern = r"!\[.*?\]\((.*?)\)"
+            parts = []
+            last_end = 0
+
+            for match in re.finditer(url_pattern, text):
+                # Text to add before the URL (needs to be escaped)
+                before_url = text[last_end : match.start()]
+                parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
+
+                # Add the full URL part (do not escape)
+                parts.append(match.group(0))
+                last_end = match.end()
+
+            # Add the final part of the text (which needs to be escaped)
+            if last_end < len(text):
+                parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
+
+            return "".join(parts)
 
         mdtext = escape_underscores(mdtext)
 
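A standalone copy of the logic above, runnable outside the class, illustrates the behavior on a hypothetical input:

    import re


    def escape_underscores(text: str) -> str:
        """Standalone copy of the new escaping logic, for illustration."""
        url_pattern = r"!\[.*?\]\((.*?)\)"
        parts = []
        last_end = 0
        for match in re.finditer(url_pattern, text):
            # escape plain text before the image link
            parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end : match.start()]))
            # keep the image link untouched
            parts.append(match.group(0))
            last_end = match.end()
        # escape the remainder after the last link
        parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
        return "".join(parts)


    assert (
        escape_underscores("a_var and ![img](path/to_image.png)")
        == r"a\_var and ![img](path/to_image.png)"
    )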

{docling_core-2.7.1 → docling_core-2.9.0}/docling_core/types/legacy_doc/base.py
@@ -140,6 +140,7 @@ class BaseCell(AliasModel):
     obj_type: str = Field(
         alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
     )
+    payload: Optional[dict] = None
 
     def get_location_tokens(
         self,

docling_core-2.9.0/docling_core/utils/legacy.py (new file)
@@ -0,0 +1,346 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Utilities for converting between legacy and new document format."""
+
+import hashlib
+import uuid
+from typing import Union
+
+from docling_core.types.doc import (
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    PictureItem,
+    SectionHeaderItem,
+    TableCell,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.document import ListItem
+from docling_core.types.legacy_doc.base import (
+    BaseCell,
+    BaseText,
+    Figure,
+    GlmTableCell,
+    PageDimensions,
+    PageReference,
+    Prov,
+    Ref,
+)
+from docling_core.types.legacy_doc.base import Table as DsSchemaTable
+from docling_core.types.legacy_doc.base import TableCell as DsTableCell
+from docling_core.types.legacy_doc.document import (
+    CCSDocumentDescription as DsDocumentDescription,
+)
+from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
+from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
+
+
+def _create_hash(string: str):
+    hasher = hashlib.sha256()
+    hasher.update(string.encode("utf-8"))
+
+    return hasher.hexdigest()
+
+
+def doc_item_label_to_legacy_type(label: DocItemLabel):
+    """Convert the DocItemLabel to the legacy type."""
+    _label_to_ds_type = {
+        DocItemLabel.TITLE: "title",
+        DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
+        DocItemLabel.SECTION_HEADER: "subtitle-level-1",
+        DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
+        DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
+        DocItemLabel.CAPTION: "caption",
+        DocItemLabel.PAGE_HEADER: "page-header",
+        DocItemLabel.PAGE_FOOTER: "page-footer",
+        DocItemLabel.FOOTNOTE: "footnote",
+        DocItemLabel.TABLE: "table",
+        DocItemLabel.FORMULA: "equation",
+        DocItemLabel.LIST_ITEM: "paragraph",
+        DocItemLabel.CODE: "paragraph",
+        DocItemLabel.PICTURE: "figure",
+        DocItemLabel.TEXT: "paragraph",
+        DocItemLabel.PARAGRAPH: "paragraph",
+    }
+    if label in _label_to_ds_type:
+        return _label_to_ds_type[label]
+    return label.value
+
+
+def doc_item_label_to_legacy_name(label: DocItemLabel):
+    """Convert the DocItemLabel to the legacy name."""
+    _reverse_label_name_mapping = {
+        DocItemLabel.CAPTION: "Caption",
+        DocItemLabel.FOOTNOTE: "Footnote",
+        DocItemLabel.FORMULA: "Formula",
+        DocItemLabel.LIST_ITEM: "List-item",
+        DocItemLabel.PAGE_FOOTER: "Page-footer",
+        DocItemLabel.PAGE_HEADER: "Page-header",
+        DocItemLabel.PICTURE: "Picture",
+        DocItemLabel.SECTION_HEADER: "Section-header",
+        DocItemLabel.TABLE: "Table",
+        DocItemLabel.TEXT: "Text",
+        DocItemLabel.TITLE: "Title",
+        DocItemLabel.DOCUMENT_INDEX: "Document Index",
+        DocItemLabel.CODE: "Code",
+        DocItemLabel.CHECKBOX_SELECTED: "Checkbox-Selected",
+        DocItemLabel.CHECKBOX_UNSELECTED: "Checkbox-Unselected",
+        DocItemLabel.FORM: "Form",
+        DocItemLabel.KEY_VALUE_REGION: "Key-Value Region",
+        DocItemLabel.PARAGRAPH: "paragraph",
+    }
+    if label in _reverse_label_name_mapping:
+        return _reverse_label_name_mapping[label]
+    return label.value
+
+
+def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "file"):
+    """Convert a DoclingDocument to the legacy format."""
+    title = ""
+    desc: DsDocumentDescription = DsDocumentDescription(logs=[])
+
+    if doc.origin is not None:
+        document_hash = _create_hash(str(doc.origin.binary_hash))
+        filename = doc.origin.filename
+    else:
+        document_hash = _create_hash(str(uuid.uuid4()))
+        filename = fallback_filaname
+
+    page_hashes = [
+        PageReference(
+            hash=_create_hash(document_hash + ":" + str(p.page_no - 1)),
+            page=p.page_no,
+            model="default",
+        )
+        for p in doc.pages.values()
+    ]
+
+    file_info = DsFileInfoObject(
+        filename=filename,
+        document_hash=document_hash,
+        num_pages=len(doc.pages),
+        page_hashes=page_hashes,
+    )
+
+    main_text: list[Union[Ref, BaseText]] = []
+    tables: list[DsSchemaTable] = []
+    figures: list[Figure] = []
+    equations: list[BaseCell] = []
+    footnotes: list[BaseText] = []
+    page_headers: list[BaseText] = []
+    page_footers: list[BaseText] = []
+
+    # TODO: populate page_headers page_footers from doc.furniture
+
+    embedded_captions = set()
+    for ix, (item, level) in enumerate(doc.iterate_items(doc.body)):
+
+        if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
+            caption = item.caption_text(doc)
+            if caption:
+                embedded_captions.add(caption)
+
+    for item, level in doc.iterate_items():
+        if isinstance(item, DocItem):
+            item_type = item.label
+
+            if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
+
+                if isinstance(item, ListItem) and item.marker:
+                    text = f"{item.marker} {item.text}"
+                else:
+                    text = item.text
+
+                # Can be empty.
+                prov = [
+                    Prov(
+                        bbox=p.bbox.as_tuple(),
+                        page=p.page_no,
+                        span=[0, len(item.text)],
+                    )
+                    for p in item.prov
+                ]
+                main_text.append(
+                    BaseText(
+                        text=text,
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        name=doc_item_label_to_legacy_name(item.label),
+                        prov=prov,
+                    )
+                )
+
+                # skip captions of they are embedded in the actual
+                # floating object
+                if item_type == DocItemLabel.CAPTION and text in embedded_captions:
+                    continue
+
+            elif isinstance(item, TableItem) and item.data:
+                index = len(tables)
+                ref_str = f"#/tables/{index}"
+                main_text.append(
+                    Ref(
+                        name=doc_item_label_to_legacy_name(item.label),
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        ref=ref_str,
+                    ),
+                )
+
+                # Initialise empty table data grid (only empty cells)
+                table_data = [
+                    [
+                        DsTableCell(
+                            text="",
+                            # bbox=[0,0,0,0],
+                            spans=[[i, j]],
+                            obj_type="body",
+                        )
+                        for j in range(item.data.num_cols)
+                    ]
+                    for i in range(item.data.num_rows)
+                ]
+
+                # Overwrite cells in table data for which there is actual cell content.
+                for cell in item.data.table_cells:
+                    for i in range(
+                        min(cell.start_row_offset_idx, item.data.num_rows),
+                        min(cell.end_row_offset_idx, item.data.num_rows),
+                    ):
+                        for j in range(
+                            min(cell.start_col_offset_idx, item.data.num_cols),
+                            min(cell.end_col_offset_idx, item.data.num_cols),
+                        ):
+                            celltype = "body"
+                            if cell.column_header:
+                                celltype = "col_header"
+                            elif cell.row_header:
+                                celltype = "row_header"
+                            elif cell.row_section:
+                                celltype = "row_section"
+
+                            def _make_spans(cell: TableCell, table_item: TableItem):
+                                for rspan in range(
+                                    min(
+                                        cell.start_row_offset_idx,
+                                        table_item.data.num_rows,
+                                    ),
+                                    min(
+                                        cell.end_row_offset_idx,
+                                        table_item.data.num_rows,
+                                    ),
+                                ):
+                                    for cspan in range(
+                                        min(
+                                            cell.start_col_offset_idx,
+                                            table_item.data.num_cols,
+                                        ),
+                                        min(
+                                            cell.end_col_offset_idx,
+                                            table_item.data.num_cols,
+                                        ),
+                                    ):
+                                        yield [rspan, cspan]
+
+                            spans = list(_make_spans(cell, item))
+                            table_data[i][j] = GlmTableCell(
+                                text=cell.text,
+                                bbox=(
+                                    cell.bbox.as_tuple()
+                                    if cell.bbox is not None
+                                    else None
+                                ),  # check if this is bottom-left
+                                spans=spans,
+                                obj_type=celltype,
+                                col=j,
+                                row=i,
+                                row_header=cell.row_header,
+                                row_section=cell.row_section,
+                                col_header=cell.column_header,
+                                row_span=[
+                                    cell.start_row_offset_idx,
+                                    cell.end_row_offset_idx,
+                                ],
+                                col_span=[
+                                    cell.start_col_offset_idx,
+                                    cell.end_col_offset_idx,
+                                ],
+                            )
+
+                # Compute the caption
+                caption = item.caption_text(doc)
+
+                tables.append(
+                    DsSchemaTable(
+                        text=caption,
+                        num_cols=item.data.num_cols,
+                        num_rows=item.data.num_rows,
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        data=table_data,
+                        prov=[
+                            Prov(
+                                bbox=p.bbox.as_tuple(),
+                                page=p.page_no,
+                                span=[0, 0],
+                            )
+                            for p in item.prov
+                        ],
+                    )
+                )
+
+            elif isinstance(item, PictureItem):
+                index = len(figures)
+                ref_str = f"#/figures/{index}"
+                main_text.append(
+                    Ref(
+                        name=doc_item_label_to_legacy_name(item.label),
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        ref=ref_str,
+                    ),
+                )
+
+                # Compute the caption
+                caption = item.caption_text(doc)
+
+                figures.append(
+                    Figure(
+                        prov=[
+                            Prov(
+                                bbox=p.bbox.as_tuple(),
+                                page=p.page_no,
+                                span=[0, len(caption)],
+                            )
+                            for p in item.prov
+                        ],
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        text=caption,
+                        # data=[[]],
+                    )
+                )
+
+    page_dimensions = [
+        PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
+        for p in doc.pages.values()
+    ]
+
+    legacy_doc: DsDocument = DsDocument(
+        name=title,
+        description=desc,
+        file_info=file_info,
+        main_text=main_text,
+        equations=equations,
+        footnotes=footnotes,
+        page_headers=page_headers,
+        page_footers=page_footers,
+        tables=tables,
+        figures=figures,
+        page_dimensions=page_dimensions,
+    )
+
+    return legacy_doc
+
+
+# def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:
+#     """Convert a legacy document to DoclingDocument."""

{docling_core-2.7.1 → docling_core-2.9.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-core"
-version = "2.7.1"
+version = "2.9.0"
 description = "A python library to define and validate data types in Docling."
 license = "MIT"
 authors = [
@@ -55,6 +55,11 @@ pandas = "^2.1.4"
 pillow = "^10.3.0"
 pyyaml = ">=5.1,<7.0.0"
 typing-extensions = "^4.12.2"
+transformers = { version = "^4.34.0", optional = true }
+semchunk = { version = "^2.2.0", optional = true }
+
+[tool.poetry.extras]
+chunking = ["transformers", "semchunk"]
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.4.2"
@@ -121,6 +126,7 @@ module = [
     "jsonschema.*",
     "requests.*",
     "tabulate.*",
+    "transformers.*",
     "yaml.*",
 ]
 ignore_missing_imports = true