docling-core 2.23.2__tar.gz → 2.24.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (71) hide show
  1. {docling_core-2.23.2 → docling_core-2.24.0}/PKG-INFO +1 -1
  2. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/experimental/serializer/base.py +2 -2
  3. docling_core-2.24.0/docling_core/experimental/serializer/common.py +407 -0
  4. docling_core-2.24.0/docling_core/experimental/serializer/doctags.py +492 -0
  5. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/experimental/serializer/markdown.py +80 -43
  6. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/doc/document.py +412 -418
  7. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/doc/page.py +18 -6
  8. docling_core-2.24.0/docling_core/types/doc/tokens.py +292 -0
  9. {docling_core-2.23.2 → docling_core-2.24.0}/pyproject.toml +1 -1
  10. docling_core-2.23.2/docling_core/experimental/serializer/common.py +0 -353
  11. docling_core-2.23.2/docling_core/types/doc/tokens.py +0 -126
  12. {docling_core-2.23.2 → docling_core-2.24.0}/LICENSE +0 -0
  13. {docling_core-2.23.2 → docling_core-2.24.0}/README.md +0 -0
  14. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/__init__.py +0 -0
  15. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/cli/__init__.py +0 -0
  16. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/cli/view.py +0 -0
  17. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/experimental/__init__.py +0 -0
  18. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/experimental/serializer/__init__.py +0 -0
  19. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/py.typed +0 -0
  20. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  21. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  22. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  23. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  24. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  25. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  26. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  27. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  28. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/search/__init__.py +0 -0
  29. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  30. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/search/mapping.py +0 -0
  31. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/search/meta.py +0 -0
  32. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/search/package.py +0 -0
  33. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/transforms/__init__.py +0 -0
  34. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/transforms/chunker/__init__.py +0 -0
  35. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/transforms/chunker/base.py +0 -0
  36. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  37. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  38. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/__init__.py +0 -0
  39. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/base.py +0 -0
  40. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/doc/__init__.py +0 -0
  41. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/doc/base.py +0 -0
  42. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/doc/labels.py +0 -0
  43. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/doc/utils.py +0 -0
  44. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/gen/__init__.py +0 -0
  45. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/gen/generic.py +0 -0
  46. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/io/__init__.py +0 -0
  47. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  48. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/legacy_doc/base.py +0 -0
  49. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  50. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  51. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  52. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/legacy_doc/document.py +0 -0
  53. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  54. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/nlp/__init__.py +0 -0
  55. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/nlp/qa.py +0 -0
  56. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/nlp/qa_labels.py +0 -0
  57. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/rec/__init__.py +0 -0
  58. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/rec/attribute.py +0 -0
  59. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/rec/base.py +0 -0
  60. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/rec/predicate.py +0 -0
  61. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/rec/record.py +0 -0
  62. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/rec/statement.py +0 -0
  63. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/types/rec/subject.py +0 -0
  64. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/__init__.py +0 -0
  65. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/alias.py +0 -0
  66. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/file.py +0 -0
  67. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/generate_docs.py +0 -0
  68. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/generate_jsonschema.py +0 -0
  69. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/legacy.py +0 -0
  70. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/validate.py +0 -0
  71. {docling_core-2.23.2 → docling_core-2.24.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.23.2
3
+ Version: 2.24.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -197,7 +197,7 @@ class BaseDocSerializer(ABC):
197
197
  @abstractmethod
198
198
  def get_parts(
199
199
  self,
200
- node: Optional[NodeItem] = None,
200
+ item: Optional[NodeItem] = None,
201
201
  **kwargs,
202
202
  ) -> list[SerializationResult]:
203
203
  """Get the components to be combined for serializing this node."""
@@ -222,6 +222,6 @@ class BaseDocSerializer(ABC):
222
222
  ...
223
223
 
224
224
  @abstractmethod
225
- def get_excluded_refs(self) -> list[str]:
225
+ def get_excluded_refs(self, **kwargs) -> list[str]:
226
226
  """Get references to excluded items."""
227
227
  ...
@@ -0,0 +1,407 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2025
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define base classes for serialization."""
7
+ import sys
8
+ from abc import abstractmethod
9
+ from copy import deepcopy
10
+ from functools import cached_property
11
+ from pathlib import Path
12
+ from typing import Any, Optional, Union
13
+
14
+ from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
15
+ from typing_extensions import Self, override
16
+
17
+ from docling_core.experimental.serializer.base import (
18
+ BaseDocSerializer,
19
+ BaseFallbackSerializer,
20
+ BaseFormSerializer,
21
+ BaseInlineSerializer,
22
+ BaseKeyValueSerializer,
23
+ BaseListSerializer,
24
+ BasePictureSerializer,
25
+ BaseTableSerializer,
26
+ BaseTextSerializer,
27
+ SerializationResult,
28
+ )
29
+ from docling_core.types.doc.document import (
30
+ DOCUMENT_TOKENS_EXPORT_LABELS,
31
+ ContentLayer,
32
+ DocItem,
33
+ DoclingDocument,
34
+ FloatingItem,
35
+ Formatting,
36
+ FormItem,
37
+ InlineGroup,
38
+ KeyValueItem,
39
+ NodeItem,
40
+ OrderedList,
41
+ PictureItem,
42
+ TableItem,
43
+ TextItem,
44
+ UnorderedList,
45
+ )
46
+ from docling_core.types.doc.labels import DocItemLabel
47
+
48
# Default allowlists for CommonParams: the document module's token-export labels
# and every member of ContentLayer.
_DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
_DEFAULT_LAYERS = set(ContentLayer)
50
+
51
+
52
class CommonParams(BaseModel):
    """Common serialization parameters.

    The fields act as filters and switches that serializers consult when
    deciding which items to emit and how to post-process their text.
    """

    # Allowlists with non-recursive semantics: when a list group node falls
    # outside the range but some of its children are inside, those children
    # are still serialized.
    labels: set[DocItemLabel] = _DEFAULT_LABELS
    layers: set[ContentLayer] = _DEFAULT_LAYERS
    pages: Optional[set[int]] = None  # None means all pages are allowed

    # Slice-like window over the traversal index: start included, stop excluded.
    start_idx: NonNegativeInt = 0
    stop_idx: NonNegativeInt = sys.maxsize

    include_formatting: bool = True
    include_hyperlinks: bool = True
    caption_delim: str = " "

    def merge_with_patch(self, patch: dict[str, Any]) -> Self:
        """Return a new instance with the entries of ``patch`` layered over self.

        The current values are dumped to a dict, overridden by ``patch``, and the
        result is validated again, so ``self`` is left untouched.
        """
        merged: dict[str, Any] = self.model_dump()
        merged.update(patch)
        return self.model_validate(merged)
73
+
74
+
75
class DocSerializer(BaseModel, BaseDocSerializer):
    """Class for document serializers.

    Walks ``doc`` and hands each visited node to the serializer configured for
    its type (list, inline group, text, table, picture, key-value, form, or the
    fallback). Subclasses must implement ``serialize_page`` and
    ``serialize_doc``; the ``serialize_bold`` / ``serialize_italic`` /
    ``serialize_underline`` / ``serialize_strikethrough`` /
    ``serialize_hyperlink`` hooks return the text unchanged here and are meant
    to be overridden.
    """

    # Pydantic v2 configuration; supersedes the deprecated nested `class Config`.
    model_config = {"arbitrary_types_allowed": True, "extra": "forbid"}

    doc: DoclingDocument

    text_serializer: BaseTextSerializer
    table_serializer: BaseTableSerializer
    picture_serializer: BasePictureSerializer
    key_value_serializer: BaseKeyValueSerializer
    form_serializer: BaseFormSerializer
    fallback_serializer: BaseFallbackSerializer

    list_serializer: BaseListSerializer
    inline_serializer: BaseInlineSerializer

    params: CommonParams = CommonParams()

    # Memo for get_excluded_refs(), keyed by the JSON dump of the effective params.
    _excluded_refs_cache: dict[str, list[str]] = {}

    @computed_field  # type: ignore[misc]
    @cached_property
    def _captions_of_some_item(self) -> set[str]:
        """Collect the refs used as a caption by any floating item in the doc."""
        layers = set(ContentLayer)  # TODO review
        caption_refs: set[str] = set()
        for node, _ in self.doc.iterate_items(
            with_groups=True,
            traverse_pictures=True,
            included_content_layers=layers,
        ):
            if isinstance(node, FloatingItem):
                caption_refs.update(cap.cref for cap in node.captions)
        return caption_refs

    @override
    def get_excluded_refs(self, **kwargs) -> list[str]:
        """References to excluded items.

        ``kwargs`` are merged on top of ``self.params``. An item is excluded when
        its traversal index lies outside ``[start_idx, stop_idx)``, or when it is
        a ``DocItem`` whose label or content layer is not allowed, or (when
        ``pages`` is set) that has no provenance or whose first provenance entry
        is on a page that is not allowed. Results are memoized per effective
        parameter set.
        """
        params = self.params.merge_with_patch(patch=kwargs)
        cache_key = params.model_dump_json()
        refs = self._excluded_refs_cache.get(cache_key)
        if refs is None:
            refs = [
                item.self_ref
                for ix, (item, _) in enumerate(
                    self.doc.iterate_items(
                        with_groups=True,
                        traverse_pictures=True,
                        included_content_layers=params.layers,
                    )
                )
                if (
                    not (params.start_idx <= ix < params.stop_idx)
                    or (
                        isinstance(item, DocItem)
                        and (
                            item.label not in params.labels
                            or item.content_layer not in params.layers
                            or (
                                params.pages is not None
                                and (
                                    (not item.prov)
                                    or item.prov[0].page_no not in params.pages
                                )
                            )
                        )
                    )
                )
            ]
            self._excluded_refs_cache[cache_key] = refs
        return refs

    @abstractmethod
    def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
        """Serialize a page out of its parts."""
        ...

    @abstractmethod
    def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
        """Serialize a document out of its pages."""
        ...

    def _serialize_body(self) -> SerializationResult:
        """Serialize the document body.

        The body is split into page-sized index ranges, each range is serialized
        via ``get_parts`` + ``serialize_page``, and the page results are combined
        with ``serialize_doc``.
        """
        # find page ranges if available; otherwise regard whole doc as a single page
        last_page: Optional[int] = None
        starts: list[int] = []
        for ix, (item, _) in enumerate(
            self.doc.iterate_items(
                with_groups=True,
                traverse_pictures=True,
                included_content_layers=self.params.layers,
            )
        ):
            if isinstance(item, DocItem) and item.prov:
                page_no = item.prov[0].page_no
                # a page starts at the first item whose page number exceeds
                # every page number seen so far
                if last_page is None or page_no > last_page:
                    starts.append(ix)
                    last_page = page_no

        # The first range always begins at index 0 and the last one is open-ended;
        # with zero or one detected start this collapses to the whole range.
        bounds = [0, *starts[1:], sys.maxsize]
        page_ranges = list(zip(bounds, bounds[1:]))

        page_results: list[SerializationResult] = []
        for start_idx, stop_idx in page_ranges:
            params_to_pass = deepcopy(self.params)
            params_to_pass.start_idx = start_idx
            params_to_pass.stop_idx = stop_idx
            subparts = self.get_parts(**params_to_pass.model_dump())
            page_results.append(self.serialize_page(subparts))
        return self.serialize_doc(page_results)

    @override
    def serialize(
        self,
        *,
        item: Optional[NodeItem] = None,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize a given node.

        With ``item`` omitted (or equal to the document body) the whole body is
        serialized, unless the body is already in ``visited``, in which case an
        empty result is returned. Text items that serve as a caption of some
        floating item, or that are among the excluded refs, also yield an empty
        result. Every other node is delegated to the serializer for its type;
        no exclusion check is applied to those at this level.
        """
        my_visited: set[str] = visited if visited is not None else set()
        empty_res = SerializationResult(text="")
        if item is None or item == self.doc.body:
            if self.doc.body.self_ref not in my_visited:
                my_visited.add(self.doc.body.self_ref)
                return self._serialize_body()
            else:
                return empty_res

        ########
        # groups
        ########
        if isinstance(item, (UnorderedList, OrderedList)):
            part = self.list_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                list_level=list_level,
                is_inline_scope=is_inline_scope,
                visited=my_visited,
                **kwargs,
            )
        elif isinstance(item, InlineGroup):
            part = self.inline_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                list_level=list_level,
                visited=my_visited,
                **kwargs,
            )
        ###########
        # doc items
        ###########
        elif isinstance(item, TextItem):
            if item.self_ref in self._captions_of_some_item:
                # those captions will be handled by the floating item holding them
                return empty_res
            else:
                part = (
                    self.text_serializer.serialize(
                        item=item,
                        doc_serializer=self,
                        doc=self.doc,
                        is_inline_scope=is_inline_scope,
                        **kwargs,
                    )
                    if item.self_ref not in self.get_excluded_refs(**kwargs)
                    else empty_res
                )
        elif isinstance(item, TableItem):
            part = self.table_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        elif isinstance(item, PictureItem):
            part = self.picture_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                visited=my_visited,
                **kwargs,
            )
        elif isinstance(item, KeyValueItem):
            part = self.key_value_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        elif isinstance(item, FormItem):
            part = self.form_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        else:
            part = self.fallback_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        return part

    # making some assumptions about the kwargs it can pass
    @override
    def get_parts(
        self,
        item: Optional[NodeItem] = None,
        *,
        traverse_pictures: bool = False,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> list[SerializationResult]:
        """Get the components to be combined for serializing this node.

        Iterates the items under ``item`` (used as the traversal root), skips
        those whose ref is already in ``visited``, serializes the rest and keeps
        only results with non-empty text. ``visited`` is updated in place so
        nested calls share the same bookkeeping.
        """
        parts: list[SerializationResult] = []
        my_visited: set[str] = visited if visited is not None else set()
        params = self.params.merge_with_patch(patch=kwargs)
        # `node` (rather than re-using `item`) keeps the root parameter intact
        for node, _ in self.doc.iterate_items(
            root=item,
            with_groups=True,
            traverse_pictures=traverse_pictures,
            included_content_layers=params.layers,
        ):
            if node.self_ref in my_visited:
                continue
            my_visited.add(node.self_ref)
            part = self.serialize(
                item=node,
                list_level=list_level,
                is_inline_scope=is_inline_scope,
                visited=my_visited,
                **kwargs,
            )
            if part.text:
                parts.append(part)
        return parts

    @override
    def post_process(
        self,
        text: str,
        *,
        formatting: Optional[Formatting] = None,
        hyperlink: Optional[Union[AnyUrl, Path]] = None,
        **kwargs,
    ) -> str:
        """Apply some text post-processing steps.

        Formatting hooks are applied in the order bold, italic, underline,
        strikethrough, followed by the hyperlink hook; each step is gated by the
        corresponding ``include_*`` parameter (``kwargs`` override ``self.params``).
        """
        params = self.params.merge_with_patch(patch=kwargs)
        res = text
        if params.include_formatting and formatting:
            if formatting.bold:
                res = self.serialize_bold(text=res)
            if formatting.italic:
                res = self.serialize_italic(text=res)
            if formatting.underline:
                res = self.serialize_underline(text=res)
            if formatting.strikethrough:
                res = self.serialize_strikethrough(text=res)
        if params.include_hyperlinks and hyperlink:
            res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
        return res

    @override
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Hook for bold formatting serialization."""
        return text

    @override
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Hook for italic formatting serialization."""
        return text

    @override
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Hook for underline formatting serialization."""
        return text

    @override
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Hook for strikethrough formatting serialization."""
        return text

    @override
    def serialize_hyperlink(
        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
    ) -> str:
        """Hook for hyperlink serialization."""
        return text

    @override
    def serialize_captions(
        self,
        item: FloatingItem,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the item's captions.

        When the CAPTION label is allowed, the texts of the item's captions that
        resolve to a ``TextItem`` and are not excluded are joined with
        ``caption_delim`` and post-processed; otherwise the result text is empty.
        """
        params = self.params.merge_with_patch(patch=kwargs)
        if DocItemLabel.CAPTION in params.labels:
            # resolve the exclusion list once instead of once per caption
            excluded_refs = set(self.get_excluded_refs(**kwargs))
            text_parts: list[str] = [
                it.text
                for cap in item.captions
                if isinstance(it := cap.resolve(self.doc), TextItem)
                and it.self_ref not in excluded_refs
            ]
            text_res = params.caption_delim.join(text_parts)
            text_res = self.post_process(text=text_res)
        else:
            text_res = ""
        return SerializationResult(text=text_res)