docling-core 2.23.3__py3-none-any.whl → 2.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -197,7 +197,7 @@ class BaseDocSerializer(ABC):
197
197
  @abstractmethod
198
198
  def get_parts(
199
199
  self,
200
- node: Optional[NodeItem] = None,
200
+ item: Optional[NodeItem] = None,
201
201
  **kwargs,
202
202
  ) -> list[SerializationResult]:
203
203
  """Get the components to be combined for serializing this node."""
@@ -222,6 +222,6 @@ class BaseDocSerializer(ABC):
222
222
  ...
223
223
 
224
224
  @abstractmethod
225
- def get_excluded_refs(self) -> list[str]:
225
+ def get_excluded_refs(self, **kwargs) -> list[str]:
226
226
  """Get references to excluded items."""
227
227
  ...
@@ -5,12 +5,14 @@
5
5
 
6
6
  """Define base classes for serialization."""
7
7
  import sys
8
+ from abc import abstractmethod
9
+ from copy import deepcopy
8
10
  from functools import cached_property
9
11
  from pathlib import Path
10
- from typing import Optional, Union
12
+ from typing import Any, Optional, Union
11
13
 
12
- from pydantic import AnyUrl, BaseModel, computed_field
13
- from typing_extensions import override
14
+ from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
15
+ from typing_extensions import Self, override
14
16
 
15
17
  from docling_core.experimental.serializer.base import (
16
18
  BaseDocSerializer,
@@ -24,9 +26,7 @@ from docling_core.experimental.serializer.base import (
24
26
  BaseTextSerializer,
25
27
  SerializationResult,
26
28
  )
27
- from docling_core.types.doc.base import ImageRefMode
28
29
  from docling_core.types.doc.document import (
29
- DEFAULT_CONTENT_LAYERS,
30
30
  DOCUMENT_TOKENS_EXPORT_LABELS,
31
31
  ContentLayer,
32
32
  DocItem,
@@ -38,10 +38,7 @@ from docling_core.types.doc.document import (
38
38
  KeyValueItem,
39
39
  NodeItem,
40
40
  OrderedList,
41
- PictureClassificationData,
42
- PictureDescriptionData,
43
41
  PictureItem,
44
- PictureMoleculeData,
45
42
  TableItem,
46
43
  TextItem,
47
44
  UnorderedList,
@@ -49,6 +46,30 @@ from docling_core.types.doc.document import (
49
46
  from docling_core.types.doc.labels import DocItemLabel
50
47
 
51
48
  _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
49
+ _DEFAULT_LAYERS = {cl for cl in ContentLayer}
50
+
51
+
52
+ class CommonParams(BaseModel):
53
+ """Common serialization parameters."""
54
+
55
+ # allowlists with non-recursive semantics, i.e. if a list group node is outside the
56
+ # range and some of its children items are within, they will be serialized
57
+ labels: set[DocItemLabel] = _DEFAULT_LABELS
58
+ layers: set[ContentLayer] = _DEFAULT_LAYERS
59
+ pages: Optional[set[int]] = None # None means all pages are allowed
60
+
61
+ # slice-like semantics: start is included, stop is excluded
62
+ start_idx: NonNegativeInt = 0
63
+ stop_idx: NonNegativeInt = sys.maxsize
64
+
65
+ include_formatting: bool = True
66
+ include_hyperlinks: bool = True
67
+ caption_delim: str = " "
68
+
69
+ def merge_with_patch(self, patch: dict[str, Any]) -> Self:
70
+ """Create an instance by merging the provided patch dict on top of self."""
71
+ res = self.model_validate({**self.model_dump(), **patch})
72
+ return res
52
73
 
53
74
 
54
75
  class DocSerializer(BaseModel, BaseDocSerializer):
@@ -58,22 +79,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
58
79
  """Pydantic config."""
59
80
 
60
81
  arbitrary_types_allowed = True
82
+ extra = "forbid"
61
83
 
62
84
  doc: DoclingDocument
63
85
 
64
- include_formatting: bool = True
65
- include_hyperlinks: bool = True
66
- escape_underscores: bool = True
67
-
68
- # this filtering criteria are non-recursive;
69
- # e.g. if a list group node is outside the range and some of its children items are
70
- # within, they will be serialized
71
- start: int = 0
72
- stop: int = sys.maxsize
73
- labels: set[DocItemLabel] = _DEFAULT_LABELS
74
- layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS
75
- pages: Optional[set[int]] = None
76
-
77
86
  text_serializer: BaseTextSerializer
78
87
  table_serializer: BaseTableSerializer
79
88
  picture_serializer: BasePictureSerializer
@@ -84,150 +93,242 @@ class DocSerializer(BaseModel, BaseDocSerializer):
84
93
  list_serializer: BaseListSerializer
85
94
  inline_serializer: BaseInlineSerializer
86
95
 
87
- # these will be passed to the picture serializer (None defers/delegates fallback
88
- # setting to callee):
89
- image_placeholder: Optional[str] = None
90
- image_mode: Optional[ImageRefMode] = None
96
+ params: CommonParams = CommonParams()
97
+
98
+ _excluded_refs_cache: dict[str, list[str]] = {}
91
99
 
92
100
  @computed_field # type: ignore[misc]
93
101
  @cached_property
94
- def _excluded_refs(self) -> list[str]:
95
- refs: list[str] = [
96
- item.self_ref
97
- for ix, (item, _) in enumerate(
98
- self.doc.iterate_items(
99
- with_groups=True,
100
- traverse_pictures=True,
101
- )
102
+ def _captions_of_some_item(self) -> set[str]:
103
+ layers = {cl for cl in ContentLayer} # TODO review
104
+ refs = {
105
+ cap.cref
106
+ for (item, _) in self.doc.iterate_items(
107
+ with_groups=True,
108
+ traverse_pictures=True,
109
+ included_content_layers=layers,
102
110
  )
103
- if (
104
- (ix < self.start or ix >= self.stop)
105
- or (
106
- isinstance(item, DocItem)
107
- and (
108
- item.label not in self.labels
109
- or item.content_layer not in self.layers
110
- or (
111
- self.pages is not None
112
- and (
113
- (not item.prov)
114
- or item.prov[0].page_no not in self.pages
111
+ for cap in (item.captions if isinstance(item, FloatingItem) else [])
112
+ }
113
+ return refs
114
+
115
+ @override
116
+ def get_excluded_refs(self, **kwargs) -> list[str]:
117
+ """References to excluded items."""
118
+ params = self.params.merge_with_patch(patch=kwargs)
119
+ params_json = params.model_dump_json()
120
+ refs = self._excluded_refs_cache.get(params_json)
121
+ if refs is None:
122
+ refs = [
123
+ item.self_ref
124
+ for ix, (item, _) in enumerate(
125
+ self.doc.iterate_items(
126
+ with_groups=True,
127
+ traverse_pictures=True,
128
+ included_content_layers=params.layers,
129
+ )
130
+ )
131
+ if (
132
+ (ix < params.start_idx or ix >= params.stop_idx)
133
+ or (
134
+ isinstance(item, DocItem)
135
+ and (
136
+ item.label not in params.labels
137
+ or item.content_layer not in params.layers
138
+ or (
139
+ params.pages is not None
140
+ and (
141
+ (not item.prov)
142
+ or item.prov[0].page_no not in params.pages
143
+ )
115
144
  )
116
145
  )
117
146
  )
118
147
  )
119
- )
120
- ]
148
+ ]
149
+ self._excluded_refs_cache[params_json] = refs
121
150
  return refs
122
151
 
123
- @override
124
- def get_excluded_refs(self) -> list[str]:
125
- """References to excluded items."""
126
- return self._excluded_refs
152
+ @abstractmethod
153
+ def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
154
+ """Serialize a page out of its parts."""
155
+ ...
156
+
157
+ @abstractmethod
158
+ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
159
+ """Serialize a document out of its pages."""
160
+ ...
161
+
162
+ def _serialize_body(self) -> SerializationResult:
163
+ """Serialize the document body."""
164
+ # find page ranges if available; otherwise regard whole doc as a single page
165
+ last_page: Optional[int] = None
166
+ starts: list[int] = []
167
+ for ix, (item, _) in enumerate(
168
+ self.doc.iterate_items(
169
+ with_groups=True,
170
+ traverse_pictures=True,
171
+ included_content_layers=self.params.layers,
172
+ )
173
+ ):
174
+ if isinstance(item, DocItem):
175
+ if item.prov:
176
+ if last_page is None or item.prov[0].page_no > last_page:
177
+ starts.append(ix)
178
+ last_page = item.prov[0].page_no
179
+ page_ranges = [
180
+ (
181
+ (starts[i] if i > 0 else 0),
182
+ (starts[i + 1] if i < len(starts) - 1 else sys.maxsize),
183
+ )
184
+ for i, _ in enumerate(starts)
185
+ ] or [
186
+ (0, sys.maxsize)
187
+ ] # use whole range if no pages detected
188
+
189
+ page_results: list[SerializationResult] = []
190
+ for page_range in page_ranges:
191
+ params_to_pass = deepcopy(self.params)
192
+ params_to_pass.start_idx = page_range[0]
193
+ params_to_pass.stop_idx = page_range[1]
194
+ subparts = self.get_parts(**params_to_pass.model_dump())
195
+ page_res = self.serialize_page(subparts)
196
+ page_results.append(page_res)
197
+ res = self.serialize_doc(page_results)
198
+ return res
127
199
 
128
- # making some assumptions about the kwargs it can pass
129
200
  @override
130
- def get_parts(
201
+ def serialize(
131
202
  self,
132
- node: Optional[NodeItem] = None,
133
203
  *,
134
- traverse_pictures: bool = False,
204
+ item: Optional[NodeItem] = None,
135
205
  list_level: int = 0,
136
206
  is_inline_scope: bool = False,
137
207
  visited: Optional[set[str]] = None, # refs of visited items
138
208
  **kwargs,
139
- ) -> list[SerializationResult]:
140
- """Get the components to be combined for serializing this node."""
209
+ ) -> SerializationResult:
210
+ """Serialize a given node."""
141
211
  my_visited: set[str] = visited if visited is not None else set()
142
- parts: list[SerializationResult] = []
212
+ empty_res = SerializationResult(text="")
213
+ if item is None or item == self.doc.body:
214
+ if self.doc.body.self_ref not in my_visited:
215
+ my_visited.add(self.doc.body.self_ref)
216
+ return self._serialize_body()
217
+ else:
218
+ return empty_res
143
219
 
144
- label_blocklist = {
145
- DocItemLabel.CAPTION,
146
- DocItemLabel.FOOTNOTE,
147
- # TODO handle differently as it clashes with self.labels
148
- }
149
- for ix, (item, _) in enumerate(
150
- self.doc.iterate_items(
151
- root=node,
152
- with_groups=True,
153
- traverse_pictures=traverse_pictures,
154
- # ...
220
+ ########
221
+ # groups
222
+ ########
223
+ if isinstance(item, (UnorderedList, OrderedList)):
224
+ part = self.list_serializer.serialize(
225
+ item=item,
226
+ doc_serializer=self,
227
+ doc=self.doc,
228
+ list_level=list_level,
229
+ is_inline_scope=is_inline_scope,
230
+ visited=my_visited,
231
+ **kwargs,
155
232
  )
156
- ):
157
- if item.self_ref in my_visited:
158
- continue
233
+ elif isinstance(item, InlineGroup):
234
+ part = self.inline_serializer.serialize(
235
+ item=item,
236
+ doc_serializer=self,
237
+ doc=self.doc,
238
+ list_level=list_level,
239
+ visited=my_visited,
240
+ **kwargs,
241
+ )
242
+ ###########
243
+ # doc items
244
+ ###########
245
+ elif isinstance(item, TextItem):
246
+ if item.self_ref in self._captions_of_some_item:
247
+ # those captions will be handled by the floating item holding them
248
+ return empty_res
159
249
  else:
160
- my_visited.add(item.self_ref)
161
-
162
- ########
163
- # groups
164
- ########
165
- if isinstance(item, (UnorderedList, OrderedList)):
166
- part = self.list_serializer.serialize(
167
- item=item,
168
- doc_serializer=self,
169
- doc=self.doc,
170
- list_level=list_level,
171
- is_inline_scope=is_inline_scope,
172
- visited=my_visited,
173
- )
174
- elif isinstance(item, InlineGroup):
175
- part = self.inline_serializer.serialize(
176
- item=item,
177
- doc_serializer=self,
178
- doc=self.doc,
179
- list_level=list_level,
180
- visited=my_visited,
181
- )
182
- ###########
183
- # doc items
184
- ###########
185
- elif isinstance(item, DocItem) and item.label in label_blocklist:
186
- continue
187
- elif isinstance(item, TextItem):
188
250
  part = (
189
251
  self.text_serializer.serialize(
190
252
  item=item,
191
253
  doc_serializer=self,
192
254
  doc=self.doc,
193
255
  is_inline_scope=is_inline_scope,
256
+ **kwargs,
194
257
  )
195
- if item.self_ref not in self.get_excluded_refs()
196
- else SerializationResult(text="")
197
- )
198
- elif isinstance(item, TableItem):
199
- part = self.table_serializer.serialize(
200
- item=item,
201
- doc_serializer=self,
202
- doc=self.doc,
203
- )
204
- elif isinstance(item, PictureItem):
205
- part = self.picture_serializer.serialize(
206
- item=item,
207
- doc_serializer=self,
208
- doc=self.doc,
209
- visited=my_visited,
210
- image_mode=self.image_mode,
211
- image_placeholder=self.image_placeholder,
212
- )
213
- elif isinstance(item, KeyValueItem):
214
- part = self.key_value_serializer.serialize(
215
- item=item,
216
- doc_serializer=self,
217
- doc=self.doc,
218
- )
219
- elif isinstance(item, FormItem):
220
- part = self.form_serializer.serialize(
221
- item=item,
222
- doc_serializer=self,
223
- doc=self.doc,
258
+ if item.self_ref not in self.get_excluded_refs(**kwargs)
259
+ else empty_res
224
260
  )
261
+ elif isinstance(item, TableItem):
262
+ part = self.table_serializer.serialize(
263
+ item=item,
264
+ doc_serializer=self,
265
+ doc=self.doc,
266
+ **kwargs,
267
+ )
268
+ elif isinstance(item, PictureItem):
269
+ part = self.picture_serializer.serialize(
270
+ item=item,
271
+ doc_serializer=self,
272
+ doc=self.doc,
273
+ visited=my_visited,
274
+ **kwargs,
275
+ )
276
+ elif isinstance(item, KeyValueItem):
277
+ part = self.key_value_serializer.serialize(
278
+ item=item,
279
+ doc_serializer=self,
280
+ doc=self.doc,
281
+ **kwargs,
282
+ )
283
+ elif isinstance(item, FormItem):
284
+ part = self.form_serializer.serialize(
285
+ item=item,
286
+ doc_serializer=self,
287
+ doc=self.doc,
288
+ **kwargs,
289
+ )
290
+ else:
291
+ part = self.fallback_serializer.serialize(
292
+ item=item,
293
+ doc_serializer=self,
294
+ doc=self.doc,
295
+ **kwargs,
296
+ )
297
+ return part
298
+
299
+ # making some assumptions about the kwargs it can pass
300
+ @override
301
+ def get_parts(
302
+ self,
303
+ item: Optional[NodeItem] = None,
304
+ *,
305
+ traverse_pictures: bool = False,
306
+ list_level: int = 0,
307
+ is_inline_scope: bool = False,
308
+ visited: Optional[set[str]] = None, # refs of visited items
309
+ **kwargs,
310
+ ) -> list[SerializationResult]:
311
+ """Get the components to be combined for serializing this node."""
312
+ parts: list[SerializationResult] = []
313
+ my_visited: set[str] = visited if visited is not None else set()
314
+ params = self.params.merge_with_patch(patch=kwargs)
315
+ for item, _ in self.doc.iterate_items(
316
+ root=item,
317
+ with_groups=True,
318
+ traverse_pictures=traverse_pictures,
319
+ included_content_layers=params.layers,
320
+ ):
321
+ if item.self_ref in my_visited:
322
+ continue
225
323
  else:
226
- part = self.fallback_serializer.serialize(
227
- item=item,
228
- doc_serializer=self,
229
- doc=self.doc,
230
- )
324
+ my_visited.add(item.self_ref)
325
+ part = self.serialize(
326
+ item=item,
327
+ list_level=list_level,
328
+ is_inline_scope=is_inline_scope,
329
+ visited=my_visited,
330
+ **kwargs,
331
+ )
231
332
  if part.text:
232
333
  parts.append(part)
233
334
  return parts
@@ -242,8 +343,9 @@ class DocSerializer(BaseModel, BaseDocSerializer):
242
343
  **kwargs,
243
344
  ) -> str:
244
345
  """Apply some text post-processing steps."""
346
+ params = self.params.merge_with_patch(patch=kwargs)
245
347
  res = text
246
- if self.include_formatting and formatting:
348
+ if params.include_formatting and formatting:
247
349
  if formatting.bold:
248
350
  res = self.serialize_bold(text=res)
249
351
  if formatting.italic:
@@ -252,7 +354,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
252
354
  res = self.serialize_underline(text=res)
253
355
  if formatting.strikethrough:
254
356
  res = self.serialize_strikethrough(text=res)
255
- if self.include_hyperlinks and hyperlink:
357
+ if params.include_hyperlinks and hyperlink:
256
358
  res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
257
359
  return res
258
360
 
@@ -287,67 +389,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
287
389
  def serialize_captions(
288
390
  self,
289
391
  item: FloatingItem,
290
- separator: Optional[str] = None,
291
392
  **kwargs,
292
393
  ) -> SerializationResult:
293
394
  """Serialize the item's captions."""
294
- text_parts: list[str] = [
295
- it.text
296
- for cap in item.captions
297
- if isinstance(it := cap.resolve(self.doc), TextItem)
298
- and it.self_ref not in self.get_excluded_refs()
299
- ]
300
- text_res = (separator or "\n").join(text_parts)
301
- text_res = self.post_process(text=text_res)
302
- return SerializationResult(text=text_res)
303
-
304
-
305
- class PictureSerializer(BasePictureSerializer):
306
- """Class for picture serializers."""
307
-
308
- # helper function
309
- def _serialize_content(
310
- self,
311
- item: PictureItem,
312
- doc_serializer: "BaseDocSerializer",
313
- doc: DoclingDocument,
314
- separator: Optional[str] = None,
315
- visited: Optional[set[str]] = None,
316
- **kwargs,
317
- ) -> SerializationResult:
318
- parts = doc_serializer.get_parts(
319
- node=item,
320
- traverse_pictures=True,
321
- visited=visited,
322
- )
323
- text_res = (separator or " ").join([p.text for p in parts])
324
- # NOTE: we do no postprocessing since already done as needed
325
- return SerializationResult(text=text_res)
326
-
327
- # helper function
328
- def _serialize_annotations(
329
- self,
330
- item: PictureItem,
331
- doc_serializer: "BaseDocSerializer",
332
- doc: DoclingDocument,
333
- separator: Optional[str] = None,
334
- **kwargs,
335
- ) -> SerializationResult:
336
- text_parts: list[str] = []
337
- for annotation in item.annotations:
338
- if isinstance(annotation, PictureClassificationData):
339
- predicted_class = (
340
- annotation.predicted_classes[0].class_name
341
- if annotation.predicted_classes
342
- else None
343
- )
344
- if predicted_class is not None:
345
- text_parts.append(f"Picture type: {predicted_class}")
346
- elif isinstance(annotation, PictureMoleculeData):
347
- text_parts.append(f"SMILES: {annotation.smi}")
348
- elif isinstance(annotation, PictureDescriptionData):
349
- text_parts.append(f"Description: {annotation.text}")
350
-
351
- text_res = (separator or "\n").join(text_parts)
352
- text_res = doc_serializer.post_process(text=text_res)
395
+ params = self.params.merge_with_patch(patch=kwargs)
396
+ if DocItemLabel.CAPTION in params.labels:
397
+ text_parts: list[str] = [
398
+ it.text
399
+ for cap in item.captions
400
+ if isinstance(it := cap.resolve(self.doc), TextItem)
401
+ and it.self_ref not in self.get_excluded_refs(**kwargs)
402
+ ]
403
+ text_res = params.caption_delim.join(text_parts)
404
+ text_res = self.post_process(text=text_res)
405
+ else:
406
+ text_res = ""
353
407
  return SerializationResult(text=text_res)