docling-core 2.48.4__py3-none-any.whl → 2.50.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

docling_core/cli/view.py CHANGED
@@ -39,9 +39,17 @@ def view(
39
39
  typer.Argument(
40
40
  ...,
41
41
  metavar="source",
42
- help="Docling JSON file to view.",
42
+ help="Docling JSON or YAML file to view.",
43
43
  ),
44
44
  ],
45
+ split_view: Annotated[
46
+ bool,
47
+ typer.Option(
48
+ "--split-view",
49
+ "-s",
50
+ help="Split view of the document.",
51
+ ),
52
+ ] = False,
45
53
  version: Annotated[
46
54
  Optional[bool],
47
55
  typer.Option(
@@ -52,11 +60,19 @@ def view(
52
60
  ),
53
61
  ] = None,
54
62
  ):
55
- """Display a Docling JSON file on the default browser."""
63
+ """Display a DoclingDocument file on the default browser."""
56
64
  path = resolve_source_to_path(source=source)
57
- doc = DoclingDocument.load_from_json(filename=path)
58
- target_path = Path(tempfile.mkdtemp()) / "out.html"
59
- html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
65
+ if path.suffix == ".json":
66
+ doc = DoclingDocument.load_from_json(filename=path)
67
+ elif path.suffix in [".yaml", ".yml"]:
68
+ doc = DoclingDocument.load_from_yaml(filename=path)
69
+ else:
70
+ raise ValueError(f"Unsupported file type: {path.suffix}")
71
+ target_path = Path(tempfile.mkdtemp()) / f"{path.stem}.html"
72
+ html_output = doc.export_to_html(
73
+ image_mode=ImageRefMode.EMBEDDED,
74
+ split_page_view=split_view,
75
+ )
60
76
  with open(target_path, "w", encoding="utf-8") as f:
61
77
  f.write(html_output)
62
78
  webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
docling_core/transforms/serializer/base.py CHANGED
@@ -9,6 +9,7 @@ from pathlib import Path
9
9
  from typing import Any, Optional, Union
10
10
 
11
11
  from pydantic import AnyUrl, BaseModel
12
+ from typing_extensions import deprecated
12
13
 
13
14
  from docling_core.types.doc.document import (
14
15
  DocItem,
@@ -258,6 +259,7 @@ class BaseDocSerializer(ABC):
258
259
  """Serialize the item's captions."""
259
260
  ...
260
261
 
262
+ @deprecated("Use serialize_meta() instead.")
261
263
  @abstractmethod
262
264
  def serialize_annotations(
263
265
  self,
@@ -267,6 +269,15 @@ class BaseDocSerializer(ABC):
267
269
  """Serialize the item's annotations."""
268
270
  ...
269
271
 
272
+ @abstractmethod
273
+ def serialize_meta(
274
+ self,
275
+ item: NodeItem,
276
+ **kwargs: Any,
277
+ ) -> SerializationResult:
278
+ """Serialize the item's meta."""
279
+ ...
280
+
270
281
  @abstractmethod
271
282
  def get_excluded_refs(self, **kwargs: Any) -> set[str]:
272
283
  """Get references to excluded items."""
@@ -287,6 +298,26 @@ class BaseSerializerProvider(ABC):
287
298
  ...
288
299
 
289
300
 
301
+ class BaseMetaSerializer(ABC):
302
+ """Base class for meta serializers."""
303
+
304
+ @abstractmethod
305
+ def serialize(
306
+ self,
307
+ *,
308
+ item: NodeItem,
309
+ doc: DoclingDocument,
310
+ **kwargs: Any,
311
+ ) -> SerializationResult:
312
+ """Serializes the meta of the passed item."""
313
+ ...
314
+
315
+ def _humanize_text(self, text: str, title: bool = False) -> str:
316
+ tmp = text.replace("__", "_").replace("_", " ")
317
+ return tmp.title() if title else tmp.capitalize()
318
+
319
+
320
+ @deprecated("Use BaseMetaSerializer() instead.")
290
321
  class BaseAnnotationSerializer(ABC):
291
322
  """Base class for annotation serializers."""
292
323
 
@@ -4,6 +4,7 @@
4
4
  #
5
5
 
6
6
  """Define base classes for serialization."""
7
+ import logging
7
8
  import re
8
9
  import sys
9
10
  from abc import abstractmethod
@@ -11,7 +12,14 @@ from functools import cached_property
11
12
  from pathlib import Path
12
13
  from typing import Any, Iterable, Optional, Tuple, Union
13
14
 
14
- from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_field
15
+ from pydantic import (
16
+ AnyUrl,
17
+ BaseModel,
18
+ ConfigDict,
19
+ Field,
20
+ NonNegativeInt,
21
+ computed_field,
22
+ )
15
23
  from typing_extensions import Self, override
16
24
 
17
25
  from docling_core.transforms.serializer.base import (
@@ -22,6 +30,7 @@ from docling_core.transforms.serializer.base import (
22
30
  BaseInlineSerializer,
23
31
  BaseKeyValueSerializer,
24
32
  BaseListSerializer,
33
+ BaseMetaSerializer,
25
34
  BasePictureSerializer,
26
35
  BaseTableSerializer,
27
36
  BaseTextSerializer,
@@ -56,6 +65,9 @@ _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
56
65
  _DEFAULT_LAYERS = {cl for cl in ContentLayer}
57
66
 
58
67
 
68
+ _logger = logging.getLogger(__name__)
69
+
70
+
59
71
  class _PageBreakNode(NodeItem):
60
72
  """Page break node."""
61
73
 
@@ -76,11 +88,11 @@ def _iterate_items(
76
88
  traverse_pictures: bool = False,
77
89
  add_page_breaks: bool = False,
78
90
  visited: Optional[set[str]] = None,
79
- ):
91
+ ) -> Iterable[Tuple[NodeItem, int]]:
80
92
  my_visited: set[str] = visited if visited is not None else set()
81
93
  prev_page_nr: Optional[int] = None
82
94
  page_break_i = 0
83
- for item, _ in doc.iterate_items(
95
+ for item, lvl in doc.iterate_items(
84
96
  root=node,
85
97
  with_groups=True,
86
98
  included_content_layers=layers,
@@ -93,7 +105,7 @@ def _iterate_items(
93
105
  ):
94
106
  # if group starts with new page, yield page break before group node
95
107
  my_visited.add(item.self_ref)
96
- for it in _iterate_items(
108
+ for it, _ in _iterate_items(
97
109
  doc=doc,
98
110
  layers=layers,
99
111
  node=item,
@@ -108,7 +120,7 @@ def _iterate_items(
108
120
  self_ref=f"#/pb/{page_break_i}",
109
121
  prev_page=prev_page_nr,
110
122
  next_page=page_no,
111
- )
123
+ ), lvl
112
124
  break
113
125
  elif isinstance(item, DocItem) and item.prov:
114
126
  page_no = item.prov[0].page_no
@@ -118,10 +130,10 @@ def _iterate_items(
118
130
  self_ref=f"#/pb/{page_break_i}",
119
131
  prev_page=prev_page_nr,
120
132
  next_page=page_no,
121
- )
133
+ ), lvl
122
134
  page_break_i += 1
123
135
  prev_page_nr = page_no
124
- yield item
136
+ yield item, lvl
125
137
 
126
138
 
127
139
  def _get_annotation_text(
@@ -188,9 +200,22 @@ class CommonParams(BaseModel):
188
200
  start_idx: NonNegativeInt = 0
189
201
  stop_idx: NonNegativeInt = sys.maxsize
190
202
 
203
+ include_non_meta: bool = True
204
+
191
205
  include_formatting: bool = True
192
206
  include_hyperlinks: bool = True
193
207
  caption_delim: str = " "
208
+ use_legacy_annotations: bool = Field(
209
+ default=False, description="Use legacy annotation serialization."
210
+ )
211
+ allowed_meta_names: Optional[set[str]] = Field(
212
+ default=None,
213
+ description="Meta name to allow; None means all meta names are allowed.",
214
+ )
215
+ blocked_meta_names: set[str] = Field(
216
+ default_factory=set,
217
+ description="Meta name to block; takes precedence over allowed_meta_names.",
218
+ )
194
219
 
195
220
  def merge_with_patch(self, patch: dict[str, Any]) -> Self:
196
221
  """Create an instance by merging the provided patch dict on top of self."""
@@ -215,6 +240,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
215
240
  list_serializer: BaseListSerializer
216
241
  inline_serializer: BaseInlineSerializer
217
242
 
243
+ meta_serializer: Optional[BaseMetaSerializer] = None
218
244
  annotation_serializer: BaseAnnotationSerializer
219
245
 
220
246
  params: CommonParams = CommonParams()
@@ -245,7 +271,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
245
271
  if refs is None:
246
272
  refs = {
247
273
  item.self_ref
248
- for ix, item in enumerate(
274
+ for ix, (item, _) in enumerate(
249
275
  _iterate_items(
250
276
  doc=self.doc,
251
277
  traverse_pictures=True,
@@ -301,103 +327,130 @@ class DocSerializer(BaseModel, BaseDocSerializer):
301
327
  ) -> SerializationResult:
302
328
  """Serialize a given node."""
303
329
  my_visited: set[str] = visited if visited is not None else set()
330
+ parts: list[SerializationResult] = []
331
+ delim: str = kwargs.get("delim", "\n")
332
+ my_params = self.params.model_copy(update=kwargs)
304
333
  my_kwargs = {**self.params.model_dump(), **kwargs}
305
334
  empty_res = create_ser_result()
306
- if item is None or item == self.doc.body:
307
- if self.doc.body.self_ref not in my_visited:
308
- my_visited.add(self.doc.body.self_ref)
309
- return self._serialize_body(**my_kwargs)
335
+
336
+ my_item = item or self.doc.body
337
+
338
+ if my_item == self.doc.body:
339
+ if my_item.meta and not my_params.use_legacy_annotations:
340
+ meta_part = self.serialize_meta(item=my_item, **my_kwargs)
341
+ if meta_part.text:
342
+ parts.append(meta_part)
343
+
344
+ if my_item.self_ref not in my_visited:
345
+ my_visited.add(my_item.self_ref)
346
+ part = self._serialize_body(**my_kwargs)
347
+ if part.text:
348
+ parts.append(part)
349
+ return create_ser_result(
350
+ text=delim.join([p.text for p in parts if p.text]),
351
+ span_source=parts,
352
+ )
310
353
  else:
311
354
  return empty_res
312
355
 
313
- my_visited.add(item.self_ref)
314
-
315
- ########
316
- # groups
317
- ########
318
- if isinstance(item, ListGroup):
319
- part = self.list_serializer.serialize(
320
- item=item,
321
- doc_serializer=self,
322
- doc=self.doc,
323
- list_level=list_level,
324
- is_inline_scope=is_inline_scope,
325
- visited=my_visited,
326
- **my_kwargs,
327
- )
328
- elif isinstance(item, InlineGroup):
329
- part = self.inline_serializer.serialize(
330
- item=item,
331
- doc_serializer=self,
332
- doc=self.doc,
333
- list_level=list_level,
334
- visited=my_visited,
335
- **my_kwargs,
336
- )
337
- ###########
338
- # doc items
339
- ###########
340
- elif isinstance(item, TextItem):
341
- if item.self_ref in self._captions_of_some_item:
342
- # those captions will be handled by the floating item holding them
343
- return empty_res
344
- else:
345
- part = (
346
- self.text_serializer.serialize(
347
- item=item,
348
- doc_serializer=self,
349
- doc=self.doc,
350
- is_inline_scope=is_inline_scope,
351
- visited=my_visited,
352
- **my_kwargs,
356
+ my_visited.add(my_item.self_ref)
357
+
358
+ if my_item.meta and not my_params.use_legacy_annotations:
359
+ meta_part = self.serialize_meta(item=my_item, **my_kwargs)
360
+ if meta_part.text:
361
+ parts.append(meta_part)
362
+
363
+ if my_params.include_non_meta:
364
+ ########
365
+ # groups
366
+ ########
367
+ if isinstance(my_item, ListGroup):
368
+ part = self.list_serializer.serialize(
369
+ item=my_item,
370
+ doc_serializer=self,
371
+ doc=self.doc,
372
+ list_level=list_level,
373
+ is_inline_scope=is_inline_scope,
374
+ visited=my_visited,
375
+ **my_kwargs,
376
+ )
377
+ elif isinstance(my_item, InlineGroup):
378
+ part = self.inline_serializer.serialize(
379
+ item=my_item,
380
+ doc_serializer=self,
381
+ doc=self.doc,
382
+ list_level=list_level,
383
+ visited=my_visited,
384
+ **my_kwargs,
385
+ )
386
+ ###########
387
+ # doc items
388
+ ###########
389
+ elif isinstance(my_item, TextItem):
390
+ if my_item.self_ref in self._captions_of_some_item:
391
+ # those captions will be handled by the floating item holding them
392
+ return empty_res
393
+ else:
394
+ part = (
395
+ self.text_serializer.serialize(
396
+ item=my_item,
397
+ doc_serializer=self,
398
+ doc=self.doc,
399
+ is_inline_scope=is_inline_scope,
400
+ visited=my_visited,
401
+ **my_kwargs,
402
+ )
403
+ if my_item.self_ref not in self.get_excluded_refs(**kwargs)
404
+ else empty_res
353
405
  )
354
- if item.self_ref not in self.get_excluded_refs(**kwargs)
355
- else empty_res
406
+ elif isinstance(my_item, TableItem):
407
+ part = self.table_serializer.serialize(
408
+ item=my_item,
409
+ doc_serializer=self,
410
+ doc=self.doc,
411
+ visited=my_visited,
412
+ **my_kwargs,
356
413
  )
357
- elif isinstance(item, TableItem):
358
- part = self.table_serializer.serialize(
359
- item=item,
360
- doc_serializer=self,
361
- doc=self.doc,
362
- visited=my_visited,
363
- **my_kwargs,
364
- )
365
- elif isinstance(item, PictureItem):
366
- part = self.picture_serializer.serialize(
367
- item=item,
368
- doc_serializer=self,
369
- doc=self.doc,
370
- visited=my_visited,
371
- **my_kwargs,
372
- )
373
- elif isinstance(item, KeyValueItem):
374
- part = self.key_value_serializer.serialize(
375
- item=item,
376
- doc_serializer=self,
377
- doc=self.doc,
378
- **my_kwargs,
379
- )
380
- elif isinstance(item, FormItem):
381
- part = self.form_serializer.serialize(
382
- item=item,
383
- doc_serializer=self,
384
- doc=self.doc,
385
- **my_kwargs,
386
- )
387
- elif isinstance(item, _PageBreakNode):
388
- part = _PageBreakSerResult(
389
- text=self._create_page_break(node=item),
390
- node=item,
391
- )
392
- else:
393
- part = self.fallback_serializer.serialize(
394
- item=item,
395
- doc_serializer=self,
396
- doc=self.doc,
397
- visited=my_visited,
398
- **my_kwargs,
399
- )
400
- return part
414
+ elif isinstance(my_item, PictureItem):
415
+ part = self.picture_serializer.serialize(
416
+ item=my_item,
417
+ doc_serializer=self,
418
+ doc=self.doc,
419
+ visited=my_visited,
420
+ **my_kwargs,
421
+ )
422
+ elif isinstance(my_item, KeyValueItem):
423
+ part = self.key_value_serializer.serialize(
424
+ item=my_item,
425
+ doc_serializer=self,
426
+ doc=self.doc,
427
+ **my_kwargs,
428
+ )
429
+ elif isinstance(my_item, FormItem):
430
+ part = self.form_serializer.serialize(
431
+ item=my_item,
432
+ doc_serializer=self,
433
+ doc=self.doc,
434
+ **my_kwargs,
435
+ )
436
+ elif isinstance(my_item, _PageBreakNode):
437
+ part = _PageBreakSerResult(
438
+ text=self._create_page_break(node=my_item),
439
+ node=my_item,
440
+ )
441
+ else:
442
+ part = self.fallback_serializer.serialize(
443
+ item=my_item,
444
+ doc_serializer=self,
445
+ doc=self.doc,
446
+ visited=my_visited,
447
+ **my_kwargs,
448
+ )
449
+ parts.append(part)
450
+
451
+ return create_ser_result(
452
+ text=delim.join([p.text for p in parts if p.text]), span_source=parts
453
+ )
401
454
 
402
455
  # making some assumptions about the kwargs it can pass
403
456
  @override
@@ -416,7 +469,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
416
469
  my_visited: set[str] = visited if visited is not None else set()
417
470
  params = self.params.merge_with_patch(patch=kwargs)
418
471
 
419
- for node in _iterate_items(
472
+ for node, lvl in _iterate_items(
420
473
  node=item,
421
474
  doc=self.doc,
422
475
  layers=params.layers,
@@ -426,15 +479,17 @@ class DocSerializer(BaseModel, BaseDocSerializer):
426
479
  continue
427
480
  else:
428
481
  my_visited.add(node.self_ref)
482
+
429
483
  part = self.serialize(
430
484
  item=node,
431
485
  list_level=list_level,
432
486
  is_inline_scope=is_inline_scope,
433
487
  visited=my_visited,
434
- **kwargs,
488
+ **(dict(level=lvl) | kwargs),
435
489
  )
436
490
  if part.text:
437
491
  parts.append(part)
492
+
438
493
  return parts
439
494
 
440
495
  @override
@@ -528,6 +583,31 @@ class DocSerializer(BaseModel, BaseDocSerializer):
528
583
  text_res = ""
529
584
  return create_ser_result(text=text_res, span_source=results)
530
585
 
586
+ @override
587
+ def serialize_meta(
588
+ self,
589
+ item: NodeItem,
590
+ **kwargs: Any,
591
+ ) -> SerializationResult:
592
+ """Serialize the item's meta."""
593
+ if self.meta_serializer:
594
+ if item.self_ref not in self.get_excluded_refs(**kwargs):
595
+ return self.meta_serializer.serialize(
596
+ item=item,
597
+ doc=self.doc,
598
+ **(self.params.model_dump() | kwargs),
599
+ )
600
+ else:
601
+ return create_ser_result(
602
+ text="", span_source=item if isinstance(item, DocItem) else []
603
+ )
604
+ else:
605
+ _logger.warning("No meta serializer found.")
606
+ return create_ser_result(
607
+ text="", span_source=item if isinstance(item, DocItem) else []
608
+ )
609
+
610
+ # TODO deprecate
531
611
  @override
532
612
  def serialize_annotations(
533
613
  self,
@@ -44,6 +44,7 @@ from docling_core.types.doc.document import (
44
44
  PictureTabularChartData,
45
45
  ProvenanceItem,
46
46
  SectionHeaderItem,
47
+ TableData,
47
48
  TableItem,
48
49
  TextItem,
49
50
  )
@@ -233,13 +234,22 @@ class DocTagsPictureSerializer(BasePictureSerializer):
233
234
  ysize=params.ysize,
234
235
  )
235
236
 
236
- classifications = [
237
- ann
238
- for ann in item.annotations
239
- if isinstance(ann, PictureClassificationData)
240
- ]
241
- if len(classifications) > 0:
237
+ # handle classification data
238
+ predicted_class: Optional[str] = None
239
+ if item.meta and item.meta.classification:
240
+ predicted_class = (
241
+ item.meta.classification.get_main_prediction().class_name
242
+ )
243
+ elif (
244
+ classifications := [
245
+ ann
246
+ for ann in item.annotations
247
+ if isinstance(ann, PictureClassificationData)
248
+ ]
249
+ ) and classifications[0].predicted_classes:
242
250
  predicted_class = classifications[0].predicted_classes[0].class_name
251
+ if predicted_class:
252
+ body += DocumentToken.get_picture_classification_token(predicted_class)
243
253
  if predicted_class in [
244
254
  PictureClassificationLabel.PIE_CHART,
245
255
  PictureClassificationLabel.BAR_CHART,
@@ -250,26 +260,31 @@ class DocTagsPictureSerializer(BasePictureSerializer):
250
260
  PictureClassificationLabel.HEATMAP,
251
261
  ]:
252
262
  is_chart = True
253
- body += DocumentToken.get_picture_classification_token(predicted_class)
254
263
 
255
- smiles_annotations = [
264
+ # handle molecule data
265
+ smi: Optional[str] = None
266
+ if item.meta and item.meta.molecule:
267
+ smi = item.meta.molecule.smi
268
+ elif smiles_annotations := [
256
269
  ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
257
- ]
258
- if len(smiles_annotations) > 0:
259
- body += _wrap(
260
- text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value
261
- )
262
-
263
- tabular_chart_annotations = [
270
+ ]:
271
+ smi = smiles_annotations[0].smi
272
+ if smi:
273
+ body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value)
274
+
275
+ # handle tabular chart data
276
+ chart_data: Optional[TableData] = None
277
+ if item.meta and item.meta.tabular_chart:
278
+ chart_data = item.meta.tabular_chart.chart_data
279
+ elif tabular_chart_annotations := [
264
280
  ann
265
281
  for ann in item.annotations
266
282
  if isinstance(ann, PictureTabularChartData)
267
- ]
268
- if len(tabular_chart_annotations) > 0:
283
+ ]:
284
+ chart_data = tabular_chart_annotations[0].chart_data
285
+ if chart_data and chart_data.table_cells:
269
286
  temp_doc = DoclingDocument(name="temp")
270
- temp_table = temp_doc.add_table(
271
- data=tabular_chart_annotations[0].chart_data
272
- )
287
+ temp_table = temp_doc.add_table(data=chart_data)
273
288
  otsl_content = temp_table.export_to_otsl(
274
289
  temp_doc, add_cell_location=False
275
290
  )