docling-core 2.40.0__py3-none-any.whl → 2.42.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -26,8 +26,10 @@ from pydantic import (
26
26
  BaseModel,
27
27
  ConfigDict,
28
28
  Field,
29
+ FieldSerializationInfo,
29
30
  StringConstraints,
30
31
  computed_field,
32
+ field_serializer,
31
33
  field_validator,
32
34
  model_validator,
33
35
  validate_call,
@@ -38,7 +40,12 @@ from typing_extensions import Annotated, Self, deprecated
38
40
  from docling_core.search.package import VERSION_PATTERN
39
41
  from docling_core.types.base import _JSON_POINTER_REGEX
40
42
  from docling_core.types.doc import BoundingBox, Size
41
- from docling_core.types.doc.base import CoordOrigin, ImageRefMode
43
+ from docling_core.types.doc.base import (
44
+ CoordOrigin,
45
+ ImageRefMode,
46
+ PydanticSerCtxKey,
47
+ round_pydantic_float,
48
+ )
42
49
  from docling_core.types.doc.labels import (
43
50
  CodeLanguageLabel,
44
51
  DocItemLabel,
@@ -98,6 +105,10 @@ class PictureClassificationClass(BaseModel):
98
105
  class_name: str
99
106
  confidence: float
100
107
 
108
+ @field_serializer("confidence")
109
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
110
+ return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
111
+
101
112
 
102
113
  class PictureClassificationData(BaseAnnotation):
103
114
  """PictureClassificationData."""
@@ -125,6 +136,10 @@ class PictureMoleculeData(BaseAnnotation):
125
136
  segmentation: List[Tuple[float, float]]
126
137
  provenance: str
127
138
 
139
+ @field_serializer("confidence")
140
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
141
+ return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
142
+
128
143
 
129
144
  class MiscAnnotation(BaseAnnotation):
130
145
  """MiscAnnotation."""
@@ -366,6 +381,145 @@ class TableData(BaseModel): # TBD
366
381
 
367
382
  return table_data
368
383
 
384
+ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
385
+ """Remove rows from the table by their indices.
386
+
387
+ :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
388
+
389
+ :return: List[List[TableCell]]: A list representation of the removed rows as lists of TableCell objects.
390
+ """
391
+ if not indices:
392
+ return []
393
+
394
+ indices = sorted(indices, reverse=True)
395
+
396
+ all_removed_cells = []
397
+ for row_index in indices:
398
+ if row_index < 0 or row_index >= self.num_rows:
399
+ raise IndexError(
400
+ f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}."
401
+ )
402
+
403
+ start_idx = row_index * self.num_cols
404
+ end_idx = start_idx + self.num_cols
405
+ removed_cells = self.table_cells[start_idx:end_idx]
406
+
407
+ # Remove the cells from the table
408
+ self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
409
+
410
+ # Update the number of rows
411
+ self.num_rows -= 1
412
+
413
+ # Reassign row offset indices for existing cells
414
+ for index, cell in enumerate(self.table_cells):
415
+ new_index = index // self.num_cols
416
+ cell.start_row_offset_idx = new_index
417
+ cell.end_row_offset_idx = new_index + 1
418
+
419
+ all_removed_cells.append(removed_cells)
420
+
421
+ return all_removed_cells
422
+
423
+ def pop_row(self) -> List[TableCell]:
424
+ """Remove and return the last row from the table.
425
+
426
+ :returns: List[TableCell]: A list of TableCell objects representing the popped row.
427
+ """
428
+ if self.num_rows == 0:
429
+ raise IndexError("Cannot pop from an empty table.")
430
+
431
+ return self.remove_row(self.num_rows - 1)
432
+
433
+ def remove_row(self, row_index: int) -> List[TableCell]:
434
+ """Remove a row from the table by its index.
435
+
436
+ :param row_index: int: The index of the row to remove. (Starting from 0)
437
+
438
+ :returns: List[TableCell]: A list of TableCell objects representing the removed row.
439
+ """
440
+ return self.remove_rows([row_index])[0]
441
+
442
+ def insert_rows(
443
+ self, row_index: int, rows: List[List[str]], after: bool = False
444
+ ) -> None:
445
+ """Insert multiple new rows from a list of lists of strings before/after a specific index in the table.
446
+
447
+ :param row_index: int: The index at which to insert the new rows. (Starting from 0)
448
+ :param rows: List[List[str]]: A list of lists, where each inner list represents the content of a new row.
449
+ :param after: bool: If True, insert the rows after the specified index, otherwise before it. (Default is False)
450
+
451
+ :returns: None
452
+ """
453
+ effective_rows = rows[::-1]
454
+
455
+ for row in effective_rows:
456
+ self.insert_row(row_index, row, after)
457
+
458
+ def insert_row(self, row_index: int, row: List[str], after: bool = False) -> None:
459
+ """Insert a new row from a list of strings before/after a specific index in the table.
460
+
461
+ :param row_index: int: The index at which to insert the new row. (Starting from 0)
462
+ :param row: List[str]: A list of strings representing the content of the new row.
463
+ :param after: bool: If True, insert the row after the specified index, otherwise before it. (Default is False)
464
+
465
+ :returns: None
466
+ """
467
+ if len(row) != self.num_cols:
468
+ raise ValueError(
469
+ f"Row length {len(row)} does not match the number of columns {self.num_cols}."
470
+ )
471
+
472
+ effective_index = row_index + (1 if after else 0)
473
+
474
+ if effective_index < 0 or effective_index > self.num_rows:
475
+ raise IndexError(
476
+ f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}."
477
+ )
478
+
479
+ new_row_cells = [
480
+ TableCell(
481
+ text=text,
482
+ start_row_offset_idx=effective_index,
483
+ end_row_offset_idx=effective_index + 1,
484
+ start_col_offset_idx=j,
485
+ end_col_offset_idx=j + 1,
486
+ )
487
+ for j, text in enumerate(row)
488
+ ]
489
+
490
+ self.table_cells = (
491
+ self.table_cells[: effective_index * self.num_cols]
492
+ + new_row_cells
493
+ + self.table_cells[effective_index * self.num_cols :]
494
+ )
495
+
496
+ # Reassign row offset indices for existing cells
497
+ for index, cell in enumerate(self.table_cells):
498
+ new_index = index // self.num_cols
499
+ cell.start_row_offset_idx = new_index
500
+ cell.end_row_offset_idx = new_index + 1
501
+
502
+ self.num_rows += 1
503
+
504
+ def add_rows(self, rows: List[List[str]]) -> None:
505
+ """Add multiple new rows to the table from a list of lists of strings.
506
+
507
+ :param rows: List[List[str]]: A list of lists, where each inner list represents the content of a new row.
508
+
509
+ :returns: None
510
+ """
511
+ for row in rows:
512
+ self.add_row(row)
513
+
514
+ def add_row(self, row: List[str]) -> None:
515
+ """Add a new row to the table from a list of strings.
516
+
517
+ :param row: List[str]: A list of strings representing the content of the new row.
518
+
519
+ :returns: None
520
+ """
521
+ self.insert_row(row_index=self.num_rows - 1, row=row, after=True)
522
+
369
523
  def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
370
524
  """Get the minimal bounding box for each row in the table.
371
525
 
@@ -822,7 +976,7 @@ class NodeItem(BaseModel):
822
976
  after: bool = True,
823
977
  ) -> bool:
824
978
  """Add sibling node in tree."""
825
- if len(stack) == 1 and stack[0] < len(self.children) and (not after):
979
+ if len(stack) == 1 and stack[0] <= len(self.children) and (not after):
826
980
  # ensure the parent is correct
827
981
  new_item = new_ref.resolve(doc=doc)
828
982
  new_item.parent = self.get_ref()
@@ -1958,6 +2112,16 @@ class DoclingDocument(BaseModel):
1958
2112
  item.self_ref = cref
1959
2113
  item.parent = parent_ref
1960
2114
 
2115
+ self.groups.append(item)
2116
+ elif isinstance(item, GroupItem):
2117
+ item_label = "groups"
2118
+ item_index = len(self.groups)
2119
+
2120
+ cref = f"#/{item_label}/{item_index}"
2121
+
2122
+ item.self_ref = cref
2123
+ item.parent = parent_ref
2124
+
1961
2125
  self.groups.append(item)
1962
2126
 
1963
2127
  else:
@@ -1976,7 +2140,7 @@ class DoclingDocument(BaseModel):
1976
2140
  item_index = int(path[2])
1977
2141
 
1978
2142
  if (
1979
- len(self.__getattribute__(item_label)) + 1 == item_index
2143
+ len(self.__getattribute__(item_label)) == item_index + 1
1980
2144
  ): # we can only pop the last item
1981
2145
  del self.__getattribute__(item_label)[item_index]
1982
2146
  else:
@@ -2001,6 +2165,10 @@ class DoclingDocument(BaseModel):
2001
2165
  if not success:
2002
2166
  self._pop_item(item=item)
2003
2167
 
2168
+ raise ValueError(
2169
+ f"Could not insert item: {item} under parent: {parent_ref.resolve(doc=self)}"
2170
+ )
2171
+
2004
2172
  return item.get_ref()
2005
2173
 
2006
2174
  def _delete_items(self, refs: list[RefItem]):
@@ -2380,17 +2548,6 @@ class DoclingDocument(BaseModel):
2380
2548
  hyperlink=hyperlink,
2381
2549
  )
2382
2550
 
2383
- elif label in [DocItemLabel.TITLE]:
2384
- return self.add_title(
2385
- text=text,
2386
- orig=orig,
2387
- prov=prov,
2388
- parent=parent,
2389
- content_layer=content_layer,
2390
- formatting=formatting,
2391
- hyperlink=hyperlink,
2392
- )
2393
-
2394
2551
  elif label in [DocItemLabel.SECTION_HEADER]:
2395
2552
  return self.add_heading(
2396
2553
  text=text,
@@ -2790,177 +2947,1171 @@ class DoclingDocument(BaseModel):
2790
2947
 
2791
2948
  return form_item
2792
2949
 
2793
- def num_pages(self):
2794
- """num_pages."""
2795
- return len(self.pages.values())
2950
+ # ---------------------------
2951
+ # Node Item Insertion Methods
2952
+ # ---------------------------
2796
2953
 
2797
- def validate_tree(self, root) -> bool:
2798
- """validate_tree."""
2799
- res = []
2800
- for child_ref in root.children:
2801
- child = child_ref.resolve(self)
2802
- if child.parent.resolve(self) != root:
2803
- return False
2804
- res.append(self.validate_tree(child))
2954
+ def _get_insertion_stack_and_parent(
2955
+ self, sibling: NodeItem
2956
+ ) -> tuple[list[int], RefItem]:
2957
+ """Get the stack and parent reference for inserting a new item at a sibling."""
2958
+ # Get the stack of the sibling
2959
+ sibling_ref = sibling.get_ref()
2805
2960
 
2806
- return all(res) or len(res) == 0
2961
+ success, stack = self._get_stack_of_refitem(ref=sibling_ref)
2807
2962
 
2808
- def iterate_items(
2809
- self,
2810
- root: Optional[NodeItem] = None,
2811
- with_groups: bool = False,
2812
- traverse_pictures: bool = False,
2813
- page_no: Optional[int] = None,
2814
- included_content_layers: Optional[set[ContentLayer]] = None,
2815
- _level: int = 0, # fixed parameter, carries through the node nesting level
2816
- ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
2817
- """Iterate elements with level."""
2818
- for item, stack in self._iterate_items_with_stack(
2819
- root=root,
2820
- with_groups=with_groups,
2821
- traverse_pictures=traverse_pictures,
2822
- page_no=page_no,
2823
- included_content_layers=included_content_layers,
2824
- ):
2825
- yield item, len(stack)
2963
+ if not success:
2964
+ raise ValueError(
2965
+ f"Could not insert at {sibling_ref.cref}: could not find the stack"
2966
+ )
2826
2967
 
2827
- def _iterate_items_with_stack(
2828
- self,
2829
- root: Optional[NodeItem] = None,
2830
- with_groups: bool = False,
2831
- traverse_pictures: bool = False,
2832
- page_no: Optional[int] = None,
2833
- included_content_layers: Optional[set[ContentLayer]] = None,
2834
- _stack: Optional[list[int]] = None,
2835
- ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
2836
- """Iterate elements with stack."""
2837
- my_layers = (
2838
- included_content_layers
2839
- if included_content_layers is not None
2840
- else DEFAULT_CONTENT_LAYERS
2841
- )
2842
- my_stack: list[int] = _stack if _stack is not None else []
2968
+ # Get the parent RefItem
2969
+ parent_ref = self.body._get_parent_ref(doc=self, stack=stack)
2843
2970
 
2844
- if not root:
2845
- root = self.body
2971
+ if parent_ref is None:
2972
+ raise ValueError(f"Could not find a parent at stack: {stack}")
2846
2973
 
2847
- # Yield non-group items or group items when with_groups=True
2974
+ return stack, parent_ref
2848
2975
 
2849
- # Combine conditions to have a single yield point
2850
- should_yield = (
2851
- (not isinstance(root, GroupItem) or with_groups)
2852
- and (
2853
- not isinstance(root, DocItem)
2854
- or (
2855
- page_no is None
2856
- or any(prov.page_no == page_no for prov in root.prov)
2857
- )
2858
- )
2859
- and root.content_layer in my_layers
2860
- )
2976
+ def _insert_in_structure(
2977
+ self,
2978
+ item: NodeItem,
2979
+ stack: list[int],
2980
+ after: bool,
2981
+ created_parent: Optional[bool] = False,
2982
+ ) -> None:
2983
+ """Insert item into the document structure at the specified stack and handle errors."""
2984
+ # Ensure the item has a parent reference
2985
+ if item.parent is None:
2986
+ item.parent = self.body.get_ref()
2861
2987
 
2862
- if should_yield:
2863
- yield root, my_stack
2988
+ self._append_item(item=item, parent_ref=item.parent)
2864
2989
 
2865
- my_stack.append(-1)
2990
+ new_ref = item.get_ref()
2866
2991
 
2867
- allowed_pic_refs: set[str] = (
2868
- {r.cref for r in root.captions}
2869
- if (root_is_picture := isinstance(root, PictureItem))
2870
- else set()
2992
+ success = self.body._add_sibling(
2993
+ doc=self, stack=stack, new_ref=new_ref, after=after
2871
2994
  )
2872
2995
 
2873
- # Traverse children
2874
- for child_ind, child_ref in enumerate(root.children):
2875
- child = child_ref.resolve(self)
2876
- if (
2877
- root_is_picture
2878
- and not traverse_pictures
2879
- and isinstance(child, NodeItem)
2880
- and child.self_ref not in allowed_pic_refs
2881
- ):
2882
- continue
2883
- my_stack[-1] = child_ind
2884
-
2885
- if isinstance(child, NodeItem):
2886
- yield from self._iterate_items_with_stack(
2887
- child,
2888
- with_groups=with_groups,
2889
- traverse_pictures=traverse_pictures,
2890
- page_no=page_no,
2891
- _stack=my_stack,
2892
- included_content_layers=my_layers,
2893
- )
2894
-
2895
- my_stack.pop()
2896
-
2897
- def _clear_picture_pil_cache(self):
2898
- """Clear cache storage of all images."""
2899
- for item, level in self.iterate_items(with_groups=False):
2900
- if isinstance(item, PictureItem):
2901
- if item.image is not None and item.image._pil is not None:
2902
- item.image._pil.close()
2996
+ # Error handling can be determined here
2997
+ if not success:
2998
+ self._pop_item(item=item)
2903
2999
 
2904
- def _list_images_on_disk(self) -> List[Path]:
2905
- """List all images on disk."""
2906
- result: List[Path] = []
3000
+ if created_parent:
3001
+ self.delete_items(node_items=[item.parent.resolve(self)])
2907
3002
 
2908
- for item, level in self.iterate_items(with_groups=False):
2909
- if isinstance(item, PictureItem):
2910
- if item.image is not None:
2911
- if (
2912
- isinstance(item.image.uri, AnyUrl)
2913
- and item.image.uri.scheme == "file"
2914
- and item.image.uri.path is not None
2915
- ):
2916
- local_path = Path(unquote(item.image.uri.path))
2917
- result.append(local_path)
2918
- elif isinstance(item.image.uri, Path):
2919
- result.append(item.image.uri)
3003
+ raise ValueError(
3004
+ f"Could not insert item: {item} under parent: {item.parent.resolve(doc=self)}"
3005
+ )
2920
3006
 
2921
- return result
3007
+ def insert_list_group(
3008
+ self,
3009
+ sibling: NodeItem,
3010
+ name: Optional[str] = None,
3011
+ content_layer: Optional[ContentLayer] = None,
3012
+ after: bool = True,
3013
+ ) -> ListGroup:
3014
+ """Creates a new ListGroup item and inserts it into the document.
2922
3015
 
2923
- def _with_embedded_pictures(self) -> "DoclingDocument":
2924
- """Document with embedded images.
3016
+ :param sibling: NodeItem:
3017
+ :param name: Optional[str]: (Default value = None)
3018
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3019
+ :param after: bool: (Default value = True)
2925
3020
 
2926
- Creates a copy of this document where all pictures referenced
2927
- through a file URI are turned into base64 embedded form.
3021
+ :returns: ListGroup: The newly created ListGroup item.
2928
3022
  """
2929
- result: DoclingDocument = copy.deepcopy(self)
3023
+ # Get stack and parent reference of the sibling
3024
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
2930
3025
 
2931
- for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)):
2932
- if isinstance(item, PictureItem):
3026
+ group = ListGroup(self_ref="#", parent=parent_ref)
2933
3027
 
2934
- if item.image is not None:
2935
- if (
2936
- isinstance(item.image.uri, AnyUrl)
2937
- and item.image.uri.scheme == "file"
2938
- ):
2939
- assert isinstance(item.image.uri.path, str)
2940
- tmp_image = PILImage.open(str(unquote(item.image.uri.path)))
2941
- item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
3028
+ if name is not None:
3029
+ group.name = name
3030
+ if content_layer:
3031
+ group.content_layer = content_layer
2942
3032
 
2943
- elif isinstance(item.image.uri, Path):
2944
- tmp_image = PILImage.open(str(item.image.uri))
2945
- item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
3033
+ self._insert_in_structure(item=group, stack=stack, after=after)
2946
3034
 
2947
- return result
3035
+ return group
2948
3036
 
2949
- def _with_pictures_refs(
2950
- self, image_dir: Path, reference_path: Optional[Path] = None
2951
- ) -> "DoclingDocument":
2952
- """Document with images as refs.
3037
+ def insert_inline_group(
3038
+ self,
3039
+ sibling: NodeItem,
3040
+ name: Optional[str] = None,
3041
+ content_layer: Optional[ContentLayer] = None,
3042
+ after: bool = True,
3043
+ ) -> InlineGroup:
3044
+ """Creates a new InlineGroup item and inserts it into the document.
2953
3045
 
2954
- Creates a copy of this document where all picture data is
2955
- saved to image_dir and referenced through file URIs.
3046
+ :param sibling: NodeItem:
3047
+ :param name: Optional[str]: (Default value = None)
3048
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3049
+ :param after: bool: (Default value = True)
3050
+
3051
+ :returns: InlineGroup: The newly created InlineGroup item.
2956
3052
  """
2957
- result: DoclingDocument = copy.deepcopy(self)
3053
+ # Get stack and parent reference of the sibling
3054
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
2958
3055
 
2959
- img_count = 0
2960
- image_dir.mkdir(parents=True, exist_ok=True)
3056
+ # Create a new InlineGroup NodeItem
3057
+ group = InlineGroup(self_ref="#", parent=parent_ref)
2961
3058
 
2962
- if image_dir.is_dir():
2963
- for item, level in result.iterate_items(with_groups=False):
3059
+ if name is not None:
3060
+ group.name = name
3061
+ if content_layer:
3062
+ group.content_layer = content_layer
3063
+
3064
+ self._insert_in_structure(item=group, stack=stack, after=after)
3065
+
3066
+ return group
3067
+
3068
+ def insert_group(
3069
+ self,
3070
+ sibling: NodeItem,
3071
+ label: Optional[GroupLabel] = None,
3072
+ name: Optional[str] = None,
3073
+ content_layer: Optional[ContentLayer] = None,
3074
+ after: bool = True,
3075
+ ) -> GroupItem:
3076
+ """Creates a new GroupItem item and inserts it into the document.
3077
+
3078
+ :param sibling: NodeItem:
3079
+ :param label: Optional[GroupLabel]: (Default value = None)
3080
+ :param name: Optional[str]: (Default value = None)
3081
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3082
+ :param after: bool: (Default value = True)
3083
+
3084
+ :returns: GroupItem: The newly created GroupItem.
3085
+ """
3086
+ if label in [GroupLabel.LIST, GroupLabel.ORDERED_LIST]:
3087
+ return self.insert_list_group(
3088
+ sibling=sibling,
3089
+ name=name,
3090
+ content_layer=content_layer,
3091
+ after=after,
3092
+ )
3093
+ elif label == GroupLabel.INLINE:
3094
+ return self.insert_inline_group(
3095
+ sibling=sibling,
3096
+ name=name,
3097
+ content_layer=content_layer,
3098
+ after=after,
3099
+ )
3100
+
3101
+ # Get stack and parent reference of the sibling
3102
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3103
+
3104
+ # Create a new GroupItem NodeItem
3105
+ group = GroupItem(self_ref="#", parent=parent_ref)
3106
+
3107
+ if name is not None:
3108
+ group.name = name
3109
+ if label is not None:
3110
+ group.label = label
3111
+ if content_layer:
3112
+ group.content_layer = content_layer
3113
+
3114
+ self._insert_in_structure(item=group, stack=stack, after=after)
3115
+
3116
+ return group
3117
+
3118
+ def insert_list_item(
3119
+ self,
3120
+ sibling: NodeItem,
3121
+ text: str,
3122
+ enumerated: bool = False,
3123
+ marker: Optional[str] = None,
3124
+ orig: Optional[str] = None,
3125
+ prov: Optional[ProvenanceItem] = None,
3126
+ content_layer: Optional[ContentLayer] = None,
3127
+ formatting: Optional[Formatting] = None,
3128
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3129
+ after: bool = True,
3130
+ ) -> ListItem:
3131
+ """Creates a new ListItem item and inserts it into the document.
3132
+
3133
+ :param sibling: NodeItem:
3134
+ :param text: str:
3135
+ :param enumerated: bool: (Default value = False)
3136
+ :param marker: Optional[str]: (Default value = None)
3137
+ :param orig: Optional[str]: (Default value = None)
3138
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3139
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3140
+ :param formatting: Optional[Formatting]: (Default value = None)
3141
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3142
+ :param after: bool: (Default value = True)
3143
+
3144
+ :returns: ListItem: The newly created ListItem item.
3145
+ """
3146
+ # Get stack and parent reference of the sibling
3147
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3148
+
3149
+ # Ensure the parent is a ListGroup
3150
+
3151
+ parent = parent_ref.resolve(self)
3152
+ set_parent = False
3153
+
3154
+ if not isinstance(parent, ListGroup):
3155
+ warnings.warn(
3156
+ "ListItem parent must be a ListGroup, creating one on the fly.",
3157
+ DeprecationWarning,
3158
+ )
3159
+ parent = self.insert_list_group(sibling=sibling, after=after)
3160
+ parent_ref = parent.get_ref()
3161
+ if after:
3162
+ stack[-1] += 1
3163
+ stack.append(0)
3164
+ after = False
3165
+ set_parent = True
3166
+
3167
+ # Create a new ListItem NodeItem
3168
+ if not orig:
3169
+ orig = text
3170
+
3171
+ list_item = ListItem(
3172
+ text=text,
3173
+ orig=orig,
3174
+ self_ref="#",
3175
+ parent=parent_ref,
3176
+ enumerated=enumerated,
3177
+ marker=marker or "",
3178
+ formatting=formatting,
3179
+ hyperlink=hyperlink,
3180
+ )
3181
+
3182
+ if prov:
3183
+ list_item.prov.append(prov)
3184
+ if content_layer:
3185
+ list_item.content_layer = content_layer
3186
+
3187
+ self._insert_in_structure(
3188
+ item=list_item, stack=stack, after=after, created_parent=set_parent
3189
+ )
3190
+
3191
+ return list_item
3192
+
3193
+ def insert_text(
3194
+ self,
3195
+ sibling: NodeItem,
3196
+ label: DocItemLabel,
3197
+ text: str,
3198
+ orig: Optional[str] = None,
3199
+ prov: Optional[ProvenanceItem] = None,
3200
+ content_layer: Optional[ContentLayer] = None,
3201
+ formatting: Optional[Formatting] = None,
3202
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3203
+ after: bool = True,
3204
+ ) -> TextItem:
3205
+ """Creates a new TextItem item and inserts it into the document.
3206
+
3207
+ :param sibling: NodeItem:
3208
+ :param label: DocItemLabel:
3209
+ :param text: str:
3210
+ :param orig: Optional[str]: (Default value = None)
3211
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3212
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3213
+ :param formatting: Optional[Formatting]: (Default value = None)
3214
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3215
+ :param after: bool: (Default value = True)
3216
+
3217
+ :returns: TextItem: The newly created TextItem item.
3218
+ """
3219
+ if label in [DocItemLabel.TITLE]:
3220
+ return self.insert_title(
3221
+ sibling=sibling,
3222
+ text=text,
3223
+ orig=orig,
3224
+ prov=prov,
3225
+ content_layer=content_layer,
3226
+ formatting=formatting,
3227
+ hyperlink=hyperlink,
3228
+ after=after,
3229
+ )
3230
+
3231
+ elif label in [DocItemLabel.LIST_ITEM]:
3232
+ return self.insert_list_item(
3233
+ sibling=sibling,
3234
+ text=text,
3235
+ orig=orig,
3236
+ prov=prov,
3237
+ content_layer=content_layer,
3238
+ formatting=formatting,
3239
+ hyperlink=hyperlink,
3240
+ after=after,
3241
+ )
3242
+
3243
+ elif label in [DocItemLabel.SECTION_HEADER]:
3244
+ return self.insert_heading(
3245
+ sibling=sibling,
3246
+ text=text,
3247
+ orig=orig,
3248
+ prov=prov,
3249
+ content_layer=content_layer,
3250
+ formatting=formatting,
3251
+ hyperlink=hyperlink,
3252
+ after=after,
3253
+ )
3254
+
3255
+ elif label in [DocItemLabel.CODE]:
3256
+ return self.insert_code(
3257
+ sibling=sibling,
3258
+ text=text,
3259
+ orig=orig,
3260
+ prov=prov,
3261
+ content_layer=content_layer,
3262
+ formatting=formatting,
3263
+ hyperlink=hyperlink,
3264
+ after=after,
3265
+ )
3266
+
3267
+ elif label in [DocItemLabel.FORMULA]:
3268
+ return self.insert_formula(
3269
+ sibling=sibling,
3270
+ text=text,
3271
+ orig=orig,
3272
+ prov=prov,
3273
+ content_layer=content_layer,
3274
+ formatting=formatting,
3275
+ hyperlink=hyperlink,
3276
+ after=after,
3277
+ )
3278
+
3279
+ else:
3280
+ # Get stack and parent reference of the sibling
3281
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3282
+
3283
+ # Create a new TextItem NodeItem
3284
+ if not orig:
3285
+ orig = text
3286
+
3287
+ text_item = TextItem(
3288
+ label=label,
3289
+ text=text,
3290
+ orig=orig,
3291
+ self_ref="#",
3292
+ parent=parent_ref,
3293
+ formatting=formatting,
3294
+ hyperlink=hyperlink,
3295
+ )
3296
+
3297
+ if prov:
3298
+ text_item.prov.append(prov)
3299
+ if content_layer:
3300
+ text_item.content_layer = content_layer
3301
+
3302
+ self._insert_in_structure(item=text_item, stack=stack, after=after)
3303
+
3304
+ return text_item
3305
+
3306
+ def insert_table(
3307
+ self,
3308
+ sibling: NodeItem,
3309
+ data: TableData,
3310
+ caption: Optional[Union[TextItem, RefItem]] = None,
3311
+ prov: Optional[ProvenanceItem] = None,
3312
+ label: DocItemLabel = DocItemLabel.TABLE,
3313
+ content_layer: Optional[ContentLayer] = None,
3314
+ annotations: Optional[list[TableAnnotationType]] = None,
3315
+ after: bool = True,
3316
+ ) -> TableItem:
3317
+ """Creates a new TableItem item and inserts it into the document.
3318
+
3319
+ :param sibling: NodeItem:
3320
+ :param data: TableData:
3321
+ :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
3322
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3323
+ :param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
3324
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3325
+ :param annotations: Optional[List[TableAnnotationType]]: (Default value = None)
3326
+ :param after: bool: (Default value = True)
3327
+
3328
+ :returns: TableItem: The newly created TableItem item.
3329
+ """
3330
+ # Get stack and parent reference of the sibling
3331
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3332
+
3333
+ # Create a new ListItem NodeItem
3334
+ table_item = TableItem(
3335
+ label=label,
3336
+ data=data,
3337
+ self_ref="#",
3338
+ parent=parent_ref,
3339
+ annotations=annotations or [],
3340
+ )
3341
+
3342
+ if prov:
3343
+ table_item.prov.append(prov)
3344
+ if content_layer:
3345
+ table_item.content_layer = content_layer
3346
+ if caption:
3347
+ table_item.captions.append(caption.get_ref())
3348
+
3349
+ self._insert_in_structure(item=table_item, stack=stack, after=after)
3350
+
3351
+ return table_item
3352
+
3353
+ def insert_picture(
3354
+ self,
3355
+ sibling: NodeItem,
3356
+ annotations: Optional[List[PictureDataType]] = None,
3357
+ image: Optional[ImageRef] = None,
3358
+ caption: Optional[Union[TextItem, RefItem]] = None,
3359
+ prov: Optional[ProvenanceItem] = None,
3360
+ content_layer: Optional[ContentLayer] = None,
3361
+ after: bool = True,
3362
+ ) -> PictureItem:
3363
+ """Creates a new PictureItem item and inserts it into the document.
3364
+
3365
+ :param sibling: NodeItem:
3366
+ :param annotations: Optional[List[PictureDataType]]: (Default value = None)
3367
+ :param image: Optional[ImageRef]: (Default value = None)
3368
+ :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
3369
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3370
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3371
+ :param after: bool: (Default value = True)
3372
+
3373
+ :returns: PictureItem: The newly created PictureItem item.
3374
+ """
3375
+ # Get stack and parent reference of the sibling
3376
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3377
+
3378
+ # Create a new PictureItem NodeItem
3379
+ picture_item = PictureItem(
3380
+ label=DocItemLabel.PICTURE,
3381
+ annotations=annotations or [],
3382
+ image=image,
3383
+ self_ref="#",
3384
+ parent=parent_ref,
3385
+ )
3386
+
3387
+ if prov:
3388
+ picture_item.prov.append(prov)
3389
+ if content_layer:
3390
+ picture_item.content_layer = content_layer
3391
+ if caption:
3392
+ picture_item.captions.append(caption.get_ref())
3393
+
3394
+ self._insert_in_structure(item=picture_item, stack=stack, after=after)
3395
+
3396
+ return picture_item
3397
+
3398
+ def insert_title(
3399
+ self,
3400
+ sibling: NodeItem,
3401
+ text: str,
3402
+ orig: Optional[str] = None,
3403
+ prov: Optional[ProvenanceItem] = None,
3404
+ content_layer: Optional[ContentLayer] = None,
3405
+ formatting: Optional[Formatting] = None,
3406
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3407
+ after: bool = True,
3408
+ ) -> TitleItem:
3409
+ """Creates a new TitleItem item and inserts it into the document.
3410
+
3411
+ :param sibling: NodeItem:
3412
+ :param text: str:
3413
+ :param orig: Optional[str]: (Default value = None)
3414
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3415
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3416
+ :param formatting: Optional[Formatting]: (Default value = None)
3417
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3418
+ :param after: bool: (Default value = True)
3419
+
3420
+ :returns: TitleItem: The newly created TitleItem item.
3421
+ """
3422
+ # Get stack and parent reference of the sibling
3423
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3424
+
3425
+ # Create a new TitleItem NodeItem
3426
+ if not orig:
3427
+ orig = text
3428
+
3429
+ title_item = TitleItem(
3430
+ text=text,
3431
+ orig=orig,
3432
+ self_ref="#",
3433
+ parent=parent_ref,
3434
+ formatting=formatting,
3435
+ hyperlink=hyperlink,
3436
+ )
3437
+
3438
+ if prov:
3439
+ title_item.prov.append(prov)
3440
+ if content_layer:
3441
+ title_item.content_layer = content_layer
3442
+
3443
+ self._insert_in_structure(item=title_item, stack=stack, after=after)
3444
+
3445
+ return title_item
3446
+
3447
def insert_code(
    self,
    sibling: NodeItem,
    text: str,
    code_language: Optional[CodeLanguageLabel] = None,
    orig: Optional[str] = None,
    caption: Optional[Union[TextItem, RefItem]] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> CodeItem:
    """Build a CodeItem and place it next to ``sibling`` in the document.

    :param sibling: NodeItem: Existing node used as the insertion anchor
    :param text: str: The code text
    :param code_language: Optional[CodeLanguageLabel]: Set on the item only
        when given (Default value = None)
    :param orig: Optional[str]: Original text; ``text`` is used when this is
        empty or None (Default value = None)
    :param caption: Optional[Union[TextItem, RefItem]]: Its reference is
        appended to the item's captions when given (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance appended to the new item
        when given (Default value = None)
    :param content_layer: Optional[ContentLayer]: Overrides the item's content
        layer when given (Default value = None)
    :param formatting: Optional[Formatting]: (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
    :param after: bool: Forwarded to the structure insertion together with the
        sibling's stack (Default value = True)

    :returns: CodeItem: The newly created CodeItem item.
    """
    # The sibling determines both the tree position and the parent of the new item.
    stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)

    new_code = CodeItem(
        text=text,
        orig=orig or text,
        # Placeholder ref; presumably replaced by _insert_in_structure.
        self_ref="#",
        parent=parent_ref,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    # Optional attributes are only touched when a truthy value was supplied.
    if code_language:
        new_code.code_language = code_language
    if content_layer:
        new_code.content_layer = content_layer
    if prov:
        new_code.prov.append(prov)
    if caption:
        new_code.captions.append(caption.get_ref())

    self._insert_in_structure(item=new_code, stack=stack, after=after)
    return new_code
3503
+
3504
def insert_formula(
    self,
    sibling: NodeItem,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> FormulaItem:
    """Build a FormulaItem and place it next to ``sibling`` in the document.

    :param sibling: NodeItem: Existing node used as the insertion anchor
    :param text: str: Text of the formula
    :param orig: Optional[str]: Original text; ``text`` is used when this is
        empty or None (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance appended to the new item
        when given (Default value = None)
    :param content_layer: Optional[ContentLayer]: Overrides the item's content
        layer when given (Default value = None)
    :param formatting: Optional[Formatting]: (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
    :param after: bool: Forwarded to the structure insertion together with the
        sibling's stack (Default value = True)

    :returns: FormulaItem: The newly created FormulaItem item.
    """
    # The sibling determines both the tree position and the parent of the new item.
    stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)

    new_formula = FormulaItem(
        text=text,
        orig=orig or text,
        # Placeholder ref; presumably replaced by _insert_in_structure.
        self_ref="#",
        parent=parent_ref,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    if prov:
        new_formula.prov.append(prov)
    if content_layer:
        new_formula.content_layer = content_layer

    self._insert_in_structure(item=new_formula, stack=stack, after=after)
    return new_formula
3552
+
3553
def insert_heading(
    self,
    sibling: NodeItem,
    text: str,
    orig: Optional[str] = None,
    level: LevelNumber = 1,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> SectionHeaderItem:
    """Build a SectionHeaderItem and place it next to ``sibling`` in the document.

    :param sibling: NodeItem: Existing node used as the insertion anchor
    :param text: str: Text of the heading
    :param orig: Optional[str]: Original text; ``text`` is used when this is
        empty or None (Default value = None)
    :param level: LevelNumber: Heading level passed to the new item
        (Default value = 1)
    :param prov: Optional[ProvenanceItem]: Provenance appended to the new item
        when given (Default value = None)
    :param content_layer: Optional[ContentLayer]: Overrides the item's content
        layer when given (Default value = None)
    :param formatting: Optional[Formatting]: (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
    :param after: bool: Forwarded to the structure insertion together with the
        sibling's stack (Default value = True)

    :returns: SectionHeaderItem: The newly created SectionHeaderItem item.
    """
    # The sibling determines both the tree position and the parent of the new item.
    stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)

    new_heading = SectionHeaderItem(
        level=level,
        text=text,
        orig=orig or text,
        # Placeholder ref; presumably replaced by _insert_in_structure.
        self_ref="#",
        parent=parent_ref,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    if prov:
        new_heading.prov.append(prov)
    if content_layer:
        new_heading.content_layer = content_layer

    self._insert_in_structure(item=new_heading, stack=stack, after=after)
    return new_heading
3604
+
3605
def insert_key_values(
    self,
    sibling: NodeItem,
    graph: GraphData,
    prov: Optional[ProvenanceItem] = None,
    after: bool = True,
) -> KeyValueItem:
    """Build a KeyValueItem and place it next to ``sibling`` in the document.

    :param sibling: NodeItem: Existing node used as the insertion anchor
    :param graph: GraphData: Graph stored on the new item
    :param prov: Optional[ProvenanceItem]: Provenance appended to the new item
        when given (Default value = None)
    :param after: bool: Forwarded to the structure insertion together with the
        sibling's stack (Default value = True)

    :returns: KeyValueItem: The newly created KeyValueItem item.
    """
    # The sibling determines both the tree position and the parent of the new item.
    stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)

    # "#" is a placeholder ref; presumably replaced by _insert_in_structure.
    new_item = KeyValueItem(graph=graph, self_ref="#", parent=parent_ref)
    if prov:
        new_item.prov.append(prov)

    self._insert_in_structure(item=new_item, stack=stack, after=after)
    return new_item
3633
+
3634
def insert_form(
    self,
    sibling: NodeItem,
    graph: GraphData,
    prov: Optional[ProvenanceItem] = None,
    after: bool = True,
) -> FormItem:
    """Build a FormItem and place it next to ``sibling`` in the document.

    :param sibling: NodeItem: Existing node used as the insertion anchor
    :param graph: GraphData: Graph stored on the new item
    :param prov: Optional[ProvenanceItem]: Provenance appended to the new item
        when given (Default value = None)
    :param after: bool: Forwarded to the structure insertion together with the
        sibling's stack (Default value = True)

    :returns: FormItem: The newly created FormItem item.
    """
    # The sibling determines both the tree position and the parent of the new item.
    stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)

    # "#" is a placeholder ref; presumably replaced by _insert_in_structure.
    new_item = FormItem(graph=graph, self_ref="#", parent=parent_ref)
    if prov:
        new_item.prov.append(prov)

    self._insert_in_structure(item=new_item, stack=stack, after=after)
    return new_item
3662
+
3663
+ # ---------------------------
3664
+ # Range Manipulation Methods
3665
+ # ---------------------------
3666
+
3667
def delete_items_range(
    self,
    *,
    start: NodeItem,
    end: NodeItem,
    start_inclusive: bool = True,
    end_inclusive: bool = True,
) -> None:
    """Delete the sibling NodeItems (with their children) lying between start and end.

    :param start: NodeItem: First node of the range
    :param end: NodeItem: Last node of the range; must share its parent with start
    :param start_inclusive: bool: (Default value = True): If True, the start NodeItem will also be deleted
    :param end_inclusive: bool: (Default value = True): If True, the end NodeItem will also be deleted

    :raises ValueError: If the two nodes have different parents, or if start
        is positioned after end among the parent's children

    :returns: None
    """
    if start.parent != end.parent:
        raise ValueError(
            "Start and end NodeItems must have the same parent to delete a range."
        )

    # Both nodes share one parent; nodes without a parent are looked up under the body.
    parent_ref = start.parent if start.parent is not None else self.body.get_ref()
    siblings = parent_ref.resolve(doc=self).children

    first = siblings.index(start.get_ref())
    last = siblings.index(end.get_ref())

    if first > last:
        raise ValueError(
            "Start NodeItem must come before or be the same as the end NodeItem in the document structure."
        )

    # Shrink the closed interval [first, last] on whichever side is exclusive.
    lower = first if start_inclusive else first + 1
    upper = last + 1 if end_inclusive else last

    self._delete_items(refs=siblings[lower:upper])
3716
+
3717
def extract_items_range(
    self,
    *,
    start: NodeItem,
    end: NodeItem,
    start_inclusive: bool = True,
    end_inclusive: bool = True,
    delete: bool = False,
) -> "DoclingDocument":
    """Extracts NodeItems and children in the range from the start NodeItem to the end as a new DoclingDocument.

    The extracted items are copied under the body of the new document.

    :param start: NodeItem: The starting NodeItem of the range
    :param end: NodeItem: The ending NodeItem of the range (must have the same parent as start)
    :param start_inclusive: bool: (Default value = True): If True, the start NodeItem will also be extracted
    :param end_inclusive: bool: (Default value = True): If True, the end NodeItem will also be extracted
    :param delete: bool: (Default value = False): If True, extracted items are deleted in the original document

    :raises ValueError: If start and end have different parents, or if start
        is positioned after end among the parent's children

    :returns: DoclingDocument: A new document containing the extracted NodeItems and their children
    """
    if start.parent != end.parent:
        raise ValueError(
            "Start and end NodeItems must have the same parent to extract a range."
        )

    # Both nodes share one parent; nodes without a parent are looked up under the body.
    parent_ref = start.parent if start.parent is not None else self.body.get_ref()
    siblings = parent_ref.resolve(doc=self).children

    start_pos = siblings.index(start.get_ref())
    end_pos = siblings.index(end.get_ref())

    # Validate the order on the raw positions, before applying inclusivity.
    # Checking the adjusted slice bounds instead would accept a reversed
    # adjacent pair (empty slice) and reject start == end with both ends
    # exclusive, which is a valid, empty range.
    if start_pos > end_pos:
        raise ValueError(
            "Start NodeItem must come before or be the same as the end NodeItem in the document structure."
        )

    lower = start_pos if start_inclusive else start_pos + 1
    upper = end_pos + 1 if end_inclusive else end_pos

    new_doc = DoclingDocument(name=f"{self.name}- Extracted Range")

    # An empty slice (lower >= upper) simply produces an empty document.
    node_items = [ref.resolve(self) for ref in siblings[lower:upper]]
    new_doc.add_node_items(node_items=node_items, doc=self)

    if delete:
        self.delete_items_range(
            start=start,
            end=end,
            start_inclusive=start_inclusive,
            end_inclusive=end_inclusive,
        )

    return new_doc
3778
+
3779
def insert_document(
    self,
    doc: "DoclingDocument",
    sibling: NodeItem,
    after: bool = True,
) -> None:
    """Insert the top-level body items of another document next to ``sibling``.

    :param doc: DoclingDocument: The document whose body children are inserted
    :param sibling: NodeItem: The NodeItem after/before which the new items will be inserted
    :param after: bool: If True, insert after the sibling; if False, insert before (Default value = True)

    :returns: None
    """
    # Resolve the body's child references against the source document.
    top_level_items = [child_ref.resolve(doc) for child_ref in doc.body.children]

    self.insert_node_items(
        sibling=sibling,
        node_items=top_level_items,
        doc=doc,
        after=after,
    )
3798
+
3799
def add_document(
    self,
    doc: "DoclingDocument",
    parent: Optional[NodeItem] = None,
) -> None:
    """Add the top-level body items of another document under ``parent``.

    :param doc: DoclingDocument: The document whose body children are added
    :param parent: Optional[NodeItem]: The parent NodeItem under which new items are added (Default value = None)

    :returns: None
    """
    # Resolve the body's child references against the source document.
    top_level_items = [child_ref.resolve(doc) for child_ref in doc.body.children]

    self.add_node_items(node_items=top_level_items, doc=doc, parent=parent)
3814
+
3815
def add_node_items(
    self,
    node_items: List[NodeItem],
    doc: "DoclingDocument",
    parent: Optional[NodeItem] = None,
) -> None:
    """Copy NodeItems (and their children) from ``doc`` and append them under a parent.

    :param node_items: list[NodeItem]: The NodeItems to be added
    :param doc: DoclingDocument: The document to which the NodeItems and their children belong
    :param parent: Optional[NodeItem]: The parent NodeItem under which new items are added;
        this document's body is used when None (Default value = None)

    :raises ValueError: If a ListItem would be added under a parent that is not a ListGroup

    :returns: None
    """
    target = self.body if parent is None else parent

    # ListItems are only accepted under a ListGroup.
    if not isinstance(target, ListGroup) and any(
        isinstance(candidate, ListItem) for candidate in node_items
    ):
        raise ValueError("Cannot add ListItem into a non-ListGroup parent.")

    # Copy the items into this document's content ...
    copied_refs = self._append_item_copies(
        node_items=node_items, parent_ref=target.get_ref(), doc=doc
    )

    # ... then register the copies as the last children of the target.
    target.children.extend(copied_refs)
3849
+
3850
def insert_node_items(
    self,
    sibling: NodeItem,
    node_items: List[NodeItem],
    doc: "DoclingDocument",
    after: bool = True,
) -> None:
    """Insert multiple NodeItems and their children at a specific position in the document.

    All checks that do not depend on the copies (ListItem placement, locating
    the sibling) run before anything is copied, so a rejected call leaves this
    document untouched.

    :param sibling: NodeItem: The NodeItem after/before which the new items will be inserted
    :param node_items: list[NodeItem]: The NodeItems to be inserted
    :param doc: DoclingDocument: The document to which the NodeItems and their children belong
    :param after: bool: If True, insert after the sibling; if False, insert before (Default value = True)

    :raises ValueError: If a ListItem would be inserted under a parent that is
        not a ListGroup, if the sibling cannot be located in this document, or
        if inserting one of the copies fails

    :returns: None
    """
    # The new items become children of the sibling's parent (the body when it has none).
    parent = sibling.parent.resolve(self) if sibling.parent else self.body

    # ListItems are only accepted under a ListGroup.
    if not isinstance(parent, ListGroup) and any(
        isinstance(candidate, ListItem) for candidate in node_items
    ):
        raise ValueError("Cannot insert ListItem into a non-ListGroup parent.")

    # Locate the sibling BEFORE copying: if the lookup fails we must not have
    # appended orphaned copies to the document content already.
    sibling_ref = sibling.get_ref()
    success, stack = self._get_stack_of_refitem(ref=sibling_ref)
    if not success:
        raise ValueError(
            f"Could not insert at {sibling_ref.cref}: could not find the stack"
        )

    # Copy the items into this document's content.
    new_refs = self._append_item_copies(
        node_items=node_items, parent_ref=parent.get_ref(), doc=doc
    )

    # Every copy is inserted relative to the same (unchanged) sibling stack, so
    # the copies are walked in reverse to end up in their original order.
    for ref in reversed(new_refs):
        inserted = self.body._add_sibling(
            doc=self, stack=stack, new_ref=ref, after=after
        )
        if not inserted:
            raise ValueError(
                f"Could not insert item {ref.cref} at {sibling_ref.cref}"
            )
3908
+
3909
def _append_item_copies(
    self,
    node_items: List[NodeItem],
    parent_ref: RefItem,
    doc: "DoclingDocument",
) -> List[RefItem]:
    """Recursively copy node items (and their subtrees) from ``doc`` into this document's content.

    The returned references are not attached to any ``children`` list here for
    the top-level items; callers are expected to do that.

    :param node_items: List[NodeItem]: The NodeItems to be appended
    :param parent_ref: RefItem: The reference of the parent of the new items in this document
    :param doc: DoclingDocument: The document from which the NodeItems are taken

    :returns: List[RefItem]: A list of references to the newly added items in this document
    """
    copied_refs: List[RefItem] = []

    for source_item in node_items:
        clone = source_item.model_copy(deep=True)
        self._append_item(item=clone, parent_ref=parent_ref)

        if clone.children:
            # The cloned child refs still address items of the source document,
            # so resolve them there, copy them, and swap in the refs of the copies.
            source_children = [
                child_ref.resolve(doc) for child_ref in clone.children
            ]
            clone.children = self._append_item_copies(
                node_items=source_children,
                parent_ref=clone.get_ref(),
                doc=doc,
            )

        copied_refs.append(clone.get_ref())

    return copied_refs
3943
+
3944
def num_pages(self) -> int:
    """Return the number of entries in ``self.pages``."""
    # len() of the mapping equals len() of its values view; no view needed.
    return len(self.pages)
3947
+
3948
def validate_tree(self, root) -> bool:
    """Check that every node below ``root`` points back to its actual parent.

    :param root: Node whose subtree is validated; only its ``children`` are inspected

    :returns: bool: True if, for every node in the subtree, each child's
        ``parent`` reference resolves to that node (a leaf is trivially valid);
        False as soon as one child has no parent or a different parent
    """
    for child_ref in root.children:
        child = child_ref.resolve(self)
        # A child without a parent reference cannot point back to root:
        # report it as invalid rather than failing on None.
        if child.parent is None or child.parent.resolve(self) != root:
            return False
        # Stop at the first invalid subtree.
        if not self.validate_tree(child):
            return False

    return True
3958
+
3959
def iterate_items(
    self,
    root: Optional[NodeItem] = None,
    with_groups: bool = False,
    traverse_pictures: bool = False,
    page_no: Optional[int] = None,
    included_content_layers: Optional[set[ContentLayer]] = None,
    _level: int = 0,  # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
    """Iterate elements with level.

    Thin wrapper around ``_iterate_items_with_stack``: every yielded node is
    paired with the length of its stack at the moment it is yielded.
    ``_level`` is accepted but not read in this method.
    """
    walker = self._iterate_items_with_stack(
        root=root,
        with_groups=with_groups,
        traverse_pictures=traverse_pictures,
        page_no=page_no,
        included_content_layers=included_content_layers,
    )
    for node, path in walker:
        # The level is the depth of the stack at yield time.
        yield node, len(path)
3977
+
3978
def _iterate_items_with_stack(
    self,
    root: Optional[NodeItem] = None,
    with_groups: bool = False,
    traverse_pictures: bool = False,
    page_no: Optional[int] = None,
    included_content_layers: Optional[set[ContentLayer]] = None,
    _stack: Optional[list[int]] = None,
) -> typing.Iterable[Tuple[NodeItem, list[int]]]:  # tuple of node and stack
    """Iterate elements with stack.

    Depth-first, pre-order walk starting at ``root`` (the body when omitted).
    Each yielded tuple holds the node and the list of child indices traversed
    so far. That list is ONE shared object which is mutated while the walk
    proceeds, so consumers that need to keep it must copy it.

    A node is yielded only when all of the following hold:
    - it is not a GroupItem, or ``with_groups`` is set;
    - it is not a DocItem, or ``page_no`` is None, or one of its provenance
      entries is on ``page_no``;
    - its content layer is among the included layers
      (``DEFAULT_CONTENT_LAYERS`` when none are given).

    Children are traversed whether or not the node itself was yielded. Below a
    PictureItem, only children referenced by its captions are visited unless
    ``traverse_pictures`` is set.
    """
    layers = (
        DEFAULT_CONTENT_LAYERS
        if included_content_layers is None
        else included_content_layers
    )
    path: list[int] = [] if _stack is None else _stack

    if not root:
        root = self.body

    # Decide whether root itself is reported; evaluated in the same order as a
    # single chained ``and`` so later checks only run when earlier ones pass.
    emit = not isinstance(root, GroupItem) or with_groups
    if emit and isinstance(root, DocItem) and page_no is not None:
        emit = any(prov.page_no == page_no for prov in root.prov)
    if emit and root.content_layer in layers:
        yield root, path

    # Reserve a slot for the index of the child currently being visited.
    path.append(-1)

    root_is_picture = isinstance(root, PictureItem)
    caption_refs: set[str] = (
        {caption.cref for caption in root.captions} if root_is_picture else set()
    )

    # Traverse children
    for child_ind, child_ref in enumerate(root.children):
        child = child_ref.resolve(self)
        child_is_node = isinstance(child, NodeItem)

        # Inside a picture, skip everything but its captions unless asked otherwise.
        if (
            root_is_picture
            and not traverse_pictures
            and child_is_node
            and child.self_ref not in caption_refs
        ):
            continue

        path[-1] = child_ind

        if child_is_node:
            yield from self._iterate_items_with_stack(
                child,
                with_groups=with_groups,
                traverse_pictures=traverse_pictures,
                page_no=page_no,
                _stack=path,
                included_content_layers=layers,
            )

    # Drop this level's slot before returning to the caller's level.
    path.pop()
4047
+
4048
def _clear_picture_pil_cache(self):
    """Close the cached PIL image of every picture reached by ``iterate_items``.

    Only ``close()`` is called on the cached image; the ``_pil`` attribute
    itself is left in place.
    """
    for node, _level in self.iterate_items(with_groups=False):
        if not isinstance(node, PictureItem):
            continue
        if node.image is not None and node.image._pil is not None:
            node.image._pil.close()
4054
+
4055
def _list_images_on_disk(self) -> List[Path]:
    """Collect the local paths of all picture images that live on disk.

    A picture contributes a path when its image URI is either a ``file`` URL
    with a path component (percent-decoded) or already a ``Path``.
    """
    local_paths: List[Path] = []

    for node, _level in self.iterate_items(with_groups=False):
        if not isinstance(node, PictureItem) or node.image is None:
            continue

        uri = node.image.uri
        if isinstance(uri, AnyUrl) and uri.scheme == "file" and uri.path is not None:
            # URL paths are percent-encoded; decode before building the Path.
            local_paths.append(Path(unquote(uri.path)))
        elif isinstance(uri, Path):
            local_paths.append(uri)

    return local_paths
4073
+
4074
def _with_embedded_pictures(self) -> "DoclingDocument":
    """Document with embedded images.

    Returns a deep copy of this document in which every picture whose image
    URI is a ``file`` URL or a ``Path`` has its image reference rebuilt from
    the loaded file via ``ImageRef.from_pil`` (keeping the original dpi).
    This document itself is not modified.
    """
    embedded: DoclingDocument = copy.deepcopy(self)

    for node, _level in embedded.iterate_items(with_groups=True):
        if not isinstance(node, PictureItem) or node.image is None:
            continue

        uri = node.image.uri
        if isinstance(uri, AnyUrl) and uri.scheme == "file":
            assert isinstance(uri.path, str)
            # URL paths are percent-encoded; decode before opening.
            source = str(unquote(uri.path))
        elif isinstance(uri, Path):
            source = str(uri)
        else:
            # Any other kind of URI is left untouched.
            continue

        # NOTE(review): the opened image is handed to ImageRef.from_pil and is
        # not closed here — confirm who owns the file handle.
        node.image = ImageRef.from_pil(PILImage.open(source), dpi=node.image.dpi)

    return embedded
4099
+
4100
+ def _with_pictures_refs(
4101
+ self, image_dir: Path, reference_path: Optional[Path] = None
4102
+ ) -> "DoclingDocument":
4103
+ """Document with images as refs.
4104
+
4105
+ Creates a copy of this document where all picture data is
4106
+ saved to image_dir and referenced through file URIs.
4107
+ """
4108
+ result: DoclingDocument = copy.deepcopy(self)
4109
+
4110
+ img_count = 0
4111
+ image_dir.mkdir(parents=True, exist_ok=True)
4112
+
4113
+ if image_dir.is_dir():
4114
+ for item, level in result.iterate_items(with_groups=False):
2964
4115
  if isinstance(item, PictureItem):
2965
4116
 
2966
4117
  if (
@@ -3048,6 +4199,8 @@ class DoclingDocument(BaseModel):
3048
4199
  artifacts_dir: Optional[Path] = None,
3049
4200
  image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
3050
4201
  indent: int = 2,
4202
+ coord_precision: Optional[int] = None,
4203
+ confid_precision: Optional[int] = None,
3051
4204
  ):
3052
4205
  """Save as json."""
3053
4206
  if isinstance(filename, str):
@@ -3061,7 +4214,9 @@ class DoclingDocument(BaseModel):
3061
4214
  artifacts_dir, image_mode, reference_path=reference_path
3062
4215
  )
3063
4216
 
3064
- out = new_doc.export_to_dict()
4217
+ out = new_doc.export_to_dict(
4218
+ coord_precision=coord_precision, confid_precision=confid_precision
4219
+ )
3065
4220
  with open(filename, "w", encoding="utf-8") as fw:
3066
4221
  json.dump(out, fw, indent=indent)
3067
4222
 
@@ -3087,6 +4242,8 @@ class DoclingDocument(BaseModel):
3087
4242
  artifacts_dir: Optional[Path] = None,
3088
4243
  image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
3089
4244
  default_flow_style: bool = False,
4245
+ coord_precision: Optional[int] = None,
4246
+ confid_precision: Optional[int] = None,
3090
4247
  ):
3091
4248
  """Save as yaml."""
3092
4249
  if isinstance(filename, str):
@@ -3100,7 +4257,9 @@ class DoclingDocument(BaseModel):
3100
4257
  artifacts_dir, image_mode, reference_path=reference_path
3101
4258
  )
3102
4259
 
3103
- out = new_doc.export_to_dict()
4260
+ out = new_doc.export_to_dict(
4261
+ coord_precision=coord_precision, confid_precision=confid_precision
4262
+ )
3104
4263
  with open(filename, "w", encoding="utf-8") as fw:
3105
4264
  yaml.dump(out, fw, default_flow_style=default_flow_style)
3106
4265
 
@@ -3125,9 +4284,18 @@ class DoclingDocument(BaseModel):
3125
4284
  mode: str = "json",
3126
4285
  by_alias: bool = True,
3127
4286
  exclude_none: bool = True,
4287
+ coord_precision: Optional[int] = None,
4288
+ confid_precision: Optional[int] = None,
3128
4289
  ) -> Dict[str, Any]:
3129
4290
  """Export to dict."""
3130
- out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none)
4291
+ context = {}
4292
+ if coord_precision is not None:
4293
+ context[PydanticSerCtxKey.COORD_PREC.value] = coord_precision
4294
+ if confid_precision is not None:
4295
+ context[PydanticSerCtxKey.CONFID_PREC.value] = confid_precision
4296
+ out = self.model_dump(
4297
+ mode=mode, by_alias=by_alias, exclude_none=exclude_none, context=context
4298
+ )
3131
4299
 
3132
4300
  return out
3133
4301