docling-core 2.41.0__py3-none-any.whl → 2.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -41,10 +41,10 @@ from docling_core.search.package import VERSION_PATTERN
41
41
  from docling_core.types.base import _JSON_POINTER_REGEX
42
42
  from docling_core.types.doc import BoundingBox, Size
43
43
  from docling_core.types.doc.base import (
44
- _CTX_COORD_PREC,
45
44
  CoordOrigin,
46
45
  ImageRefMode,
47
- _serialize_precision,
46
+ PydanticSerCtxKey,
47
+ round_pydantic_float,
48
48
  )
49
49
  from docling_core.types.doc.labels import (
50
50
  CodeLanguageLabel,
@@ -92,8 +92,6 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
92
92
  ]
93
93
  )
94
94
 
95
- _CTX_CONFID_PREC = "confid_prec"
96
-
97
95
 
98
96
  class BaseAnnotation(BaseModel):
99
97
  """Base class for all annotation types."""
@@ -109,7 +107,7 @@ class PictureClassificationClass(BaseModel):
109
107
 
110
108
  @field_serializer("confidence")
111
109
  def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
112
- return _serialize_precision(value, info, _CTX_CONFID_PREC)
110
+ return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
113
111
 
114
112
 
115
113
  class PictureClassificationData(BaseAnnotation):
@@ -140,7 +138,7 @@ class PictureMoleculeData(BaseAnnotation):
140
138
 
141
139
  @field_serializer("confidence")
142
140
  def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
143
- return _serialize_precision(value, info, _CTX_CONFID_PREC)
141
+ return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
144
142
 
145
143
 
146
144
  class MiscAnnotation(BaseAnnotation):
@@ -383,6 +381,145 @@ class TableData(BaseModel): # TBD
383
381
 
384
382
  return table_data
385
383
 
384
+ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
385
+ """Remove rows from the table by their indices.
386
+
387
+ :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
388
+
389
+ :return: List[List[TableCell]]: A list representation of the removed rows as lists of TableCell objects.
390
+ """
391
+ if not indices:
392
+ return []
393
+
394
+ indices = sorted(indices, reverse=True)
395
+
396
+ all_removed_cells = []
397
+ for row_index in indices:
398
+ if row_index < 0 or row_index >= self.num_rows:
399
+ raise IndexError(
400
+ f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}."
401
+ )
402
+
403
+ start_idx = row_index * self.num_cols
404
+ end_idx = start_idx + self.num_cols
405
+ removed_cells = self.table_cells[start_idx:end_idx]
406
+
407
+ # Remove the cells from the table
408
+ self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
409
+
410
+ # Update the number of rows
411
+ self.num_rows -= 1
412
+
413
+ # Reassign row offset indices for existing cells
414
+ for index, cell in enumerate(self.table_cells):
415
+ new_index = index // self.num_cols
416
+ cell.start_row_offset_idx = new_index
417
+ cell.end_row_offset_idx = new_index + 1
418
+
419
+ all_removed_cells.append(removed_cells)
420
+
421
+ return all_removed_cells
422
+
423
+ def pop_row(self) -> List[TableCell]:
424
+ """Remove and return the last row from the table.
425
+
426
+ :returns: List[TableCell]: A list of TableCell objects representing the popped row.
427
+ """
428
+ if self.num_rows == 0:
429
+ raise IndexError("Cannot pop from an empty table.")
430
+
431
+ return self.remove_row(self.num_rows - 1)
432
+
433
+ def remove_row(self, row_index: int) -> List[TableCell]:
434
+ """Remove a row from the table by its index.
435
+
436
+ :param row_index: int: The index of the row to remove. (Starting from 0)
437
+
438
+ :returns: List[TableCell]: A list of TableCell objects representing the removed row.
439
+ """
440
+ return self.remove_rows([row_index])[0]
441
+
442
+ def insert_rows(
443
+ self, row_index: int, rows: List[List[str]], after: bool = False
444
+ ) -> None:
445
+ """Insert multiple new rows from a list of lists of strings before/after a specific index in the table.
446
+
447
+ :param row_index: int: The index at which to insert the new rows. (Starting from 0)
448
+ :param rows: List[List[str]]: A list of lists, where each inner list represents the content of a new row.
449
+ :param after: bool: If True, insert the rows after the specified index, otherwise before it. (Default is False)
450
+
451
+ :returns: None
452
+ """
453
+ effective_rows = rows[::-1]
454
+
455
+ for row in effective_rows:
456
+ self.insert_row(row_index, row, after)
457
+
458
+ def insert_row(self, row_index: int, row: List[str], after: bool = False) -> None:
459
+ """Insert a new row from a list of strings before/after a specific index in the table.
460
+
461
+ :param row_index: int: The index at which to insert the new row. (Starting from 0)
462
+ :param row: List[str]: A list of strings representing the content of the new row.
463
+ :param after: bool: If True, insert the row after the specified index, otherwise before it. (Default is False)
464
+
465
+ :returns: None
466
+ """
467
+ if len(row) != self.num_cols:
468
+ raise ValueError(
469
+ f"Row length {len(row)} does not match the number of columns {self.num_cols}."
470
+ )
471
+
472
+ effective_index = row_index + (1 if after else 0)
473
+
474
+ if effective_index < 0 or effective_index > self.num_rows:
475
+ raise IndexError(
476
+ f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}."
477
+ )
478
+
479
+ new_row_cells = [
480
+ TableCell(
481
+ text=text,
482
+ start_row_offset_idx=effective_index,
483
+ end_row_offset_idx=effective_index + 1,
484
+ start_col_offset_idx=j,
485
+ end_col_offset_idx=j + 1,
486
+ )
487
+ for j, text in enumerate(row)
488
+ ]
489
+
490
+ self.table_cells = (
491
+ self.table_cells[: effective_index * self.num_cols]
492
+ + new_row_cells
493
+ + self.table_cells[effective_index * self.num_cols :]
494
+ )
495
+
496
+ # Reassign row offset indices for existing cells
497
+ for index, cell in enumerate(self.table_cells):
498
+ new_index = index // self.num_cols
499
+ cell.start_row_offset_idx = new_index
500
+ cell.end_row_offset_idx = new_index + 1
501
+
502
+ self.num_rows += 1
503
+
504
+ def add_rows(self, rows: List[List[str]]) -> None:
505
+ """Add multiple new rows to the table from a list of lists of strings.
506
+
507
+ :param rows: List[List[str]]: A list of lists, where each inner list represents the content of a new row.
508
+
509
+ :returns: None
510
+ """
511
+ for row in rows:
512
+ self.add_row(row)
513
+
514
+ def add_row(self, row: List[str]) -> None:
515
+ """Add a new row to the table from a list of strings.
516
+
517
+ :param row: List[str]: A list of strings representing the content of the new row.
518
+
519
+ :returns: None
520
+ """
521
+ self.insert_row(row_index=self.num_rows - 1, row=row, after=True)
522
+
386
523
  def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
387
524
  """Get the minimal bounding box for each row in the table.
388
525
 
@@ -839,7 +976,7 @@ class NodeItem(BaseModel):
839
976
  after: bool = True,
840
977
  ) -> bool:
841
978
  """Add sibling node in tree."""
842
- if len(stack) == 1 and stack[0] < len(self.children) and (not after):
979
+ if len(stack) == 1 and stack[0] <= len(self.children) and (not after):
843
980
  # ensure the parent is correct
844
981
  new_item = new_ref.resolve(doc=doc)
845
982
  new_item.parent = self.get_ref()
@@ -1975,6 +2112,16 @@ class DoclingDocument(BaseModel):
1975
2112
  item.self_ref = cref
1976
2113
  item.parent = parent_ref
1977
2114
 
2115
+ self.groups.append(item)
2116
+ elif isinstance(item, GroupItem):
2117
+ item_label = "groups"
2118
+ item_index = len(self.groups)
2119
+
2120
+ cref = f"#/{item_label}/{item_index}"
2121
+
2122
+ item.self_ref = cref
2123
+ item.parent = parent_ref
2124
+
1978
2125
  self.groups.append(item)
1979
2126
 
1980
2127
  else:
@@ -1993,7 +2140,7 @@ class DoclingDocument(BaseModel):
1993
2140
  item_index = int(path[2])
1994
2141
 
1995
2142
  if (
1996
- len(self.__getattribute__(item_label)) + 1 == item_index
2143
+ len(self.__getattribute__(item_label)) == item_index + 1
1997
2144
  ): # we can only pop the last item
1998
2145
  del self.__getattribute__(item_label)[item_index]
1999
2146
  else:
@@ -2018,6 +2165,10 @@ class DoclingDocument(BaseModel):
2018
2165
  if not success:
2019
2166
  self._pop_item(item=item)
2020
2167
 
2168
+ raise ValueError(
2169
+ f"Could not insert item: {item} under parent: {parent_ref.resolve(doc=self)}"
2170
+ )
2171
+
2021
2172
  return item.get_ref()
2022
2173
 
2023
2174
  def _delete_items(self, refs: list[RefItem]):
@@ -2397,17 +2548,6 @@ class DoclingDocument(BaseModel):
2397
2548
  hyperlink=hyperlink,
2398
2549
  )
2399
2550
 
2400
- elif label in [DocItemLabel.TITLE]:
2401
- return self.add_title(
2402
- text=text,
2403
- orig=orig,
2404
- prov=prov,
2405
- parent=parent,
2406
- content_layer=content_layer,
2407
- formatting=formatting,
2408
- hyperlink=hyperlink,
2409
- )
2410
-
2411
2551
  elif label in [DocItemLabel.SECTION_HEADER]:
2412
2552
  return self.add_heading(
2413
2553
  text=text,
@@ -2807,164 +2947,1161 @@ class DoclingDocument(BaseModel):
2807
2947
 
2808
2948
  return form_item
2809
2949
 
2810
- def num_pages(self):
2811
- """num_pages."""
2812
- return len(self.pages.values())
2950
+ # ---------------------------
2951
+ # Node Item Insertion Methods
2952
+ # ---------------------------
2813
2953
 
2814
- def validate_tree(self, root) -> bool:
2815
- """validate_tree."""
2816
- res = []
2817
- for child_ref in root.children:
2818
- child = child_ref.resolve(self)
2819
- if child.parent.resolve(self) != root:
2820
- return False
2821
- res.append(self.validate_tree(child))
2954
+ def _get_insertion_stack_and_parent(
2955
+ self, sibling: NodeItem
2956
+ ) -> tuple[list[int], RefItem]:
2957
+ """Get the stack and parent reference for inserting a new item at a sibling."""
2958
+ # Get the stack of the sibling
2959
+ sibling_ref = sibling.get_ref()
2822
2960
 
2823
- return all(res) or len(res) == 0
2961
+ success, stack = self._get_stack_of_refitem(ref=sibling_ref)
2824
2962
 
2825
- def iterate_items(
2826
- self,
2827
- root: Optional[NodeItem] = None,
2828
- with_groups: bool = False,
2829
- traverse_pictures: bool = False,
2830
- page_no: Optional[int] = None,
2831
- included_content_layers: Optional[set[ContentLayer]] = None,
2832
- _level: int = 0, # fixed parameter, carries through the node nesting level
2833
- ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
2834
- """Iterate elements with level."""
2835
- for item, stack in self._iterate_items_with_stack(
2836
- root=root,
2837
- with_groups=with_groups,
2838
- traverse_pictures=traverse_pictures,
2839
- page_no=page_no,
2840
- included_content_layers=included_content_layers,
2841
- ):
2842
- yield item, len(stack)
2963
+ if not success:
2964
+ raise ValueError(
2965
+ f"Could not insert at {sibling_ref.cref}: could not find the stack"
2966
+ )
2843
2967
 
2844
- def _iterate_items_with_stack(
2968
+ # Get the parent RefItem
2969
+ parent_ref = self.body._get_parent_ref(doc=self, stack=stack)
2970
+
2971
+ if parent_ref is None:
2972
+ raise ValueError(f"Could not find a parent at stack: {stack}")
2973
+
2974
+ return stack, parent_ref
2975
+
2976
+ def _insert_in_structure(
2845
2977
  self,
2846
- root: Optional[NodeItem] = None,
2847
- with_groups: bool = False,
2848
- traverse_pictures: bool = False,
2849
- page_no: Optional[int] = None,
2850
- included_content_layers: Optional[set[ContentLayer]] = None,
2851
- _stack: Optional[list[int]] = None,
2852
- ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
2853
- """Iterate elements with stack."""
2854
- my_layers = (
2855
- included_content_layers
2856
- if included_content_layers is not None
2857
- else DEFAULT_CONTENT_LAYERS
2858
- )
2859
- my_stack: list[int] = _stack if _stack is not None else []
2978
+ item: NodeItem,
2979
+ stack: list[int],
2980
+ after: bool,
2981
+ created_parent: Optional[bool] = False,
2982
+ ) -> None:
2983
+ """Insert item into the document structure at the specified stack and handle errors."""
2984
+ # Ensure the item has a parent reference
2985
+ if item.parent is None:
2986
+ item.parent = self.body.get_ref()
2860
2987
 
2861
- if not root:
2862
- root = self.body
2988
+ self._append_item(item=item, parent_ref=item.parent)
2863
2989
 
2864
- # Yield non-group items or group items when with_groups=True
2990
+ new_ref = item.get_ref()
2865
2991
 
2866
- # Combine conditions to have a single yield point
2867
- should_yield = (
2868
- (not isinstance(root, GroupItem) or with_groups)
2869
- and (
2870
- not isinstance(root, DocItem)
2871
- or (
2872
- page_no is None
2873
- or any(prov.page_no == page_no for prov in root.prov)
2874
- )
2875
- )
2876
- and root.content_layer in my_layers
2992
+ success = self.body._add_sibling(
2993
+ doc=self, stack=stack, new_ref=new_ref, after=after
2877
2994
  )
2878
2995
 
2879
- if should_yield:
2880
- yield root, my_stack
2996
+ # If insertion failed, roll back: pop the appended item, delete its parent when created_parent is set, then raise
2997
+ if not success:
2998
+ self._pop_item(item=item)
2881
2999
 
2882
- my_stack.append(-1)
3000
+ if created_parent:
3001
+ self.delete_items(node_items=[item.parent.resolve(self)])
2883
3002
 
2884
- allowed_pic_refs: set[str] = (
2885
- {r.cref for r in root.captions}
2886
- if (root_is_picture := isinstance(root, PictureItem))
2887
- else set()
2888
- )
3003
+ raise ValueError(
3004
+ f"Could not insert item: {item} under parent: {item.parent.resolve(doc=self)}"
3005
+ )
2889
3006
 
2890
- # Traverse children
2891
- for child_ind, child_ref in enumerate(root.children):
2892
- child = child_ref.resolve(self)
2893
- if (
2894
- root_is_picture
2895
- and not traverse_pictures
2896
- and isinstance(child, NodeItem)
2897
- and child.self_ref not in allowed_pic_refs
2898
- ):
2899
- continue
2900
- my_stack[-1] = child_ind
3007
+ def insert_list_group(
3008
+ self,
3009
+ sibling: NodeItem,
3010
+ name: Optional[str] = None,
3011
+ content_layer: Optional[ContentLayer] = None,
3012
+ after: bool = True,
3013
+ ) -> ListGroup:
3014
+ """Creates a new ListGroup item and inserts it into the document.
2901
3015
 
2902
- if isinstance(child, NodeItem):
2903
- yield from self._iterate_items_with_stack(
2904
- child,
2905
- with_groups=with_groups,
2906
- traverse_pictures=traverse_pictures,
2907
- page_no=page_no,
2908
- _stack=my_stack,
2909
- included_content_layers=my_layers,
2910
- )
3016
+ :param sibling: NodeItem:
3017
+ :param name: Optional[str]: (Default value = None)
3018
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3019
+ :param after: bool: (Default value = True)
2911
3020
 
2912
- my_stack.pop()
3021
+ :returns: ListGroup: The newly created ListGroup item.
3022
+ """
3023
+ # Get stack and parent reference of the sibling
3024
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
2913
3025
 
2914
- def _clear_picture_pil_cache(self):
2915
- """Clear cache storage of all images."""
2916
- for item, level in self.iterate_items(with_groups=False):
2917
- if isinstance(item, PictureItem):
2918
- if item.image is not None and item.image._pil is not None:
2919
- item.image._pil.close()
3026
+ group = ListGroup(self_ref="#", parent=parent_ref)
2920
3027
 
2921
- def _list_images_on_disk(self) -> List[Path]:
2922
- """List all images on disk."""
2923
- result: List[Path] = []
3028
+ if name is not None:
3029
+ group.name = name
3030
+ if content_layer:
3031
+ group.content_layer = content_layer
2924
3032
 
2925
- for item, level in self.iterate_items(with_groups=False):
2926
- if isinstance(item, PictureItem):
2927
- if item.image is not None:
2928
- if (
2929
- isinstance(item.image.uri, AnyUrl)
2930
- and item.image.uri.scheme == "file"
2931
- and item.image.uri.path is not None
2932
- ):
2933
- local_path = Path(unquote(item.image.uri.path))
2934
- result.append(local_path)
2935
- elif isinstance(item.image.uri, Path):
2936
- result.append(item.image.uri)
3033
+ self._insert_in_structure(item=group, stack=stack, after=after)
2937
3034
 
2938
- return result
3035
+ return group
2939
3036
 
2940
- def _with_embedded_pictures(self) -> "DoclingDocument":
2941
- """Document with embedded images.
3037
+ def insert_inline_group(
3038
+ self,
3039
+ sibling: NodeItem,
3040
+ name: Optional[str] = None,
3041
+ content_layer: Optional[ContentLayer] = None,
3042
+ after: bool = True,
3043
+ ) -> InlineGroup:
3044
+ """Creates a new InlineGroup item and inserts it into the document.
2942
3045
 
2943
- Creates a copy of this document where all pictures referenced
2944
- through a file URI are turned into base64 embedded form.
3046
+ :param sibling: NodeItem:
3047
+ :param name: Optional[str]: (Default value = None)
3048
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3049
+ :param after: bool: (Default value = True)
3050
+
3051
+ :returns: InlineGroup: The newly created InlineGroup item.
2945
3052
  """
2946
- result: DoclingDocument = copy.deepcopy(self)
3053
+ # Get stack and parent reference of the sibling
3054
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
2947
3055
 
2948
- for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)):
2949
- if isinstance(item, PictureItem):
3056
+ # Create a new InlineGroup NodeItem
3057
+ group = InlineGroup(self_ref="#", parent=parent_ref)
2950
3058
 
2951
- if item.image is not None:
2952
- if (
2953
- isinstance(item.image.uri, AnyUrl)
2954
- and item.image.uri.scheme == "file"
2955
- ):
2956
- assert isinstance(item.image.uri.path, str)
2957
- tmp_image = PILImage.open(str(unquote(item.image.uri.path)))
2958
- item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
3059
+ if name is not None:
3060
+ group.name = name
3061
+ if content_layer:
3062
+ group.content_layer = content_layer
2959
3063
 
2960
- elif isinstance(item.image.uri, Path):
2961
- tmp_image = PILImage.open(str(item.image.uri))
2962
- item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
3064
+ self._insert_in_structure(item=group, stack=stack, after=after)
2963
3065
 
2964
- return result
3066
+ return group
2965
3067
 
2966
- def _with_pictures_refs(
2967
- self, image_dir: Path, reference_path: Optional[Path] = None
3068
+ def insert_group(
3069
+ self,
3070
+ sibling: NodeItem,
3071
+ label: Optional[GroupLabel] = None,
3072
+ name: Optional[str] = None,
3073
+ content_layer: Optional[ContentLayer] = None,
3074
+ after: bool = True,
3075
+ ) -> GroupItem:
3076
+ """Creates a new GroupItem item and inserts it into the document.
3077
+
3078
+ :param sibling: NodeItem:
3079
+ :param label: Optional[GroupLabel]: (Default value = None)
3080
+ :param name: Optional[str]: (Default value = None)
3081
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3082
+ :param after: bool: (Default value = True)
3083
+
3084
+ :returns: GroupItem: The newly created GroupItem.
3085
+ """
3086
+ if label in [GroupLabel.LIST, GroupLabel.ORDERED_LIST]:
3087
+ return self.insert_list_group(
3088
+ sibling=sibling,
3089
+ name=name,
3090
+ content_layer=content_layer,
3091
+ after=after,
3092
+ )
3093
+ elif label == GroupLabel.INLINE:
3094
+ return self.insert_inline_group(
3095
+ sibling=sibling,
3096
+ name=name,
3097
+ content_layer=content_layer,
3098
+ after=after,
3099
+ )
3100
+
3101
+ # Get stack and parent reference of the sibling
3102
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3103
+
3104
+ # Create a new GroupItem NodeItem
3105
+ group = GroupItem(self_ref="#", parent=parent_ref)
3106
+
3107
+ if name is not None:
3108
+ group.name = name
3109
+ if label is not None:
3110
+ group.label = label
3111
+ if content_layer:
3112
+ group.content_layer = content_layer
3113
+
3114
+ self._insert_in_structure(item=group, stack=stack, after=after)
3115
+
3116
+ return group
3117
+
3118
+ def insert_list_item(
3119
+ self,
3120
+ sibling: NodeItem,
3121
+ text: str,
3122
+ enumerated: bool = False,
3123
+ marker: Optional[str] = None,
3124
+ orig: Optional[str] = None,
3125
+ prov: Optional[ProvenanceItem] = None,
3126
+ content_layer: Optional[ContentLayer] = None,
3127
+ formatting: Optional[Formatting] = None,
3128
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3129
+ after: bool = True,
3130
+ ) -> ListItem:
3131
+ """Creates a new ListItem item and inserts it into the document.
3132
+
3133
+ :param sibling: NodeItem:
3134
+ :param text: str:
3135
+ :param enumerated: bool: (Default value = False)
3136
+ :param marker: Optional[str]: (Default value = None)
3137
+ :param orig: Optional[str]: (Default value = None)
3138
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3139
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3140
+ :param formatting: Optional[Formatting]: (Default value = None)
3141
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3142
+ :param after: bool: (Default value = True)
3143
+
3144
+ :returns: ListItem: The newly created ListItem item.
3145
+ """
3146
+ # Get stack and parent reference of the sibling
3147
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3148
+
3149
+ # Ensure the parent is a ListGroup
3150
+
3151
+ parent = parent_ref.resolve(self)
3152
+ set_parent = False
3153
+
3154
+ if not isinstance(parent, ListGroup):
3155
+ warnings.warn(
3156
+ "ListItem parent must be a ListGroup, creating one on the fly.",
3157
+ DeprecationWarning,
3158
+ )
3159
+ parent = self.insert_list_group(sibling=sibling, after=after)
3160
+ parent_ref = parent.get_ref()
3161
+ if after:
3162
+ stack[-1] += 1
3163
+ stack.append(0)
3164
+ after = False
3165
+ set_parent = True
3166
+
3167
+ # Create a new ListItem NodeItem
3168
+ if not orig:
3169
+ orig = text
3170
+
3171
+ list_item = ListItem(
3172
+ text=text,
3173
+ orig=orig,
3174
+ self_ref="#",
3175
+ parent=parent_ref,
3176
+ enumerated=enumerated,
3177
+ marker=marker or "",
3178
+ formatting=formatting,
3179
+ hyperlink=hyperlink,
3180
+ )
3181
+
3182
+ if prov:
3183
+ list_item.prov.append(prov)
3184
+ if content_layer:
3185
+ list_item.content_layer = content_layer
3186
+
3187
+ self._insert_in_structure(
3188
+ item=list_item, stack=stack, after=after, created_parent=set_parent
3189
+ )
3190
+
3191
+ return list_item
3192
+
3193
+ def insert_text(
3194
+ self,
3195
+ sibling: NodeItem,
3196
+ label: DocItemLabel,
3197
+ text: str,
3198
+ orig: Optional[str] = None,
3199
+ prov: Optional[ProvenanceItem] = None,
3200
+ content_layer: Optional[ContentLayer] = None,
3201
+ formatting: Optional[Formatting] = None,
3202
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3203
+ after: bool = True,
3204
+ ) -> TextItem:
3205
+ """Creates a new TextItem item and inserts it into the document.
3206
+
3207
+ :param sibling: NodeItem:
3208
+ :param label: DocItemLabel:
3209
+ :param text: str:
3210
+ :param orig: Optional[str]: (Default value = None)
3211
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3212
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3213
+ :param formatting: Optional[Formatting]: (Default value = None)
3214
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3215
+ :param after: bool: (Default value = True)
3216
+
3217
+ :returns: TextItem: The newly created TextItem item.
3218
+ """
3219
+ if label in [DocItemLabel.TITLE]:
3220
+ return self.insert_title(
3221
+ sibling=sibling,
3222
+ text=text,
3223
+ orig=orig,
3224
+ prov=prov,
3225
+ content_layer=content_layer,
3226
+ formatting=formatting,
3227
+ hyperlink=hyperlink,
3228
+ after=after,
3229
+ )
3230
+
3231
+ elif label in [DocItemLabel.LIST_ITEM]:
3232
+ return self.insert_list_item(
3233
+ sibling=sibling,
3234
+ text=text,
3235
+ orig=orig,
3236
+ prov=prov,
3237
+ content_layer=content_layer,
3238
+ formatting=formatting,
3239
+ hyperlink=hyperlink,
3240
+ after=after,
3241
+ )
3242
+
3243
+ elif label in [DocItemLabel.SECTION_HEADER]:
3244
+ return self.insert_heading(
3245
+ sibling=sibling,
3246
+ text=text,
3247
+ orig=orig,
3248
+ prov=prov,
3249
+ content_layer=content_layer,
3250
+ formatting=formatting,
3251
+ hyperlink=hyperlink,
3252
+ after=after,
3253
+ )
3254
+
3255
+ elif label in [DocItemLabel.CODE]:
3256
+ return self.insert_code(
3257
+ sibling=sibling,
3258
+ text=text,
3259
+ orig=orig,
3260
+ prov=prov,
3261
+ content_layer=content_layer,
3262
+ formatting=formatting,
3263
+ hyperlink=hyperlink,
3264
+ after=after,
3265
+ )
3266
+
3267
+ elif label in [DocItemLabel.FORMULA]:
3268
+ return self.insert_formula(
3269
+ sibling=sibling,
3270
+ text=text,
3271
+ orig=orig,
3272
+ prov=prov,
3273
+ content_layer=content_layer,
3274
+ formatting=formatting,
3275
+ hyperlink=hyperlink,
3276
+ after=after,
3277
+ )
3278
+
3279
+ else:
3280
+ # Get stack and parent reference of the sibling
3281
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3282
+
3283
+ # Create a new TextItem NodeItem
3284
+ if not orig:
3285
+ orig = text
3286
+
3287
+ text_item = TextItem(
3288
+ label=label,
3289
+ text=text,
3290
+ orig=orig,
3291
+ self_ref="#",
3292
+ parent=parent_ref,
3293
+ formatting=formatting,
3294
+ hyperlink=hyperlink,
3295
+ )
3296
+
3297
+ if prov:
3298
+ text_item.prov.append(prov)
3299
+ if content_layer:
3300
+ text_item.content_layer = content_layer
3301
+
3302
+ self._insert_in_structure(item=text_item, stack=stack, after=after)
3303
+
3304
+ return text_item
3305
+
3306
+ def insert_table(
3307
+ self,
3308
+ sibling: NodeItem,
3309
+ data: TableData,
3310
+ caption: Optional[Union[TextItem, RefItem]] = None,
3311
+ prov: Optional[ProvenanceItem] = None,
3312
+ label: DocItemLabel = DocItemLabel.TABLE,
3313
+ content_layer: Optional[ContentLayer] = None,
3314
+ annotations: Optional[list[TableAnnotationType]] = None,
3315
+ after: bool = True,
3316
+ ) -> TableItem:
3317
+ """Creates a new TableItem item and inserts it into the document.
3318
+
3319
+ :param sibling: NodeItem:
3320
+ :param data: TableData:
3321
+ :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
3322
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3323
+ :param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
3324
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3325
+ :param annotations: Optional[List[TableAnnotationType]]: (Default value = None)
3326
+ :param after: bool: (Default value = True)
3327
+
3328
+ :returns: TableItem: The newly created TableItem item.
3329
+ """
3330
+ # Get stack and parent reference of the sibling
3331
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3332
+
3333
+ # Create a new TableItem NodeItem
3334
+ table_item = TableItem(
3335
+ label=label,
3336
+ data=data,
3337
+ self_ref="#",
3338
+ parent=parent_ref,
3339
+ annotations=annotations or [],
3340
+ )
3341
+
3342
+ if prov:
3343
+ table_item.prov.append(prov)
3344
+ if content_layer:
3345
+ table_item.content_layer = content_layer
3346
+ if caption:
3347
+ table_item.captions.append(caption.get_ref())
3348
+
3349
+ self._insert_in_structure(item=table_item, stack=stack, after=after)
3350
+
3351
+ return table_item
3352
+
3353
+ def insert_picture(
3354
+ self,
3355
+ sibling: NodeItem,
3356
+ annotations: Optional[List[PictureDataType]] = None,
3357
+ image: Optional[ImageRef] = None,
3358
+ caption: Optional[Union[TextItem, RefItem]] = None,
3359
+ prov: Optional[ProvenanceItem] = None,
3360
+ content_layer: Optional[ContentLayer] = None,
3361
+ after: bool = True,
3362
+ ) -> PictureItem:
3363
+ """Creates a new PictureItem item and inserts it into the document.
3364
+
3365
+ :param sibling: NodeItem:
3366
+ :param annotations: Optional[List[PictureDataType]]: (Default value = None)
3367
+ :param image: Optional[ImageRef]: (Default value = None)
3368
+ :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
3369
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3370
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3371
+ :param after: bool: (Default value = True)
3372
+
3373
+ :returns: PictureItem: The newly created PictureItem item.
3374
+ """
3375
+ # Get stack and parent reference of the sibling
3376
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3377
+
3378
+ # Create a new PictureItem NodeItem
3379
+ picture_item = PictureItem(
3380
+ label=DocItemLabel.PICTURE,
3381
+ annotations=annotations or [],
3382
+ image=image,
3383
+ self_ref="#",
3384
+ parent=parent_ref,
3385
+ )
3386
+
3387
+ if prov:
3388
+ picture_item.prov.append(prov)
3389
+ if content_layer:
3390
+ picture_item.content_layer = content_layer
3391
+ if caption:
3392
+ picture_item.captions.append(caption.get_ref())
3393
+
3394
+ self._insert_in_structure(item=picture_item, stack=stack, after=after)
3395
+
3396
+ return picture_item
3397
+
3398
+ def insert_title(
3399
+ self,
3400
+ sibling: NodeItem,
3401
+ text: str,
3402
+ orig: Optional[str] = None,
3403
+ prov: Optional[ProvenanceItem] = None,
3404
+ content_layer: Optional[ContentLayer] = None,
3405
+ formatting: Optional[Formatting] = None,
3406
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3407
+ after: bool = True,
3408
+ ) -> TitleItem:
3409
+ """Creates a new TitleItem item and inserts it into the document.
3410
+
3411
+ :param sibling: NodeItem:
3412
+ :param text: str:
3413
+ :param orig: Optional[str]: (Default value = None)
3414
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3415
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3416
+ :param formatting: Optional[Formatting]: (Default value = None)
3417
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3418
+ :param after: bool: (Default value = True)
3419
+
3420
+ :returns: TitleItem: The newly created TitleItem item.
3421
+ """
3422
+ # Get stack and parent reference of the sibling
3423
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3424
+
3425
+ # Create a new TitleItem NodeItem
3426
+ if not orig:
3427
+ orig = text
3428
+
3429
+ title_item = TitleItem(
3430
+ text=text,
3431
+ orig=orig,
3432
+ self_ref="#",
3433
+ parent=parent_ref,
3434
+ formatting=formatting,
3435
+ hyperlink=hyperlink,
3436
+ )
3437
+
3438
+ if prov:
3439
+ title_item.prov.append(prov)
3440
+ if content_layer:
3441
+ title_item.content_layer = content_layer
3442
+
3443
+ self._insert_in_structure(item=title_item, stack=stack, after=after)
3444
+
3445
+ return title_item
3446
+
3447
+ def insert_code(
3448
+ self,
3449
+ sibling: NodeItem,
3450
+ text: str,
3451
+ code_language: Optional[CodeLanguageLabel] = None,
3452
+ orig: Optional[str] = None,
3453
+ caption: Optional[Union[TextItem, RefItem]] = None,
3454
+ prov: Optional[ProvenanceItem] = None,
3455
+ content_layer: Optional[ContentLayer] = None,
3456
+ formatting: Optional[Formatting] = None,
3457
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3458
+ after: bool = True,
3459
+ ) -> CodeItem:
3460
+ """Creates a new CodeItem item and inserts it into the document.
3461
+
3462
+ :param sibling: NodeItem:
3463
+ :param text: str:
3464
+ :param code_language: Optional[CodeLanguageLabel]: (Default value = None)
3465
+ :param orig: Optional[str]: (Default value = None)
3466
+ :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
3467
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3468
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3469
+ :param formatting: Optional[Formatting]: (Default value = None)
3470
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3471
+ :param after: bool: (Default value = True)
3472
+
3473
+ :returns: CodeItem: The newly created CodeItem item.
3474
+ """
3475
+ # Get stack and parent reference of the sibling
3476
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3477
+
3478
+ # Create a new CodeItem NodeItem
3479
+ if not orig:
3480
+ orig = text
3481
+
3482
+ code_item = CodeItem(
3483
+ text=text,
3484
+ orig=orig,
3485
+ self_ref="#",
3486
+ parent=parent_ref,
3487
+ formatting=formatting,
3488
+ hyperlink=hyperlink,
3489
+ )
3490
+
3491
+ if code_language:
3492
+ code_item.code_language = code_language
3493
+ if content_layer:
3494
+ code_item.content_layer = content_layer
3495
+ if prov:
3496
+ code_item.prov.append(prov)
3497
+ if caption:
3498
+ code_item.captions.append(caption.get_ref())
3499
+
3500
+ self._insert_in_structure(item=code_item, stack=stack, after=after)
3501
+
3502
+ return code_item
3503
+
3504
+ def insert_formula(
3505
+ self,
3506
+ sibling: NodeItem,
3507
+ text: str,
3508
+ orig: Optional[str] = None,
3509
+ prov: Optional[ProvenanceItem] = None,
3510
+ content_layer: Optional[ContentLayer] = None,
3511
+ formatting: Optional[Formatting] = None,
3512
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3513
+ after: bool = True,
3514
+ ) -> FormulaItem:
3515
+ """Creates a new FormulaItem item and inserts it into the document.
3516
+
3517
+ :param sibling: NodeItem:
3518
+ :param text: str:
3519
+ :param orig: Optional[str]: (Default value = None)
3520
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3521
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3522
+ :param formatting: Optional[Formatting]: (Default value = None)
3523
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3524
+ :param after: bool: (Default value = True)
3525
+
3526
+ :returns: FormulaItem: The newly created FormulaItem item.
3527
+ """
3528
+ # Get stack and parent reference of the sibling
3529
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3530
+
3531
+ # Create a new FormulaItem NodeItem
3532
+ if not orig:
3533
+ orig = text
3534
+
3535
+ formula_item = FormulaItem(
3536
+ text=text,
3537
+ orig=orig,
3538
+ self_ref="#",
3539
+ parent=parent_ref,
3540
+ formatting=formatting,
3541
+ hyperlink=hyperlink,
3542
+ )
3543
+
3544
+ if prov:
3545
+ formula_item.prov.append(prov)
3546
+ if content_layer:
3547
+ formula_item.content_layer = content_layer
3548
+
3549
+ self._insert_in_structure(item=formula_item, stack=stack, after=after)
3550
+
3551
+ return formula_item
3552
+
3553
+ def insert_heading(
3554
+ self,
3555
+ sibling: NodeItem,
3556
+ text: str,
3557
+ orig: Optional[str] = None,
3558
+ level: LevelNumber = 1,
3559
+ prov: Optional[ProvenanceItem] = None,
3560
+ content_layer: Optional[ContentLayer] = None,
3561
+ formatting: Optional[Formatting] = None,
3562
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
3563
+ after: bool = True,
3564
+ ) -> SectionHeaderItem:
3565
+ """Creates a new SectionHeaderItem item and inserts it into the document.
3566
+
3567
+ :param sibling: NodeItem:
3568
+ :param text: str:
3569
+ :param orig: Optional[str]: (Default value = None)
3570
+ :param level: LevelNumber: (Default value = 1)
3571
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3572
+ :param content_layer: Optional[ContentLayer]: (Default value = None)
3573
+ :param formatting: Optional[Formatting]: (Default value = None)
3574
+ :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
3575
+ :param after: bool: (Default value = True)
3576
+
3577
+ :returns: SectionHeaderItem: The newly created SectionHeaderItem item.
3578
+ """
3579
+ # Get stack and parent reference of the sibling
3580
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3581
+
3582
+ # Create a new SectionHeaderItem NodeItem
3583
+ if not orig:
3584
+ orig = text
3585
+
3586
+ section_header_item = SectionHeaderItem(
3587
+ level=level,
3588
+ text=text,
3589
+ orig=orig,
3590
+ self_ref="#",
3591
+ parent=parent_ref,
3592
+ formatting=formatting,
3593
+ hyperlink=hyperlink,
3594
+ )
3595
+
3596
+ if prov:
3597
+ section_header_item.prov.append(prov)
3598
+ if content_layer:
3599
+ section_header_item.content_layer = content_layer
3600
+
3601
+ self._insert_in_structure(item=section_header_item, stack=stack, after=after)
3602
+
3603
+ return section_header_item
3604
+
3605
+ def insert_key_values(
3606
+ self,
3607
+ sibling: NodeItem,
3608
+ graph: GraphData,
3609
+ prov: Optional[ProvenanceItem] = None,
3610
+ after: bool = True,
3611
+ ) -> KeyValueItem:
3612
+ """Creates a new KeyValueItem item and inserts it into the document.
3613
+
3614
+ :param sibling: NodeItem:
3615
+ :param graph: GraphData:
3616
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3617
+ :param after: bool: (Default value = True)
3618
+
3619
+ :returns: KeyValueItem: The newly created KeyValueItem item.
3620
+ """
3621
+ # Get stack and parent reference of the sibling
3622
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3623
+
3624
+ # Create a new KeyValueItem NodeItem
3625
+ key_value_item = KeyValueItem(graph=graph, self_ref="#", parent=parent_ref)
3626
+
3627
+ if prov:
3628
+ key_value_item.prov.append(prov)
3629
+
3630
+ self._insert_in_structure(item=key_value_item, stack=stack, after=after)
3631
+
3632
+ return key_value_item
3633
+
3634
+ def insert_form(
3635
+ self,
3636
+ sibling: NodeItem,
3637
+ graph: GraphData,
3638
+ prov: Optional[ProvenanceItem] = None,
3639
+ after: bool = True,
3640
+ ) -> FormItem:
3641
+ """Creates a new FormItem item and inserts it into the document.
3642
+
3643
+ :param sibling: NodeItem:
3644
+ :param graph: GraphData:
3645
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
3646
+ :param after: bool: (Default value = True)
3647
+
3648
+ :returns: FormItem: The newly created FormItem item.
3649
+ """
3650
+ # Get stack and parent reference of the sibling
3651
+ stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling)
3652
+
3653
+ # Create a new FormItem NodeItem
3654
+ form_item = FormItem(graph=graph, self_ref="#", parent=parent_ref)
3655
+
3656
+ if prov:
3657
+ form_item.prov.append(prov)
3658
+
3659
+ self._insert_in_structure(item=form_item, stack=stack, after=after)
3660
+
3661
+ return form_item
3662
+
3663
+ # ---------------------------
3664
+ # Range Manipulation Methods
3665
+ # ---------------------------
3666
+
3667
+ def delete_items_range(
3668
+ self,
3669
+ *,
3670
+ start: NodeItem,
3671
+ end: NodeItem,
3672
+ start_inclusive: bool = True,
3673
+ end_inclusive: bool = True,
3674
+ ) -> None:
3675
+ """Deletes all NodeItems and their children in the range from the start NodeItem to the end NodeItem.
3676
+
3677
+ :param start: NodeItem: The starting NodeItem of the range
3678
+ :param end: NodeItem: The ending NodeItem of the range
3679
+ :param start_inclusive: bool: (Default value = True): If True, the start NodeItem will also be deleted
3680
+ :param end_inclusive: bool: (Default value = True): If True, the end NodeItem will also be deleted
3681
+
3682
+ :returns: None
3683
+ """
3684
+ start_parent_ref = (
3685
+ start.parent if start.parent is not None else self.body.get_ref()
3686
+ )
3687
+ end_parent_ref = end.parent if end.parent is not None else self.body.get_ref()
3688
+
3689
+ if start.parent != end.parent:
3690
+ raise ValueError(
3691
+ "Start and end NodeItems must have the same parent to delete a range."
3692
+ )
3693
+
3694
+ start_ref = start.get_ref()
3695
+ end_ref = end.get_ref()
3696
+
3697
+ start_parent = start_parent_ref.resolve(doc=self)
3698
+ end_parent = end_parent_ref.resolve(doc=self)
3699
+
3700
+ start_index = start_parent.children.index(start_ref)
3701
+ end_index = end_parent.children.index(end_ref)
3702
+
3703
+ if start_index > end_index:
3704
+ raise ValueError(
3705
+ "Start NodeItem must come before or be the same as the end NodeItem in the document structure."
3706
+ )
3707
+
3708
+ to_delete = start_parent.children[start_index : end_index + 1]
3709
+
3710
+ if not start_inclusive:
3711
+ to_delete = to_delete[1:]
3712
+ if not end_inclusive:
3713
+ to_delete = to_delete[:-1]
3714
+
3715
+ self._delete_items(refs=to_delete)
3716
+
3717
+ def extract_items_range(
3718
+ self,
3719
+ *,
3720
+ start: NodeItem,
3721
+ end: NodeItem,
3722
+ start_inclusive: bool = True,
3723
+ end_inclusive: bool = True,
3724
+ delete: bool = False,
3725
+ ) -> "DoclingDocument":
3726
+ """Extracts NodeItems and children in the range from the start NodeItem to the end as a new DoclingDocument.
3727
+
3728
+ :param start: NodeItem: The starting NodeItem of the range (must be a direct child of the document body)
3729
+ :param end: NodeItem: The ending NodeItem of the range (must be a direct child of the document body)
3730
+ :param start_inclusive: bool: (Default value = True): If True, the start NodeItem will also be extracted
3731
+ :param end_inclusive: bool: (Default value = True): If True, the end NodeItem will also be extracted
3732
+ :param delete: bool: (Default value = False): If True, extracted items are deleted in the original document
3733
+
3734
+ :returns: DoclingDocument: A new document containing the extracted NodeItems and their children
3735
+ """
3736
+ if not start.parent == end.parent:
3737
+ raise ValueError(
3738
+ "Start and end NodeItems must have the same parent to extract a range."
3739
+ )
3740
+
3741
+ start_ref = start.get_ref()
3742
+ end_ref = end.get_ref()
3743
+
3744
+ start_parent_ref = (
3745
+ start.parent if start.parent is not None else self.body.get_ref()
3746
+ )
3747
+ end_parent_ref = end.parent if end.parent is not None else self.body.get_ref()
3748
+
3749
+ start_parent = start_parent_ref.resolve(doc=self)
3750
+ end_parent = end_parent_ref.resolve(doc=self)
3751
+
3752
+ start_index = start_parent.children.index(start_ref) + (
3753
+ 0 if start_inclusive else 1
3754
+ )
3755
+ end_index = end_parent.children.index(end_ref) + (1 if end_inclusive else 0)
3756
+
3757
+ if start_index > end_index:
3758
+ raise ValueError(
3759
+ "Start NodeItem must come before or be the same as the end NodeItem in the document structure."
3760
+ )
3761
+
3762
+ new_doc = DoclingDocument(name=f"{self.name}- Extracted Range")
3763
+
3764
+ ref_items = start_parent.children[start_index:end_index]
3765
+ node_items = [ref.resolve(self) for ref in ref_items]
3766
+
3767
+ new_doc.add_node_items(node_items=node_items, doc=self)
3768
+
3769
+ if delete:
3770
+ self.delete_items_range(
3771
+ start=start,
3772
+ end=end,
3773
+ start_inclusive=start_inclusive,
3774
+ end_inclusive=end_inclusive,
3775
+ )
3776
+
3777
+ return new_doc
3778
+
3779
+ def insert_document(
3780
+ self,
3781
+ doc: "DoclingDocument",
3782
+ sibling: NodeItem,
3783
+ after: bool = True,
3784
+ ) -> None:
3785
+ """Inserts the content from the body of a DoclingDocument into this document at a specific position.
3786
+
3787
+ :param doc: DoclingDocument: The document whose content will be inserted
3788
+ :param sibling: NodeItem: The NodeItem after/before which the new items will be inserted
3789
+ :param after: bool: If True, insert after the sibling; if False, insert before (Default value = True)
3790
+
3791
+ :returns: None
3792
+ """
3793
+ ref_items = doc.body.children
3794
+ node_items = [ref.resolve(doc) for ref in ref_items]
3795
+ self.insert_node_items(
3796
+ sibling=sibling, node_items=node_items, doc=doc, after=after
3797
+ )
3798
+
3799
+ def add_document(
3800
+ self,
3801
+ doc: "DoclingDocument",
3802
+ parent: Optional[NodeItem] = None,
3803
+ ) -> None:
3804
+ """Adds the content from the body of a DoclingDocument to this document under a specific parent.
3805
+
3806
+ :param doc: DoclingDocument: The document whose content will be added
3807
+ :param parent: Optional[NodeItem]: The parent NodeItem under which new items are added (Default value = None)
3808
+
3809
+ :returns: None
3810
+ """
3811
+ ref_items = doc.body.children
3812
+ node_items = [ref.resolve(doc) for ref in ref_items]
3813
+ self.add_node_items(node_items=node_items, doc=doc, parent=parent)
3814
+
3815
+ def add_node_items(
3816
+ self,
3817
+ node_items: List[NodeItem],
3818
+ doc: "DoclingDocument",
3819
+ parent: Optional[NodeItem] = None,
3820
+ ) -> None:
3821
+ """Adds multiple NodeItems and their children under a parent in this document.
3822
+
3823
+ :param node_items: list[NodeItem]: The NodeItems to be added
3824
+ :param doc: DoclingDocument: The document to which the NodeItems and their children belong
3825
+ :param parent: Optional[NodeItem]: The parent NodeItem under which new items are added (Default value = None)
3826
+
3827
+ :returns: None
3828
+ """
3829
+ parent = self.body if parent is None else parent
3830
+
3831
+ # Check for ListItem parent violations
3832
+ if not isinstance(parent, ListGroup):
3833
+ for item in node_items:
3834
+ if isinstance(item, ListItem):
3835
+ raise ValueError("Cannot add ListItem into a non-ListGroup parent.")
3836
+
3837
+ # Append the NodeItems to the document content
3838
+
3839
+ parent_ref = parent.get_ref()
3840
+
3841
+ new_refs = self._append_item_copies(
3842
+ node_items=node_items, parent_ref=parent_ref, doc=doc
3843
+ )
3844
+
3845
+ # Add the new item refs in the document structure
3846
+
3847
+ for ref in new_refs:
3848
+ parent.children.append(ref)
3849
+
3850
+ def insert_node_items(
3851
+ self,
3852
+ sibling: NodeItem,
3853
+ node_items: List[NodeItem],
3854
+ doc: "DoclingDocument",
3855
+ after: bool = True,
3856
+ ) -> None:
3857
+ """Insert multiple NodeItems and their children at a specific position in the document.
3858
+
3859
+ :param sibling: NodeItem: The NodeItem after/before which the new items will be inserted
3860
+ :param node_items: list[NodeItem]: The NodeItems to be inserted
3861
+ :param doc: DoclingDocument: The document to which the NodeItems and their children belong
3862
+ :param after: bool: If True, insert after the sibling; if False, insert before (Default value = True)
3863
+
3864
+ :returns: None
3865
+ """
3866
+ # Check for ListItem parent violations
3867
+ parent = sibling.parent.resolve(self) if sibling.parent else self.body
3868
+
3869
+ if not isinstance(parent, ListGroup):
3870
+ for item in node_items:
3871
+ if isinstance(item, ListItem):
3872
+ raise ValueError(
3873
+ "Cannot insert ListItem into a non-ListGroup parent."
3874
+ )
3875
+
3876
+ # Append the NodeItems to the document content
3877
+
3878
+ parent_ref = parent.get_ref()
3879
+
3880
+ new_refs = self._append_item_copies(
3881
+ node_items=node_items, parent_ref=parent_ref, doc=doc
3882
+ )
3883
+
3884
+ # Get the stack of the sibling
3885
+
3886
+ sibling_ref = sibling.get_ref()
3887
+
3888
+ success, stack = self._get_stack_of_refitem(ref=sibling_ref)
3889
+
3890
+ if not success:
3891
+ raise ValueError(
3892
+ f"Could not insert at {sibling_ref.cref}: could not find the stack"
3893
+ )
3894
+
3895
+ # Insert the new item refs in the document structure
3896
+
3897
+ reversed_new_refs = new_refs[::-1]
3898
+
3899
+ for ref in reversed_new_refs:
3900
+ success = self.body._add_sibling(
3901
+ doc=self, stack=stack, new_ref=ref, after=after
3902
+ )
3903
+
3904
+ if not success:
3905
+ raise ValueError(
3906
+ f"Could not insert item {ref.cref} at {sibling.get_ref().cref}"
3907
+ )
3908
+
3909
+ def _append_item_copies(
3910
+ self,
3911
+ node_items: List[NodeItem],
3912
+ parent_ref: RefItem,
3913
+ doc: "DoclingDocument",
3914
+ ) -> List[RefItem]:
3915
+ """Append node item copies (with their children) from a different document to the content of this document.
3916
+
3917
+ :param node_items: List[NodeItem]: The NodeItems to be appended
3918
+ :param parent_ref: RefItem: The reference of the parent of the new items in this document
3919
+ :param doc: DoclingDocument: The document from which the NodeItems are taken
3920
+
3921
+ :returns: List[RefItem]: A list of references to the newly added items in this document
3922
+ """
3923
+ new_refs: List[RefItem] = []
3924
+
3925
+ for item in node_items:
3926
+ item_copy = item.model_copy(deep=True)
3927
+
3928
+ self._append_item(item=item_copy, parent_ref=parent_ref)
3929
+
3930
+ if item_copy.children:
3931
+ children_node_items = [ref.resolve(doc) for ref in item_copy.children]
3932
+
3933
+ item_copy.children = self._append_item_copies(
3934
+ node_items=children_node_items,
3935
+ parent_ref=item_copy.get_ref(),
3936
+ doc=doc,
3937
+ )
3938
+
3939
+ new_ref = item_copy.get_ref()
3940
+ new_refs.append(new_ref)
3941
+
3942
+ return new_refs
3943
+
3944
+ def num_pages(self):
3945
+ """num_pages."""
3946
+ return len(self.pages.values())
3947
+
3948
+ def validate_tree(self, root) -> bool:
3949
+ """validate_tree."""
3950
+ res = []
3951
+ for child_ref in root.children:
3952
+ child = child_ref.resolve(self)
3953
+ if child.parent.resolve(self) != root:
3954
+ return False
3955
+ res.append(self.validate_tree(child))
3956
+
3957
+ return all(res) or len(res) == 0
3958
+
3959
+ def iterate_items(
3960
+ self,
3961
+ root: Optional[NodeItem] = None,
3962
+ with_groups: bool = False,
3963
+ traverse_pictures: bool = False,
3964
+ page_no: Optional[int] = None,
3965
+ included_content_layers: Optional[set[ContentLayer]] = None,
3966
+ _level: int = 0, # fixed parameter, carries through the node nesting level
3967
+ ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
3968
+ """Iterate elements with level."""
3969
+ for item, stack in self._iterate_items_with_stack(
3970
+ root=root,
3971
+ with_groups=with_groups,
3972
+ traverse_pictures=traverse_pictures,
3973
+ page_no=page_no,
3974
+ included_content_layers=included_content_layers,
3975
+ ):
3976
+ yield item, len(stack)
3977
+
3978
    def _iterate_items_with_stack(
        self,
        root: Optional[NodeItem] = None,
        with_groups: bool = False,
        traverse_pictures: bool = False,
        page_no: Optional[int] = None,
        included_content_layers: Optional[set[ContentLayer]] = None,
        _stack: Optional[list[int]] = None,
    ) -> typing.Iterable[Tuple[NodeItem, list[int]]]:  # tuple of node and stack
        """Iterate elements depth-first, yielding each with its index stack.

        The stack holds, per nesting level, the index of the child currently
        being visited. The *same* list object is yielded for every item and is
        mutated as the traversal proceeds, so consumers that need to keep a
        stack must copy it before advancing the generator.

        :param root: Node to start from; ``self.body`` is used when falsy
        :param with_groups: If True, GroupItem nodes are yielded as well
        :param traverse_pictures: If False, children of a PictureItem are
            skipped unless they are referenced by its captions
        :param page_no: If set, DocItem nodes are only yielded when one of
            their provenance entries is on that page
        :param included_content_layers: Content layers to yield;
            DEFAULT_CONTENT_LAYERS is used when None
        :param _stack: Internal; the shared stack handed down by the recursion
        """
        my_layers = (
            included_content_layers
            if included_content_layers is not None
            else DEFAULT_CONTENT_LAYERS
        )
        my_stack: list[int] = _stack if _stack is not None else []

        if not root:
            root = self.body

        # Yield non-group items or group items when with_groups=True

        # Combine conditions to have a single yield point
        should_yield = (
            (not isinstance(root, GroupItem) or with_groups)
            and (
                not isinstance(root, DocItem)
                or (
                    page_no is None
                    or any(prov.page_no == page_no for prov in root.prov)
                )
            )
            and root.content_layer in my_layers
        )

        if should_yield:
            yield root, my_stack

        # Children are traversed even when root itself was filtered out.
        # Open a new stack level; -1 is a placeholder until a child is visited.
        my_stack.append(-1)

        # For a picture root, remember which children are its captions; the
        # walrus also records whether root is a picture for the loop below.
        allowed_pic_refs: set[str] = (
            {r.cref for r in root.captions}
            if (root_is_picture := isinstance(root, PictureItem))
            else set()
        )

        # Traverse children
        for child_ind, child_ref in enumerate(root.children):
            child = child_ref.resolve(self)
            # Skip non-caption children of a picture unless asked to descend.
            if (
                root_is_picture
                and not traverse_pictures
                and isinstance(child, NodeItem)
                and child.self_ref not in allowed_pic_refs
            ):
                continue
            # Record the position of the child at the current stack level.
            my_stack[-1] = child_ind

            if isinstance(child, NodeItem):
                yield from self._iterate_items_with_stack(
                    child,
                    with_groups=with_groups,
                    traverse_pictures=traverse_pictures,
                    page_no=page_no,
                    _stack=my_stack,
                    included_content_layers=my_layers,
                )

        # Close the stack level opened for this node's children.
        my_stack.pop()
4047
+
4048
+ def _clear_picture_pil_cache(self):
4049
+ """Clear cache storage of all images."""
4050
+ for item, level in self.iterate_items(with_groups=False):
4051
+ if isinstance(item, PictureItem):
4052
+ if item.image is not None and item.image._pil is not None:
4053
+ item.image._pil.close()
4054
+
4055
+ def _list_images_on_disk(self) -> List[Path]:
4056
+ """List all images on disk."""
4057
+ result: List[Path] = []
4058
+
4059
+ for item, level in self.iterate_items(with_groups=False):
4060
+ if isinstance(item, PictureItem):
4061
+ if item.image is not None:
4062
+ if (
4063
+ isinstance(item.image.uri, AnyUrl)
4064
+ and item.image.uri.scheme == "file"
4065
+ and item.image.uri.path is not None
4066
+ ):
4067
+ local_path = Path(unquote(item.image.uri.path))
4068
+ result.append(local_path)
4069
+ elif isinstance(item.image.uri, Path):
4070
+ result.append(item.image.uri)
4071
+
4072
+ return result
4073
+
4074
+ def _with_embedded_pictures(self) -> "DoclingDocument":
4075
+ """Document with embedded images.
4076
+
4077
+ Creates a copy of this document where all pictures referenced
4078
+ through a file URI are turned into base64 embedded form.
4079
+ """
4080
+ result: DoclingDocument = copy.deepcopy(self)
4081
+
4082
+ for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)):
4083
+ if isinstance(item, PictureItem):
4084
+
4085
+ if item.image is not None:
4086
+ if (
4087
+ isinstance(item.image.uri, AnyUrl)
4088
+ and item.image.uri.scheme == "file"
4089
+ ):
4090
+ assert isinstance(item.image.uri.path, str)
4091
+ tmp_image = PILImage.open(str(unquote(item.image.uri.path)))
4092
+ item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
4093
+
4094
+ elif isinstance(item.image.uri, Path):
4095
+ tmp_image = PILImage.open(str(item.image.uri))
4096
+ item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
4097
+
4098
+ return result
4099
+
4100
+ def _with_pictures_refs(
4101
+ self,
4102
+ image_dir: Path,
4103
+ page_no: Optional[int],
4104
+ reference_path: Optional[Path] = None,
2968
4105
  ) -> "DoclingDocument":
2969
4106
  """Document with images as refs.
2970
4107
 
@@ -2977,7 +4114,7 @@ class DoclingDocument(BaseModel):
2977
4114
  image_dir.mkdir(parents=True, exist_ok=True)
2978
4115
 
2979
4116
  if image_dir.is_dir():
2980
- for item, level in result.iterate_items(with_groups=False):
4117
+ for item, level in result.iterate_items(page_no=page_no, with_groups=False):
2981
4118
  if isinstance(item, PictureItem):
2982
4119
 
2983
4120
  if (
@@ -3077,7 +4214,7 @@ class DoclingDocument(BaseModel):
3077
4214
  os.makedirs(artifacts_dir, exist_ok=True)
3078
4215
 
3079
4216
  new_doc = self._make_copy_with_refmode(
3080
- artifacts_dir, image_mode, reference_path=reference_path
4217
+ artifacts_dir, image_mode, page_no=None, reference_path=reference_path
3081
4218
  )
3082
4219
 
3083
4220
  out = new_doc.export_to_dict(
@@ -3120,7 +4257,7 @@ class DoclingDocument(BaseModel):
3120
4257
  os.makedirs(artifacts_dir, exist_ok=True)
3121
4258
 
3122
4259
  new_doc = self._make_copy_with_refmode(
3123
- artifacts_dir, image_mode, reference_path=reference_path
4260
+ artifacts_dir, image_mode, page_no=None, reference_path=reference_path
3124
4261
  )
3125
4262
 
3126
4263
  out = new_doc.export_to_dict(
@@ -3156,9 +4293,9 @@ class DoclingDocument(BaseModel):
3156
4293
  """Export to dict."""
3157
4294
  context = {}
3158
4295
  if coord_precision is not None:
3159
- context[_CTX_COORD_PREC] = coord_precision
4296
+ context[PydanticSerCtxKey.COORD_PREC.value] = coord_precision
3160
4297
  if confid_precision is not None:
3161
- context[_CTX_CONFID_PREC] = confid_precision
4298
+ context[PydanticSerCtxKey.CONFID_PREC.value] = confid_precision
3162
4299
  out = self.model_dump(
3163
4300
  mode=mode, by_alias=by_alias, exclude_none=exclude_none, context=context
3164
4301
  )
@@ -3193,7 +4330,7 @@ class DoclingDocument(BaseModel):
3193
4330
  os.makedirs(artifacts_dir, exist_ok=True)
3194
4331
 
3195
4332
  new_doc = self._make_copy_with_refmode(
3196
- artifacts_dir, image_mode, reference_path=reference_path
4333
+ artifacts_dir, image_mode, page_no, reference_path=reference_path
3197
4334
  )
3198
4335
 
3199
4336
  md_out = new_doc.export_to_markdown(
@@ -3369,7 +4506,7 @@ class DoclingDocument(BaseModel):
3369
4506
  os.makedirs(artifacts_dir, exist_ok=True)
3370
4507
 
3371
4508
  new_doc = self._make_copy_with_refmode(
3372
- artifacts_dir, image_mode, reference_path=reference_path
4509
+ artifacts_dir, image_mode, page_no, reference_path=reference_path
3373
4510
  )
3374
4511
 
3375
4512
  html_out = new_doc.export_to_html(
@@ -3408,6 +4545,7 @@ class DoclingDocument(BaseModel):
3408
4545
  self,
3409
4546
  artifacts_dir: Path,
3410
4547
  image_mode: ImageRefMode,
4548
+ page_no: Optional[int],
3411
4549
  reference_path: Optional[Path] = None,
3412
4550
  ):
3413
4551
  new_doc = None
@@ -3415,7 +4553,7 @@ class DoclingDocument(BaseModel):
3415
4553
  new_doc = self
3416
4554
  elif image_mode == ImageRefMode.REFERENCED:
3417
4555
  new_doc = self._with_pictures_refs(
3418
- image_dir=artifacts_dir, reference_path=reference_path
4556
+ image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
3419
4557
  )
3420
4558
  elif image_mode == ImageRefMode.EMBEDDED:
3421
4559
  new_doc = self._with_embedded_pictures()