docling-core 2.40.0__py3-none-any.whl → 2.42.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/base.py +29 -2
- docling_core/types/doc/document.py +1329 -161
- docling_core/types/doc/page.py +22 -2
- {docling_core-2.40.0.dist-info → docling_core-2.42.0.dist-info}/METADATA +1 -1
- {docling_core-2.40.0.dist-info → docling_core-2.42.0.dist-info}/RECORD +9 -9
- {docling_core-2.40.0.dist-info → docling_core-2.42.0.dist-info}/WHEEL +0 -0
- {docling_core-2.40.0.dist-info → docling_core-2.42.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.40.0.dist-info → docling_core-2.42.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.40.0.dist-info → docling_core-2.42.0.dist-info}/top_level.txt +0 -0
|
@@ -26,8 +26,10 @@ from pydantic import (
|
|
|
26
26
|
BaseModel,
|
|
27
27
|
ConfigDict,
|
|
28
28
|
Field,
|
|
29
|
+
FieldSerializationInfo,
|
|
29
30
|
StringConstraints,
|
|
30
31
|
computed_field,
|
|
32
|
+
field_serializer,
|
|
31
33
|
field_validator,
|
|
32
34
|
model_validator,
|
|
33
35
|
validate_call,
|
|
@@ -38,7 +40,12 @@ from typing_extensions import Annotated, Self, deprecated
|
|
|
38
40
|
from docling_core.search.package import VERSION_PATTERN
|
|
39
41
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
40
42
|
from docling_core.types.doc import BoundingBox, Size
|
|
41
|
-
from docling_core.types.doc.base import
|
|
43
|
+
from docling_core.types.doc.base import (
|
|
44
|
+
CoordOrigin,
|
|
45
|
+
ImageRefMode,
|
|
46
|
+
PydanticSerCtxKey,
|
|
47
|
+
round_pydantic_float,
|
|
48
|
+
)
|
|
42
49
|
from docling_core.types.doc.labels import (
|
|
43
50
|
CodeLanguageLabel,
|
|
44
51
|
DocItemLabel,
|
|
@@ -98,6 +105,10 @@ class PictureClassificationClass(BaseModel):
|
|
|
98
105
|
class_name: str
|
|
99
106
|
confidence: float
|
|
100
107
|
|
|
108
|
+
@field_serializer("confidence")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
    """Serialize the ``confidence`` field through ``round_pydantic_float``.

    :param value: float: The confidence value being serialized.
    :param info: FieldSerializationInfo: Serialization info; its ``context`` is forwarded to the rounding helper.

    :returns: float: The value returned by ``round_pydantic_float`` for the ``CONFID_PREC`` context key.
    """
    ser_context = info.context
    precision_key = PydanticSerCtxKey.CONFID_PREC
    return round_pydantic_float(value, ser_context, precision_key)
|
|
111
|
+
|
|
101
112
|
|
|
102
113
|
class PictureClassificationData(BaseAnnotation):
|
|
103
114
|
"""PictureClassificationData."""
|
|
@@ -125,6 +136,10 @@ class PictureMoleculeData(BaseAnnotation):
|
|
|
125
136
|
segmentation: List[Tuple[float, float]]
|
|
126
137
|
provenance: str
|
|
127
138
|
|
|
139
|
+
@field_serializer("confidence")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
    """Serialize the ``confidence`` field through ``round_pydantic_float``.

    :param value: float: The confidence value being serialized.
    :param info: FieldSerializationInfo: Serialization info; its ``context`` is forwarded to the rounding helper.

    :returns: float: The value returned by ``round_pydantic_float`` for the ``CONFID_PREC`` context key.
    """
    ser_context = info.context
    precision_key = PydanticSerCtxKey.CONFID_PREC
    return round_pydantic_float(value, ser_context, precision_key)
|
|
142
|
+
|
|
128
143
|
|
|
129
144
|
class MiscAnnotation(BaseAnnotation):
|
|
130
145
|
"""MiscAnnotation."""
|
|
@@ -366,6 +381,145 @@ class TableData(BaseModel): # TBD
|
|
|
366
381
|
|
|
367
382
|
return table_data
|
|
368
383
|
|
|
384
|
+
def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
    """Remove rows from the table by their indices.

    Every index is validated before the table is touched, so an invalid
    index leaves the table unchanged. Duplicate indices are collapsed, so
    each requested row is removed exactly once.

    The table cells are treated as a dense row-major list with ``num_cols``
    cells per row; after removal the row offsets of all remaining cells are
    reassigned from their position in that list.

    :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)

    :return: List[List[TableCell]]: A list representation of the removed rows as lists of TableCell objects,
        ordered from the highest removed index to the lowest.

    :raises IndexError: If any index is negative or not smaller than the current number of rows.
    """
    if not indices:
        return []

    # Highest index first: removing a row then never shifts a row that is
    # still waiting to be removed. ``set`` drops duplicates, which would
    # otherwise delete whichever row slid into the freed slot.
    unique_indices = sorted(set(indices), reverse=True)

    # Validate everything up front so a bad index cannot leave the table
    # half-modified.
    for row_index in unique_indices:
        if row_index < 0 or row_index >= self.num_rows:
            raise IndexError(
                f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}."
            )

    all_removed_cells = []
    for row_index in unique_indices:
        start_idx = row_index * self.num_cols
        end_idx = start_idx + self.num_cols
        all_removed_cells.append(self.table_cells[start_idx:end_idx])

        # Remove the cells from the table and update the number of rows
        self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
        self.num_rows -= 1

    # Reassign row offset indices for the remaining cells (once, after all
    # removals; the final offsets depend only on the final cell order).
    for index, cell in enumerate(self.table_cells):
        new_index = index // self.num_cols
        cell.start_row_offset_idx = new_index
        cell.end_row_offset_idx = new_index + 1

    return all_removed_cells
|
|
422
|
+
|
|
423
|
+
def pop_row(self) -> List[TableCell]:
    """Remove the last row of the table and hand it back.

    :returns: List[TableCell]: The TableCell objects of the row that was removed.

    :raises IndexError: If the table has no rows.
    """
    row_count = self.num_rows
    if row_count == 0:
        raise IndexError("Cannot pop from an empty table.")

    last_row_index = row_count - 1
    return self.remove_row(last_row_index)
|
|
432
|
+
|
|
433
|
+
def remove_row(self, row_index: int) -> List[TableCell]:
    """Remove a single row, identified by its index, from the table.

    Delegates to ``remove_rows`` with a one-element index list.

    :param row_index: int: The index of the row to remove. (Starting from 0)

    :returns: List[TableCell]: The TableCell objects of the removed row.
    """
    removed_rows = self.remove_rows([row_index])
    return removed_rows[0]
|
|
441
|
+
|
|
442
|
+
def insert_rows(
    self, row_index: int, rows: List[List[str]], after: bool = False
) -> None:
    """Insert several new rows of strings before/after a specific index in the table.

    Each row is inserted at the same position via ``insert_row``; the rows
    are walked back-to-front so they end up in the order they were given.

    :param row_index: int: The index at which to insert the new rows. (Starting from 0)
    :param rows: List[List[str]]: A list of lists, where each inner list represents the content of a new row.
    :param after: bool: If True, insert the rows after the specified index, otherwise before it. (Default is False)

    :returns: None
    """
    back_to_front = rows[::-1]
    for row_content in back_to_front:
        self.insert_row(row_index, row_content, after)
|
|
457
|
+
|
|
458
|
+
def insert_row(self, row_index: int, row: List[str], after: bool = False) -> None:
    """Insert one new row of strings before/after a specific index in the table.

    The new cells are spliced into the row-major cell list, after which the
    row offsets of all cells are reassigned from their list position.

    :param row_index: int: The index at which to insert the new row. (Starting from 0)
    :param row: List[str]: A list of strings representing the content of the new row.
    :param after: bool: If True, insert the row after the specified index, otherwise before it. (Default is False)

    :returns: None

    :raises ValueError: If the row length differs from the number of columns.
    :raises IndexError: If the resulting insertion position lies outside ``0..num_rows``.
    """
    if len(row) != self.num_cols:
        raise ValueError(
            f"Row length {len(row)} does not match the number of columns {self.num_cols}."
        )

    # Position the new row will occupy once inserted.
    target_row = row_index + 1 if after else row_index

    if not 0 <= target_row <= self.num_rows:
        raise IndexError(
            f"Row index {row_index} is out of bounds for the current number of rows {self.num_rows}."
        )

    fresh_cells = []
    for col, cell_text in enumerate(row):
        fresh_cells.append(
            TableCell(
                text=cell_text,
                start_row_offset_idx=target_row,
                end_row_offset_idx=target_row + 1,
                start_col_offset_idx=col,
                end_col_offset_idx=col + 1,
            )
        )

    split_at = target_row * self.num_cols
    cells_before = self.table_cells[:split_at]
    cells_after = self.table_cells[split_at:]
    self.table_cells = cells_before + fresh_cells + cells_after

    # Reassign row offset indices for all cells from their list position
    for position, cell in enumerate(self.table_cells):
        row_of_cell = position // self.num_cols
        cell.start_row_offset_idx = row_of_cell
        cell.end_row_offset_idx = row_of_cell + 1

    self.num_rows += 1
|
|
503
|
+
|
|
504
|
+
def add_rows(self, rows: List[List[str]]) -> None:
    """Append several new rows of strings to the table, one ``add_row`` call per row.

    :param rows: List[List[str]]: A list of lists, where each inner list represents the content of a new row.

    :returns: None
    """
    for row_content in rows:
        self.add_row(row_content)
|
|
513
|
+
|
|
514
|
+
def add_row(self, row: List[str]) -> None:
    """Append a new row of strings to the table.

    Implemented as an ``insert_row`` after the current last row index.

    :param row: List[str]: A list of strings representing the content of the new row.

    :returns: None
    """
    last_row_index = self.num_rows - 1
    self.insert_row(row_index=last_row_index, row=row, after=True)
|
|
522
|
+
|
|
369
523
|
def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
|
|
370
524
|
"""Get the minimal bounding box for each row in the table.
|
|
371
525
|
|
|
@@ -822,7 +976,7 @@ class NodeItem(BaseModel):
|
|
|
822
976
|
after: bool = True,
|
|
823
977
|
) -> bool:
|
|
824
978
|
"""Add sibling node in tree."""
|
|
825
|
-
if len(stack) == 1 and stack[0]
|
|
979
|
+
if len(stack) == 1 and stack[0] <= len(self.children) and (not after):
|
|
826
980
|
# ensure the parent is correct
|
|
827
981
|
new_item = new_ref.resolve(doc=doc)
|
|
828
982
|
new_item.parent = self.get_ref()
|
|
@@ -1958,6 +2112,16 @@ class DoclingDocument(BaseModel):
|
|
|
1958
2112
|
item.self_ref = cref
|
|
1959
2113
|
item.parent = parent_ref
|
|
1960
2114
|
|
|
2115
|
+
self.groups.append(item)
|
|
2116
|
+
elif isinstance(item, GroupItem):
|
|
2117
|
+
item_label = "groups"
|
|
2118
|
+
item_index = len(self.groups)
|
|
2119
|
+
|
|
2120
|
+
cref = f"#/{item_label}/{item_index}"
|
|
2121
|
+
|
|
2122
|
+
item.self_ref = cref
|
|
2123
|
+
item.parent = parent_ref
|
|
2124
|
+
|
|
1961
2125
|
self.groups.append(item)
|
|
1962
2126
|
|
|
1963
2127
|
else:
|
|
@@ -1976,7 +2140,7 @@ class DoclingDocument(BaseModel):
|
|
|
1976
2140
|
item_index = int(path[2])
|
|
1977
2141
|
|
|
1978
2142
|
if (
|
|
1979
|
-
len(self.__getattribute__(item_label)) + 1
|
|
2143
|
+
len(self.__getattribute__(item_label)) == item_index + 1
|
|
1980
2144
|
): # we can only pop the last item
|
|
1981
2145
|
del self.__getattribute__(item_label)[item_index]
|
|
1982
2146
|
else:
|
|
@@ -2001,6 +2165,10 @@ class DoclingDocument(BaseModel):
|
|
|
2001
2165
|
if not success:
|
|
2002
2166
|
self._pop_item(item=item)
|
|
2003
2167
|
|
|
2168
|
+
raise ValueError(
|
|
2169
|
+
f"Could not insert item: {item} under parent: {parent_ref.resolve(doc=self)}"
|
|
2170
|
+
)
|
|
2171
|
+
|
|
2004
2172
|
return item.get_ref()
|
|
2005
2173
|
|
|
2006
2174
|
def _delete_items(self, refs: list[RefItem]):
|
|
@@ -2380,17 +2548,6 @@ class DoclingDocument(BaseModel):
|
|
|
2380
2548
|
hyperlink=hyperlink,
|
|
2381
2549
|
)
|
|
2382
2550
|
|
|
2383
|
-
elif label in [DocItemLabel.TITLE]:
|
|
2384
|
-
return self.add_title(
|
|
2385
|
-
text=text,
|
|
2386
|
-
orig=orig,
|
|
2387
|
-
prov=prov,
|
|
2388
|
-
parent=parent,
|
|
2389
|
-
content_layer=content_layer,
|
|
2390
|
-
formatting=formatting,
|
|
2391
|
-
hyperlink=hyperlink,
|
|
2392
|
-
)
|
|
2393
|
-
|
|
2394
2551
|
elif label in [DocItemLabel.SECTION_HEADER]:
|
|
2395
2552
|
return self.add_heading(
|
|
2396
2553
|
text=text,
|
|
@@ -2790,177 +2947,1171 @@ class DoclingDocument(BaseModel):
|
|
|
2790
2947
|
|
|
2791
2948
|
return form_item
|
|
2792
2949
|
|
|
2793
|
-
|
|
2794
|
-
|
|
2795
|
-
|
|
2950
|
+
# ---------------------------
|
|
2951
|
+
# Node Item Insertion Methods
|
|
2952
|
+
# ---------------------------
|
|
2796
2953
|
|
|
2797
|
-
def
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
for
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
return False
|
|
2804
|
-
res.append(self.validate_tree(child))
|
|
2954
|
+
def _get_insertion_stack_and_parent(
    self, sibling: NodeItem
) -> tuple[list[int], RefItem]:
    """Look up the stack of a sibling and the reference of its parent for an insertion.

    :param sibling: NodeItem: The existing item next to which a new item is to be inserted.

    :returns: tuple[list[int], RefItem]: The sibling's stack and the parent reference found at that stack.

    :raises ValueError: If the sibling's stack cannot be found, or no parent is found at that stack.
    """
    ref_of_sibling = sibling.get_ref()

    found, sibling_stack = self._get_stack_of_refitem(ref=ref_of_sibling)
    if not found:
        raise ValueError(
            f"Could not insert at {ref_of_sibling.cref}: could not find the stack"
        )

    ref_of_parent = self.body._get_parent_ref(doc=self, stack=sibling_stack)
    if ref_of_parent is None:
        raise ValueError(f"Could not find a parent at stack: {sibling_stack}")

    return sibling_stack, ref_of_parent
|
|
2848
2975
|
|
|
2849
|
-
|
|
2850
|
-
|
|
2851
|
-
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
|
|
2855
|
-
|
|
2856
|
-
|
|
2857
|
-
|
|
2858
|
-
|
|
2859
|
-
|
|
2860
|
-
)
|
|
2976
|
+
def _insert_in_structure(
    self,
    item: NodeItem,
    stack: list[int],
    after: bool,
    created_parent: Optional[bool] = False,
) -> None:
    """Insert item into the document structure at the specified stack and handle errors.

    The item is first appended via ``_append_item`` and then attached as a
    sibling at ``stack`` via ``self.body._add_sibling``. If attaching fails,
    the append is rolled back with ``_pop_item`` and, when ``created_parent``
    is set, the item's parent is deleted as well.

    :param item: NodeItem: The item to insert. If it has no parent, the body reference is used.
    :param stack: list[int]: The stack at which the item is added as a sibling.
    :param after: bool: Forwarded to ``_add_sibling``.
    :param created_parent: Optional[bool]: If truthy, the item's parent is deleted when the insertion fails. (Default value = False)

    :returns: None

    :raises ValueError: If the item could not be added as a sibling at ``stack``.
    """
    # Ensure the item has a parent reference
    if item.parent is None:
        item.parent = self.body.get_ref()

    self._append_item(item=item, parent_ref=item.parent)

    new_ref = item.get_ref()

    success = self.body._add_sibling(
        doc=self, stack=stack, new_ref=new_ref, after=after
    )

    if success:
        return

    # Roll back the append before reporting the failure.
    self._pop_item(item=item)

    # Build the message while the parent can still be resolved: when
    # created_parent is set, the parent is deleted just below, and resolving
    # its reference afterwards is not reliable.
    error_message = f"Could not insert item: {item} under parent: {item.parent.resolve(doc=self)}"

    if created_parent:
        self.delete_items(node_items=[item.parent.resolve(self)])

    raise ValueError(error_message)
|
|
2920
3006
|
|
|
2921
|
-
|
|
3007
|
+
def insert_list_group(
    self,
    sibling: NodeItem,
    name: Optional[str] = None,
    content_layer: Optional[ContentLayer] = None,
    after: bool = True,
) -> ListGroup:
    """Create a new ListGroup and insert it into the document next to a sibling.

    :param sibling: NodeItem: The existing item next to which the group is inserted.
    :param name: Optional[str]: Name assigned to the group when not None. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the group when given. (Default value = None)
    :param after: bool: Forwarded to the insertion step. (Default value = True)

    :returns: ListGroup: The newly created ListGroup item.
    """
    insertion_stack, ref_of_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    list_group = ListGroup(self_ref="#", parent=ref_of_parent)

    # Only override the optional attributes that were actually supplied.
    if name is not None:
        list_group.name = name
    if content_layer:
        list_group.content_layer = content_layer

    self._insert_in_structure(item=list_group, stack=insertion_stack, after=after)

    return list_group
|
|
2948
3036
|
|
|
2949
|
-
def
|
|
2950
|
-
self,
|
|
2951
|
-
|
|
2952
|
-
|
|
3037
|
+
def insert_inline_group(
    self,
    sibling: NodeItem,
    name: Optional[str] = None,
    content_layer: Optional[ContentLayer] = None,
    after: bool = True,
) -> InlineGroup:
    """Create a new InlineGroup and insert it into the document next to a sibling.

    :param sibling: NodeItem: The existing item next to which the group is inserted.
    :param name: Optional[str]: Name assigned to the group when not None. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the group when given. (Default value = None)
    :param after: bool: Forwarded to the insertion step. (Default value = True)

    :returns: InlineGroup: The newly created InlineGroup item.
    """
    insertion_stack, ref_of_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    inline_group = InlineGroup(self_ref="#", parent=ref_of_parent)

    # Only override the optional attributes that were actually supplied.
    if name is not None:
        inline_group.name = name
    if content_layer:
        inline_group.content_layer = content_layer

    self._insert_in_structure(item=inline_group, stack=insertion_stack, after=after)

    return inline_group
|
|
3067
|
+
|
|
3068
|
+
def insert_group(
    self,
    sibling: NodeItem,
    label: Optional[GroupLabel] = None,
    name: Optional[str] = None,
    content_layer: Optional[ContentLayer] = None,
    after: bool = True,
) -> GroupItem:
    """Create a new group and insert it into the document next to a sibling.

    List labels (``LIST``, ``ORDERED_LIST``) are routed to ``insert_list_group``
    and the ``INLINE`` label to ``insert_inline_group``; any other label
    (or none) produces a plain GroupItem.

    :param sibling: NodeItem: The existing item next to which the group is inserted.
    :param label: Optional[GroupLabel]: Label selecting the kind of group. (Default value = None)
    :param name: Optional[str]: Name assigned to the group when not None. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the group when given. (Default value = None)
    :param after: bool: Forwarded to the insertion step. (Default value = True)

    :returns: GroupItem: The newly created GroupItem.
    """
    if label == GroupLabel.LIST or label == GroupLabel.ORDERED_LIST:
        return self.insert_list_group(
            sibling=sibling,
            name=name,
            content_layer=content_layer,
            after=after,
        )

    if label == GroupLabel.INLINE:
        return self.insert_inline_group(
            sibling=sibling,
            name=name,
            content_layer=content_layer,
            after=after,
        )

    insertion_stack, ref_of_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    generic_group = GroupItem(self_ref="#", parent=ref_of_parent)

    # Only override the optional attributes that were actually supplied.
    if name is not None:
        generic_group.name = name
    if label is not None:
        generic_group.label = label
    if content_layer:
        generic_group.content_layer = content_layer

    self._insert_in_structure(item=generic_group, stack=insertion_stack, after=after)

    return generic_group
|
|
3117
|
+
|
|
3118
|
+
def insert_list_item(
    self,
    sibling: NodeItem,
    text: str,
    enumerated: bool = False,
    marker: Optional[str] = None,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> ListItem:
    """Create a new ListItem and insert it into the document next to a sibling.

    If the sibling's parent is not a ListGroup, a warning is emitted, a new
    ListGroup is inserted next to the sibling, and the list item becomes the
    first child of that new group.

    :param sibling: NodeItem: The existing item next to which the list item is inserted.
    :param text: str: Text of the list item.
    :param enumerated: bool: (Default value = False)
    :param marker: Optional[str]: Marker of the item; an empty string is used when not given. (Default value = None)
    :param orig: Optional[str]: Original text; falls back to ``text`` when empty. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance appended to the item when given. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the item when given. (Default value = None)
    :param formatting: Optional[Formatting]: (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
    :param after: bool: (Default value = True)

    :returns: ListItem: The newly created ListItem item.
    """
    insertion_stack, ref_of_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    resolved_parent = ref_of_parent.resolve(self)
    made_parent = False

    if not isinstance(resolved_parent, ListGroup):
        warnings.warn(
            "ListItem parent must be a ListGroup, creating one on the fly.",
            DeprecationWarning,
        )
        resolved_parent = self.insert_list_group(sibling=sibling, after=after)
        ref_of_parent = resolved_parent.get_ref()

        # The new group sits next to the sibling; when it was placed after
        # the sibling, advance the last stack entry before descending.
        if after:
            insertion_stack[-1] += 1
        # Descend into the new group and insert at its first position.
        insertion_stack.append(0)
        after = False
        made_parent = True

    orig = orig or text

    new_list_item = ListItem(
        text=text,
        orig=orig,
        self_ref="#",
        parent=ref_of_parent,
        enumerated=enumerated,
        marker=marker or "",
        formatting=formatting,
        hyperlink=hyperlink,
    )

    if prov:
        new_list_item.prov.append(prov)
    if content_layer:
        new_list_item.content_layer = content_layer

    self._insert_in_structure(
        item=new_list_item,
        stack=insertion_stack,
        after=after,
        created_parent=made_parent,
    )

    return new_list_item
|
|
3192
|
+
|
|
3193
|
+
def insert_text(
    self,
    sibling: NodeItem,
    label: DocItemLabel,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> TextItem:
    """Create a new text item and insert it into the document next to a sibling.

    Labels with a dedicated insertion method (title, list item, section
    header, code, formula) are forwarded to that method with the same
    arguments; every other label produces a plain TextItem.

    :param sibling: NodeItem: The existing item next to which the text is inserted.
    :param label: DocItemLabel: Label of the new item; also selects the insertion method.
    :param text: str: Text of the item.
    :param orig: Optional[str]: Original text; falls back to ``text`` when empty. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance appended to the item when given. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the item when given. (Default value = None)
    :param formatting: Optional[Formatting]: (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: (Default value = None)
    :param after: bool: (Default value = True)

    :returns: TextItem: The newly created TextItem item.
    """
    # Arguments shared by all the specialised insertion methods.
    forwarded = dict(
        sibling=sibling,
        text=text,
        orig=orig,
        prov=prov,
        content_layer=content_layer,
        formatting=formatting,
        hyperlink=hyperlink,
        after=after,
    )

    if label == DocItemLabel.TITLE:
        return self.insert_title(**forwarded)

    if label == DocItemLabel.LIST_ITEM:
        return self.insert_list_item(**forwarded)

    if label == DocItemLabel.SECTION_HEADER:
        return self.insert_heading(**forwarded)

    if label == DocItemLabel.CODE:
        return self.insert_code(**forwarded)

    if label == DocItemLabel.FORMULA:
        return self.insert_formula(**forwarded)

    # Generic case: build a plain TextItem.
    insertion_stack, ref_of_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    orig = orig or text

    new_text_item = TextItem(
        label=label,
        text=text,
        orig=orig,
        self_ref="#",
        parent=ref_of_parent,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    if prov:
        new_text_item.prov.append(prov)
    if content_layer:
        new_text_item.content_layer = content_layer

    self._insert_in_structure(item=new_text_item, stack=insertion_stack, after=after)

    return new_text_item
|
|
3305
|
+
|
|
3306
|
+
def insert_table(
    self,
    sibling: NodeItem,
    data: TableData,
    caption: Optional[Union[TextItem, RefItem]] = None,
    prov: Optional[ProvenanceItem] = None,
    label: DocItemLabel = DocItemLabel.TABLE,
    content_layer: Optional[ContentLayer] = None,
    annotations: Optional[list[TableAnnotationType]] = None,
    after: bool = True,
) -> TableItem:
    """Create a new TableItem and insert it into the document next to a sibling.

    :param sibling: NodeItem: The existing item next to which the table is inserted.
    :param data: TableData: Table content of the new item.
    :param caption: Optional[Union[TextItem, RefItem]]: Caption whose reference is appended to the table's captions. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance appended to the item when given. (Default value = None)
    :param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the item when given. (Default value = None)
    :param annotations: Optional[List[TableAnnotationType]]: An empty list is used when not given. (Default value = None)
    :param after: bool: (Default value = True)

    :returns: TableItem: The newly created TableItem item.
    """
    insertion_stack, ref_of_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    # Create a new TableItem NodeItem
    new_table = TableItem(
        label=label,
        data=data,
        self_ref="#",
        parent=ref_of_parent,
        annotations=annotations or [],
    )

    if prov:
        new_table.prov.append(prov)
    if content_layer:
        new_table.content_layer = content_layer
    if caption:
        caption_ref = caption.get_ref()
        new_table.captions.append(caption_ref)

    self._insert_in_structure(item=new_table, stack=insertion_stack, after=after)

    return new_table
|
|
3352
|
+
|
|
3353
|
+
def insert_picture(
    self,
    sibling: NodeItem,
    annotations: Optional[List[PictureDataType]] = None,
    image: Optional[ImageRef] = None,
    caption: Optional[Union[TextItem, RefItem]] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    after: bool = True,
) -> PictureItem:
    """Build a PictureItem and insert it next to an existing node.

    :param sibling: NodeItem: Anchor node; the picture receives the parent
        reference obtained for this sibling
    :param annotations: Optional[List[PictureDataType]]: Annotations stored on
        the picture; an empty list is used when omitted (Default value = None)
    :param image: Optional[ImageRef]: Image reference of the picture
        (Default value = None)
    :param caption: Optional[Union[TextItem, RefItem]]: Item whose reference is
        recorded as a caption (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        picture (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the
        picture when given (Default value = None)
    :param after: bool: Forwarded to the structure insertion to choose the side
        of the sibling (Default value = True)

    :returns: PictureItem: The newly created PictureItem item.
    """
    # Resolve where the sibling sits and which parent the new item shares.
    insertion_stack, anchor_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    # self_ref="#" is a placeholder here; presumably the real reference is
    # assigned during insertion (confirm in _insert_in_structure).
    new_picture = PictureItem(
        label=DocItemLabel.PICTURE,
        annotations=[] if not annotations else annotations,
        image=image,
        self_ref="#",
        parent=anchor_parent,
    )

    # Optional attributes are only touched when a truthy value was passed.
    if prov:
        new_picture.prov.append(prov)
    if content_layer:
        new_picture.content_layer = content_layer
    if caption:
        caption_ref = caption.get_ref()
        new_picture.captions.append(caption_ref)

    self._insert_in_structure(item=new_picture, stack=insertion_stack, after=after)
    return new_picture
|
|
3397
|
+
|
|
3398
|
+
def insert_title(
    self,
    sibling: NodeItem,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> TitleItem:
    """Build a TitleItem and insert it next to an existing node.

    :param sibling: NodeItem: Anchor node; the title receives the parent
        reference obtained for this sibling
    :param text: str: Text of the title
    :param orig: Optional[str]: Original text; falls back to ``text`` when
        missing or empty (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        title (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the
        title when given (Default value = None)
    :param formatting: Optional[Formatting]: Formatting passed to the item
        (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: Hyperlink passed to the
        item (Default value = None)
    :param after: bool: Forwarded to the structure insertion to choose the side
        of the sibling (Default value = True)

    :returns: TitleItem: The newly created TitleItem item.
    """
    # Resolve where the sibling sits and which parent the new item shares.
    insertion_stack, anchor_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    # A missing or empty original text mirrors the visible text.
    original_text = orig or text

    new_title = TitleItem(
        text=text,
        orig=original_text,
        self_ref="#",
        parent=anchor_parent,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    if prov:
        new_title.prov.append(prov)
    if content_layer:
        new_title.content_layer = content_layer

    self._insert_in_structure(item=new_title, stack=insertion_stack, after=after)
    return new_title
|
|
3446
|
+
|
|
3447
|
+
def insert_code(
    self,
    sibling: NodeItem,
    text: str,
    code_language: Optional[CodeLanguageLabel] = None,
    orig: Optional[str] = None,
    caption: Optional[Union[TextItem, RefItem]] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> CodeItem:
    """Build a CodeItem and insert it next to an existing node.

    :param sibling: NodeItem: Anchor node; the code item receives the parent
        reference obtained for this sibling
    :param text: str: Code text
    :param code_language: Optional[CodeLanguageLabel]: Language label set on
        the item when given (Default value = None)
    :param orig: Optional[str]: Original text; falls back to ``text`` when
        missing or empty (Default value = None)
    :param caption: Optional[Union[TextItem, RefItem]]: Item whose reference is
        recorded as a caption (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        item (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the
        item when given (Default value = None)
    :param formatting: Optional[Formatting]: Formatting passed to the item
        (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: Hyperlink passed to the
        item (Default value = None)
    :param after: bool: Forwarded to the structure insertion to choose the side
        of the sibling (Default value = True)

    :returns: CodeItem: The newly created CodeItem item.
    """
    # Resolve where the sibling sits and which parent the new item shares.
    insertion_stack, anchor_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    # A missing or empty original text mirrors the visible text.
    original_text = orig or text

    new_code = CodeItem(
        text=text,
        orig=original_text,
        self_ref="#",
        parent=anchor_parent,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    # Optional attributes are only touched when a truthy value was passed.
    if code_language:
        new_code.code_language = code_language
    if content_layer:
        new_code.content_layer = content_layer
    if prov:
        new_code.prov.append(prov)
    if caption:
        caption_ref = caption.get_ref()
        new_code.captions.append(caption_ref)

    self._insert_in_structure(item=new_code, stack=insertion_stack, after=after)
    return new_code
|
|
3503
|
+
|
|
3504
|
+
def insert_formula(
    self,
    sibling: NodeItem,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> FormulaItem:
    """Build a FormulaItem and insert it next to an existing node.

    :param sibling: NodeItem: Anchor node; the formula receives the parent
        reference obtained for this sibling
    :param text: str: Formula text
    :param orig: Optional[str]: Original text; falls back to ``text`` when
        missing or empty (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        formula (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the
        formula when given (Default value = None)
    :param formatting: Optional[Formatting]: Formatting passed to the item
        (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: Hyperlink passed to the
        item (Default value = None)
    :param after: bool: Forwarded to the structure insertion to choose the side
        of the sibling (Default value = True)

    :returns: FormulaItem: The newly created FormulaItem item.
    """
    # Resolve where the sibling sits and which parent the new item shares.
    insertion_stack, anchor_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    # A missing or empty original text mirrors the visible text.
    original_text = orig or text

    new_formula = FormulaItem(
        text=text,
        orig=original_text,
        self_ref="#",
        parent=anchor_parent,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    if prov:
        new_formula.prov.append(prov)
    if content_layer:
        new_formula.content_layer = content_layer

    self._insert_in_structure(item=new_formula, stack=insertion_stack, after=after)
    return new_formula
|
|
3552
|
+
|
|
3553
|
+
def insert_heading(
    self,
    sibling: NodeItem,
    text: str,
    orig: Optional[str] = None,
    level: LevelNumber = 1,
    prov: Optional[ProvenanceItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
    after: bool = True,
) -> SectionHeaderItem:
    """Build a SectionHeaderItem and insert it next to an existing node.

    :param sibling: NodeItem: Anchor node; the heading receives the parent
        reference obtained for this sibling
    :param text: str: Heading text
    :param orig: Optional[str]: Original text; falls back to ``text`` when
        missing or empty (Default value = None)
    :param level: LevelNumber: Heading level stored on the item
        (Default value = 1)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        heading (Default value = None)
    :param content_layer: Optional[ContentLayer]: Content layer assigned to the
        heading when given (Default value = None)
    :param formatting: Optional[Formatting]: Formatting passed to the item
        (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: Hyperlink passed to the
        item (Default value = None)
    :param after: bool: Forwarded to the structure insertion to choose the side
        of the sibling (Default value = True)

    :returns: SectionHeaderItem: The newly created SectionHeaderItem item.
    """
    # Resolve where the sibling sits and which parent the new item shares.
    insertion_stack, anchor_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    # A missing or empty original text mirrors the visible text.
    original_text = orig or text

    new_heading = SectionHeaderItem(
        level=level,
        text=text,
        orig=original_text,
        self_ref="#",
        parent=anchor_parent,
        formatting=formatting,
        hyperlink=hyperlink,
    )

    if prov:
        new_heading.prov.append(prov)
    if content_layer:
        new_heading.content_layer = content_layer

    self._insert_in_structure(item=new_heading, stack=insertion_stack, after=after)
    return new_heading
|
|
3604
|
+
|
|
3605
|
+
def insert_key_values(
    self,
    sibling: NodeItem,
    graph: GraphData,
    prov: Optional[ProvenanceItem] = None,
    after: bool = True,
) -> KeyValueItem:
    """Build a KeyValueItem and insert it next to an existing node.

    :param sibling: NodeItem: Anchor node; the new item receives the parent
        reference obtained for this sibling
    :param graph: GraphData: Graph data stored on the item
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        item (Default value = None)
    :param after: bool: Forwarded to the structure insertion to choose the side
        of the sibling (Default value = True)

    :returns: KeyValueItem: The newly created KeyValueItem item.
    """
    # Resolve where the sibling sits and which parent the new item shares.
    insertion_stack, anchor_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    new_key_values = KeyValueItem(
        graph=graph,
        self_ref="#",
        parent=anchor_parent,
    )

    if prov:
        new_key_values.prov.append(prov)

    self._insert_in_structure(
        item=new_key_values, stack=insertion_stack, after=after
    )
    return new_key_values
|
|
3633
|
+
|
|
3634
|
+
def insert_form(
    self,
    sibling: NodeItem,
    graph: GraphData,
    prov: Optional[ProvenanceItem] = None,
    after: bool = True,
) -> FormItem:
    """Build a FormItem and insert it next to an existing node.

    :param sibling: NodeItem: Anchor node; the form receives the parent
        reference obtained for this sibling
    :param graph: GraphData: Graph data stored on the form
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        form (Default value = None)
    :param after: bool: Forwarded to the structure insertion to choose the side
        of the sibling (Default value = True)

    :returns: FormItem: The newly created FormItem item.
    """
    # Resolve where the sibling sits and which parent the new item shares.
    insertion_stack, anchor_parent = self._get_insertion_stack_and_parent(
        sibling=sibling
    )

    new_form = FormItem(
        graph=graph,
        self_ref="#",
        parent=anchor_parent,
    )

    if prov:
        new_form.prov.append(prov)

    self._insert_in_structure(item=new_form, stack=insertion_stack, after=after)
    return new_form
|
|
3662
|
+
|
|
3663
|
+
# ---------------------------
|
|
3664
|
+
# Range Manipulation Methods
|
|
3665
|
+
# ---------------------------
|
|
3666
|
+
|
|
3667
|
+
def delete_items_range(
    self,
    *,
    start: NodeItem,
    end: NodeItem,
    start_inclusive: bool = True,
    end_inclusive: bool = True,
) -> None:
    """Delete the sibling NodeItems (with their children) between two nodes.

    :param start: NodeItem: First node of the range
    :param end: NodeItem: Last node of the range; must share the parent of
        ``start``
    :param start_inclusive: bool: (Default value = True): If True, the start
        NodeItem is deleted as well
    :param end_inclusive: bool: (Default value = True): If True, the end
        NodeItem is deleted as well

    :raises ValueError: If the two nodes have different parents or ``start``
        is positioned after ``end`` among the parent's children

    :returns: None
    """
    # Nodes without an explicit parent are looked up under the document body.
    first_parent_ref = self.body.get_ref() if start.parent is None else start.parent
    last_parent_ref = self.body.get_ref() if end.parent is None else end.parent

    if start.parent != end.parent:
        raise ValueError(
            "Start and end NodeItems must have the same parent to delete a range."
        )

    first_ref, last_ref = start.get_ref(), end.get_ref()

    first_parent = first_parent_ref.resolve(doc=self)
    last_parent = last_parent_ref.resolve(doc=self)

    low = first_parent.children.index(first_ref)
    high = last_parent.children.index(last_ref)

    if low > high:
        raise ValueError(
            "Start NodeItem must come before or be the same as the end NodeItem in the document structure."
        )

    # Exclusive ends shrink the slice by one position on the respective side;
    # an emptied range (same node, start excluded) simply deletes nothing.
    slice_from = low if start_inclusive else low + 1
    slice_to = high + 1 if end_inclusive else high
    doomed_refs = first_parent.children[slice_from:slice_to]

    self._delete_items(refs=doomed_refs)
|
|
3716
|
+
|
|
3717
|
+
def extract_items_range(
    self,
    *,
    start: NodeItem,
    end: NodeItem,
    start_inclusive: bool = True,
    end_inclusive: bool = True,
    delete: bool = False,
) -> "DoclingDocument":
    """Extracts NodeItems and children in the range from the start NodeItem to the end as a new DoclingDocument.

    The order of ``start`` and ``end`` is validated on their actual positions
    among the parent's children, independently of the inclusive flags, so the
    accepted ranges are the same as for ``delete_items_range``. A range that
    becomes empty once exclusive ends are removed yields an empty document.

    :param start: NodeItem: The starting NodeItem of the range
    :param end: NodeItem: The ending NodeItem of the range; must share the
        parent of ``start``
    :param start_inclusive: bool: (Default value = True): If True, the start NodeItem will also be extracted
    :param end_inclusive: bool: (Default value = True): If True, the end NodeItem will also be extracted
    :param delete: bool: (Default value = False): If True, extracted items are deleted in the original document

    :raises ValueError: If the two nodes have different parents or ``start``
        is positioned after ``end`` among the parent's children

    :returns: DoclingDocument: A new document containing the extracted NodeItems and their children
    """
    if not start.parent == end.parent:
        raise ValueError(
            "Start and end NodeItems must have the same parent to extract a range."
        )

    start_ref = start.get_ref()
    end_ref = end.get_ref()

    # Both nodes share one parent (checked above); nodes without an explicit
    # parent are looked up under the document body.
    parent_ref = start.parent if start.parent is not None else self.body.get_ref()
    parent = parent_ref.resolve(doc=self)

    start_pos = parent.children.index(start_ref)
    end_pos = parent.children.index(end_ref)

    # Compare the raw positions: comparing the flag-adjusted slice bounds
    # would let a start placed right after the end slip through unnoticed.
    if start_pos > end_pos:
        raise ValueError(
            "Start NodeItem must come before or be the same as the end NodeItem in the document structure."
        )

    slice_from = start_pos if start_inclusive else start_pos + 1
    slice_to = end_pos + 1 if end_inclusive else end_pos

    new_doc = DoclingDocument(name=f"{self.name}- Extracted Range")

    ref_items = parent.children[slice_from:slice_to]
    node_items = [ref.resolve(self) for ref in ref_items]

    new_doc.add_node_items(node_items=node_items, doc=self)

    if delete:
        self.delete_items_range(
            start=start,
            end=end,
            start_inclusive=start_inclusive,
            end_inclusive=end_inclusive,
        )

    return new_doc
|
|
3778
|
+
|
|
3779
|
+
def insert_document(
    self,
    doc: "DoclingDocument",
    sibling: NodeItem,
    after: bool = True,
) -> None:
    """Insert the body content of another DoclingDocument next to a node of this one.

    :param doc: DoclingDocument: Document whose body children are inserted
    :param sibling: NodeItem: The NodeItem after/before which the new items
        will be inserted
    :param after: bool: If True, insert after the sibling; if False, insert
        before (Default value = True)

    :returns: None
    """
    # Only the direct children of the other body are resolved here; they are
    # handed over together with the source document they belong to.
    top_level_items = [child_ref.resolve(doc) for child_ref in doc.body.children]
    self.insert_node_items(
        sibling=sibling,
        node_items=top_level_items,
        doc=doc,
        after=after,
    )
|
|
3798
|
+
|
|
3799
|
+
def add_document(
    self,
    doc: "DoclingDocument",
    parent: Optional[NodeItem] = None,
) -> None:
    """Add the body content of another DoclingDocument under a parent of this one.

    :param doc: DoclingDocument: Document whose body children are added
    :param parent: Optional[NodeItem]: The parent NodeItem under which new
        items are added (Default value = None)

    :returns: None
    """
    # Only the direct children of the other body are resolved here; they are
    # handed over together with the source document they belong to.
    top_level_items = [child_ref.resolve(doc) for child_ref in doc.body.children]
    self.add_node_items(node_items=top_level_items, doc=doc, parent=parent)
|
|
3814
|
+
|
|
3815
|
+
def add_node_items(
    self,
    node_items: List[NodeItem],
    doc: "DoclingDocument",
    parent: Optional[NodeItem] = None,
) -> None:
    """Append copies of several NodeItems (and their children) under a parent.

    :param node_items: list[NodeItem]: The NodeItems to be added
    :param doc: DoclingDocument: The document the NodeItems and their children
        are resolved against
    :param parent: Optional[NodeItem]: The parent NodeItem under which new
        items are added; the document body is used when omitted
        (Default value = None)

    :raises ValueError: If a ListItem is passed while the parent is not a
        ListGroup

    :returns: None
    """
    target_parent = parent if parent is not None else self.body

    # ListItems are only accepted underneath a ListGroup.
    if not isinstance(target_parent, ListGroup) and any(
        isinstance(candidate, ListItem) for candidate in node_items
    ):
        raise ValueError("Cannot add ListItem into a non-ListGroup parent.")

    # Copy the items into this document's content ...
    copied_refs = self._append_item_copies(
        node_items=node_items,
        parent_ref=target_parent.get_ref(),
        doc=doc,
    )

    # ... and hook the copies into the tree at the end of the parent.
    target_parent.children.extend(copied_refs)
|
|
3849
|
+
|
|
3850
|
+
def insert_node_items(
    self,
    sibling: NodeItem,
    node_items: List[NodeItem],
    doc: "DoclingDocument",
    after: bool = True,
) -> None:
    """Insert multiple NodeItems and their children at a specific position in the document.

    All checks that do not need the copied items (ListItem placement and the
    lookup of the sibling's position) run before anything is copied, so a
    rejected call does not leave item copies behind in this document.

    :param sibling: NodeItem: The NodeItem after/before which the new items will be inserted
    :param node_items: list[NodeItem]: The NodeItems to be inserted
    :param doc: DoclingDocument: The document to which the NodeItems and their children belong
    :param after: bool: If True, insert after the sibling; if False, insert before (Default value = True)

    :raises ValueError: If a ListItem would end up outside a ListGroup, the
        sibling's position cannot be found, or an item cannot be inserted

    :returns: None
    """
    # Check for ListItem parent violations
    parent = sibling.parent.resolve(self) if sibling.parent else self.body

    if not isinstance(parent, ListGroup) and any(
        isinstance(item, ListItem) for item in node_items
    ):
        raise ValueError("Cannot insert ListItem into a non-ListGroup parent.")

    # Locate the sibling first: the copies made below are not part of the
    # tree yet, so they cannot influence this lookup.
    sibling_ref = sibling.get_ref()
    success, stack = self._get_stack_of_refitem(ref=sibling_ref)

    if not success:
        raise ValueError(
            f"Could not insert at {sibling_ref.cref}: could not find the stack"
        )

    # Append the NodeItems to the document content
    new_refs = self._append_item_copies(
        node_items=node_items, parent_ref=parent.get_ref(), doc=doc
    )

    # Insert the new item refs in the document structure. Every ref is
    # inserted at the same stack position, so walking the refs backwards
    # keeps them in their original order.
    for ref in reversed(new_refs):
        inserted = self.body._add_sibling(
            doc=self, stack=stack, new_ref=ref, after=after
        )

        if not inserted:
            raise ValueError(
                f"Could not insert item {ref.cref} at {sibling_ref.cref}"
            )
|
|
3908
|
+
|
|
3909
|
+
def _append_item_copies(
    self,
    node_items: List[NodeItem],
    parent_ref: RefItem,
    doc: "DoclingDocument",
) -> List[RefItem]:
    """Deep-copy node items (recursively with their children) into this document.

    :param node_items: List[NodeItem]: The NodeItems to be copied
    :param parent_ref: RefItem: Reference of the parent the copies get in this
        document
    :param doc: DoclingDocument: The document the children references of the
        NodeItems are resolved against

    :returns: List[RefItem]: References of the copies of ``node_items`` (not
        of their descendants), in input order
    """
    appended_refs: List[RefItem] = []

    for source_item in node_items:
        clone = source_item.model_copy(deep=True)

        # Register the clone before recursing: its own reference is used as
        # the parent of its children (presumably assigned by _append_item;
        # confirm there).
        self._append_item(item=clone, parent_ref=parent_ref)

        if clone.children:
            # The clone still lists the children of the source document;
            # resolve them there and swap in the refs of their copies.
            source_children = [
                child_ref.resolve(doc) for child_ref in clone.children
            ]
            clone.children = self._append_item_copies(
                node_items=source_children,
                parent_ref=clone.get_ref(),
                doc=doc,
            )

        appended_refs.append(clone.get_ref())

    return appended_refs
|
|
3943
|
+
|
|
3944
|
+
def num_pages(self):
    """Return the number of entries in ``self.pages``."""
    page_count = len(self.pages)
    return page_count
|
|
3947
|
+
|
|
3948
|
+
def validate_tree(self, root) -> bool:
    """Check that every node below ``root`` points back to its actual parent.

    Each child reference of ``root`` is resolved against this document; the
    resolved child's ``parent`` reference must resolve to ``root`` again, and
    the same must hold recursively for the child's own children.

    :param root: Node whose subtree is checked

    :returns: bool: True if all parent references in the subtree are
        consistent (also for a node without children), False otherwise
    """
    for child_ref in root.children:
        child = child_ref.resolve(self)

        # A listed child without any parent reference is inconsistent too;
        # report it instead of failing on ``None.resolve``.
        if child.parent is None or child.parent.resolve(self) != root:
            return False

        # Stop at the first broken subtree, the result cannot recover.
        if not self.validate_tree(child):
            return False

    return True
|
|
3958
|
+
|
|
3959
|
+
def iterate_items(
    self,
    root: Optional[NodeItem] = None,
    with_groups: bool = False,
    traverse_pictures: bool = False,
    page_no: Optional[int] = None,
    included_content_layers: Optional[set[ContentLayer]] = None,
    _level: int = 0,  # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
    """Iterate over the document nodes together with their nesting level.

    Thin wrapper around ``_iterate_items_with_stack``: all filter arguments
    are forwarded unchanged and the level is the length of the stack reported
    for each node. ``_level`` is accepted but not read by this method.
    """
    walker = self._iterate_items_with_stack(
        root=root,
        with_groups=with_groups,
        traverse_pictures=traverse_pictures,
        page_no=page_no,
        included_content_layers=included_content_layers,
    )
    for node, path in walker:
        # Measure the stack right away; the traversal reuses and mutates it.
        depth = len(path)
        yield node, depth
|
|
3977
|
+
|
|
3978
|
+
def _iterate_items_with_stack(
    self,
    root: Optional[NodeItem] = None,
    with_groups: bool = False,
    traverse_pictures: bool = False,
    page_no: Optional[int] = None,
    included_content_layers: Optional[set[ContentLayer]] = None,
    _stack: Optional[list[int]] = None,
) -> typing.Iterable[Tuple[NodeItem, list[int]]]:  # tuple of node and level
    """Depth-first iteration yielding each accepted node with its index stack.

    The stack holds the child index taken at every level on the way to the
    node. One list object is shared by the whole traversal and mutated in
    place, so consumers must read (or copy) it before requesting the next
    element.

    A node is yielded when it is not a GroupItem (or ``with_groups`` is set),
    when it is not a DocItem or matches ``page_no`` through one of its
    provenance entries (or ``page_no`` is None), and when its content layer is
    among the selected layers. Children are traversed even if the node itself
    is not yielded.
    """
    active_layers = (
        DEFAULT_CONTENT_LAYERS
        if included_content_layers is None
        else included_content_layers
    )
    path: list[int] = [] if _stack is None else _stack

    if not root:
        root = self.body

    # Single yield point; the nested checks keep the original short-circuit
    # order (group filter, then page filter, then content layer).
    if with_groups or not isinstance(root, GroupItem):
        on_requested_page = (
            not isinstance(root, DocItem)
            or page_no is None
            or any(prov.page_no == page_no for prov in root.prov)
        )
        if on_requested_page and root.content_layer in active_layers:
            yield root, path

    # Open a slot for the child index of this level.
    path.append(-1)

    root_is_picture = isinstance(root, PictureItem)
    caption_refs: set[str] = (
        {caption.cref for caption in root.captions} if root_is_picture else set()
    )

    # Traverse children
    for child_ind, child_ref in enumerate(root.children):
        child = child_ref.resolve(self)
        child_is_node = isinstance(child, NodeItem)

        # Inside a picture only its captions are visited unless picture
        # traversal was requested.
        if (
            root_is_picture
            and not traverse_pictures
            and child_is_node
            and child.self_ref not in caption_refs
        ):
            continue

        path[-1] = child_ind

        if child_is_node:
            yield from self._iterate_items_with_stack(
                child,
                with_groups=with_groups,
                traverse_pictures=traverse_pictures,
                page_no=page_no,
                _stack=path,
                included_content_layers=active_layers,
            )

    # Close the slot of this level again.
    path.pop()
|
|
4047
|
+
|
|
4048
|
+
def _clear_picture_pil_cache(self):
    """Close the cached PIL image of every picture found by ``iterate_items``.

    NOTE(review): the ``_pil`` attribute is closed but not reset here; confirm
    whether callers rely on it being dropped.
    """
    for node, _depth in self.iterate_items(with_groups=False):
        if not isinstance(node, PictureItem):
            continue

        image_ref = node.image
        if image_ref is None or image_ref._pil is None:
            continue

        image_ref._pil.close()
|
|
4054
|
+
|
|
4055
|
+
def _list_images_on_disk(self) -> List[Path]:
    """Collect the local paths of all picture images referenced from disk.

    A picture contributes a path when its image URI is a ``Path`` or an
    ``AnyUrl`` with the ``file`` scheme and a non-empty path component (the
    latter is percent-decoded first).
    """
    disk_paths: List[Path] = []

    for node, _depth in self.iterate_items(with_groups=False):
        if not isinstance(node, PictureItem) or node.image is None:
            continue

        uri = node.image.uri
        if isinstance(uri, Path):
            disk_paths.append(uri)
        elif (
            isinstance(uri, AnyUrl)
            and uri.scheme == "file"
            and uri.path is not None
        ):
            disk_paths.append(Path(unquote(uri.path)))

    return disk_paths
|
|
4073
|
+
|
|
4074
|
+
def _with_embedded_pictures(self) -> "DoclingDocument":
    """Return a deep copy of the document with file-based pictures reloaded.

    Every picture of the copy whose image URI is a ``Path`` or a ``file``
    URL is opened with PIL and its image reference is rebuilt through
    ``ImageRef.from_pil`` with the previous dpi. Pictures with other URIs
    are left untouched, and so is this document itself.
    """
    embedded: DoclingDocument = copy.deepcopy(self)

    for node, _depth in embedded.iterate_items(with_groups=True):
        if not isinstance(node, PictureItem) or node.image is None:
            continue

        uri = node.image.uri
        if isinstance(uri, AnyUrl) and uri.scheme == "file":
            assert isinstance(uri.path, str)
            # File URLs are percent-encoded; decode before opening.
            image_location = str(unquote(uri.path))
        elif isinstance(uri, Path):
            image_location = str(uri)
        else:
            continue

        loaded_image = PILImage.open(image_location)
        node.image = ImageRef.from_pil(loaded_image, dpi=node.image.dpi)

    return embedded
|
|
4099
|
+
|
|
4100
|
+
def _with_pictures_refs(
|
|
4101
|
+
self, image_dir: Path, reference_path: Optional[Path] = None
|
|
4102
|
+
) -> "DoclingDocument":
|
|
4103
|
+
"""Document with images as refs.
|
|
4104
|
+
|
|
4105
|
+
Creates a copy of this document where all picture data is
|
|
4106
|
+
saved to image_dir and referenced through file URIs.
|
|
4107
|
+
"""
|
|
4108
|
+
result: DoclingDocument = copy.deepcopy(self)
|
|
4109
|
+
|
|
4110
|
+
img_count = 0
|
|
4111
|
+
image_dir.mkdir(parents=True, exist_ok=True)
|
|
4112
|
+
|
|
4113
|
+
if image_dir.is_dir():
|
|
4114
|
+
for item, level in result.iterate_items(with_groups=False):
|
|
2964
4115
|
if isinstance(item, PictureItem):
|
|
2965
4116
|
|
|
2966
4117
|
if (
|
|
@@ -3048,6 +4199,8 @@ class DoclingDocument(BaseModel):
|
|
|
3048
4199
|
artifacts_dir: Optional[Path] = None,
|
|
3049
4200
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
3050
4201
|
indent: int = 2,
|
|
4202
|
+
coord_precision: Optional[int] = None,
|
|
4203
|
+
confid_precision: Optional[int] = None,
|
|
3051
4204
|
):
|
|
3052
4205
|
"""Save as json."""
|
|
3053
4206
|
if isinstance(filename, str):
|
|
@@ -3061,7 +4214,9 @@ class DoclingDocument(BaseModel):
|
|
|
3061
4214
|
artifacts_dir, image_mode, reference_path=reference_path
|
|
3062
4215
|
)
|
|
3063
4216
|
|
|
3064
|
-
out = new_doc.export_to_dict(
|
|
4217
|
+
out = new_doc.export_to_dict(
|
|
4218
|
+
coord_precision=coord_precision, confid_precision=confid_precision
|
|
4219
|
+
)
|
|
3065
4220
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
3066
4221
|
json.dump(out, fw, indent=indent)
|
|
3067
4222
|
|
|
@@ -3087,6 +4242,8 @@ class DoclingDocument(BaseModel):
|
|
|
3087
4242
|
artifacts_dir: Optional[Path] = None,
|
|
3088
4243
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
3089
4244
|
default_flow_style: bool = False,
|
|
4245
|
+
coord_precision: Optional[int] = None,
|
|
4246
|
+
confid_precision: Optional[int] = None,
|
|
3090
4247
|
):
|
|
3091
4248
|
"""Save as yaml."""
|
|
3092
4249
|
if isinstance(filename, str):
|
|
@@ -3100,7 +4257,9 @@ class DoclingDocument(BaseModel):
|
|
|
3100
4257
|
artifacts_dir, image_mode, reference_path=reference_path
|
|
3101
4258
|
)
|
|
3102
4259
|
|
|
3103
|
-
out = new_doc.export_to_dict(
|
|
4260
|
+
out = new_doc.export_to_dict(
|
|
4261
|
+
coord_precision=coord_precision, confid_precision=confid_precision
|
|
4262
|
+
)
|
|
3104
4263
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
3105
4264
|
yaml.dump(out, fw, default_flow_style=default_flow_style)
|
|
3106
4265
|
|
|
@@ -3125,9 +4284,18 @@ class DoclingDocument(BaseModel):
|
|
|
3125
4284
|
mode: str = "json",
|
|
3126
4285
|
by_alias: bool = True,
|
|
3127
4286
|
exclude_none: bool = True,
|
|
4287
|
+
coord_precision: Optional[int] = None,
|
|
4288
|
+
confid_precision: Optional[int] = None,
|
|
3128
4289
|
) -> Dict[str, Any]:
|
|
3129
4290
|
"""Export to dict."""
|
|
3130
|
-
|
|
4291
|
+
context = {}
|
|
4292
|
+
if coord_precision is not None:
|
|
4293
|
+
context[PydanticSerCtxKey.COORD_PREC.value] = coord_precision
|
|
4294
|
+
if confid_precision is not None:
|
|
4295
|
+
context[PydanticSerCtxKey.CONFID_PREC.value] = confid_precision
|
|
4296
|
+
out = self.model_dump(
|
|
4297
|
+
mode=mode, by_alias=by_alias, exclude_none=exclude_none, context=context
|
|
4298
|
+
)
|
|
3131
4299
|
|
|
3132
4300
|
return out
|
|
3133
4301
|
|