docling-core 2.29.0__py3-none-any.whl → 2.30.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/serializer/html.py +26 -0
- docling_core/types/doc/document.py +31 -12
- docling_core/types/doc/labels.py +32 -3
- docling_core/types/doc/page.py +19 -0
- docling_core/utils/legacy.py +1 -1
- {docling_core-2.29.0.dist-info → docling_core-2.30.1.dist-info}/METADATA +1 -1
- {docling_core-2.29.0.dist-info → docling_core-2.30.1.dist-info}/RECORD +10 -10
- {docling_core-2.29.0.dist-info → docling_core-2.30.1.dist-info}/LICENSE +0 -0
- {docling_core-2.29.0.dist-info → docling_core-2.30.1.dist-info}/WHEEL +0 -0
- {docling_core-2.29.0.dist-info → docling_core-2.30.1.dist-info}/entry_points.txt +0 -0
|
@@ -370,6 +370,13 @@ class HTMLPictureSerializer(BasePictureSerializer):
|
|
|
370
370
|
**kwargs: Any,
|
|
371
371
|
) -> SerializationResult:
|
|
372
372
|
"""Export picture to HTML format."""
|
|
373
|
+
|
|
374
|
+
def get_img_row(imgb64: str, ind: int) -> str:
|
|
375
|
+
row = '<tr><td style="border: 2px solid black; padding: 8px;">'
|
|
376
|
+
row += f'<img src="data:image/png;base64,{imgb64}" alt="image {ind}">'
|
|
377
|
+
row += "</td></tr>\n"
|
|
378
|
+
return row
|
|
379
|
+
|
|
373
380
|
params = HTMLParams(**kwargs)
|
|
374
381
|
|
|
375
382
|
res_parts: list[SerializationResult] = []
|
|
@@ -393,6 +400,22 @@ class HTMLPictureSerializer(BasePictureSerializer):
|
|
|
393
400
|
and item.image.uri.scheme == "data"
|
|
394
401
|
):
|
|
395
402
|
img_text = f'<img src="{item.image.uri}">'
|
|
403
|
+
elif len(item.prov) > 1: # more than 1 provenance
|
|
404
|
+
|
|
405
|
+
img_text = (
|
|
406
|
+
'<table style="border-collapse: collapse; width: 100%;">\n'
|
|
407
|
+
)
|
|
408
|
+
for ind, prov in enumerate(item.prov):
|
|
409
|
+
img = item.get_image(doc, prov_index=ind)
|
|
410
|
+
|
|
411
|
+
if img is not None:
|
|
412
|
+
imgb64 = item._image_to_base64(img)
|
|
413
|
+
img_text += get_img_row(imgb64=imgb64, ind=ind)
|
|
414
|
+
else:
|
|
415
|
+
_logger.warning("Could not get image")
|
|
416
|
+
|
|
417
|
+
img_text += "</table>\n"
|
|
418
|
+
|
|
396
419
|
else:
|
|
397
420
|
# get the item.image._pil or crop it out of the page-image
|
|
398
421
|
img = item.get_image(doc)
|
|
@@ -400,6 +423,9 @@ class HTMLPictureSerializer(BasePictureSerializer):
|
|
|
400
423
|
if img is not None:
|
|
401
424
|
imgb64 = item._image_to_base64(img)
|
|
402
425
|
img_text = f'<img src="data:image/png;base64,{imgb64}">'
|
|
426
|
+
else:
|
|
427
|
+
_logger.warning("Could not get image")
|
|
428
|
+
|
|
403
429
|
elif params.image_mode == ImageRefMode.REFERENCED:
|
|
404
430
|
if isinstance(item.image, ImageRef) and not (
|
|
405
431
|
isinstance(item.image.uri, AnyUrl)
|
|
@@ -790,7 +790,9 @@ class DocItem(
|
|
|
790
790
|
|
|
791
791
|
return location
|
|
792
792
|
|
|
793
|
-
def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
|
|
793
|
+
def get_image(
|
|
794
|
+
self, doc: "DoclingDocument", prov_index: int = 0
|
|
795
|
+
) -> Optional[PILImage.Image]:
|
|
794
796
|
"""Returns the image of this DocItem.
|
|
795
797
|
|
|
796
798
|
The function returns None if this DocItem has no valid provenance or
|
|
@@ -800,7 +802,7 @@ class DocItem(
|
|
|
800
802
|
if not len(self.prov):
|
|
801
803
|
return None
|
|
802
804
|
|
|
803
|
-
page = doc.pages.get(self.prov[0].page_no)
|
|
805
|
+
page = doc.pages.get(self.prov[prov_index].page_no)
|
|
804
806
|
if page is None or page.size is None or page.image is None:
|
|
805
807
|
return None
|
|
806
808
|
|
|
@@ -808,7 +810,7 @@ class DocItem(
|
|
|
808
810
|
if not page_image:
|
|
809
811
|
return None
|
|
810
812
|
crop_bbox = (
|
|
811
|
-
self.prov[0]
|
|
813
|
+
self.prov[prov_index]
|
|
812
814
|
.bbox.to_top_left_origin(page_height=page.size.height)
|
|
813
815
|
.scale_to_size(old_size=page.size, new_size=page.image.size)
|
|
814
816
|
# .scaled(scale=page_image.height / page.size.height)
|
|
@@ -973,7 +975,9 @@ class FloatingItem(DocItem):
|
|
|
973
975
|
text += cap.resolve(doc).text
|
|
974
976
|
return text
|
|
975
977
|
|
|
976
|
-
def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
|
|
978
|
+
def get_image(
|
|
979
|
+
self, doc: "DoclingDocument", prov_index: int = 0
|
|
980
|
+
) -> Optional[PILImage.Image]:
|
|
977
981
|
"""Returns the image corresponding to this FloatingItem.
|
|
978
982
|
|
|
979
983
|
This function returns the PIL image from self.image if one is available.
|
|
@@ -985,7 +989,7 @@ class FloatingItem(DocItem):
|
|
|
985
989
|
"""
|
|
986
990
|
if self.image is not None:
|
|
987
991
|
return self.image.pil_image
|
|
988
|
-
return super().get_image(doc=doc)
|
|
992
|
+
return super().get_image(doc=doc, prov_index=prov_index)
|
|
989
993
|
|
|
990
994
|
|
|
991
995
|
class CodeItem(FloatingItem, TextItem):
|
|
@@ -1073,7 +1077,7 @@ class PictureItem(FloatingItem):
|
|
|
1073
1077
|
image_bytes = self.image._pil.tobytes()
|
|
1074
1078
|
|
|
1075
1079
|
# Create a hash object (e.g., SHA-256)
|
|
1076
|
-
hasher = hashlib.sha256()
|
|
1080
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
|
1077
1081
|
|
|
1078
1082
|
# Feed the image bytes into the hash object
|
|
1079
1083
|
hasher.update(image_bytes)
|
|
@@ -2657,16 +2661,25 @@ class DoclingDocument(BaseModel):
|
|
|
2657
2661
|
if should_yield:
|
|
2658
2662
|
yield root, my_stack
|
|
2659
2663
|
|
|
2660
|
-
# Handle picture traversal - only traverse children if requested
|
|
2661
|
-
if isinstance(root, PictureItem) and not traverse_pictures:
|
|
2662
|
-
return
|
|
2663
|
-
|
|
2664
2664
|
my_stack.append(-1)
|
|
2665
2665
|
|
|
2666
|
+
allowed_pic_refs: set[str] = (
|
|
2667
|
+
{r.cref for r in root.captions}
|
|
2668
|
+
if (root_is_picture := isinstance(root, PictureItem))
|
|
2669
|
+
else set()
|
|
2670
|
+
)
|
|
2671
|
+
|
|
2666
2672
|
# Traverse children
|
|
2667
2673
|
for child_ind, child_ref in enumerate(root.children):
|
|
2668
|
-
my_stack[-1] = child_ind
|
|
2669
2674
|
child = child_ref.resolve(self)
|
|
2675
|
+
if (
|
|
2676
|
+
root_is_picture
|
|
2677
|
+
and not traverse_pictures
|
|
2678
|
+
and isinstance(child, DocItem)
|
|
2679
|
+
and child.self_ref not in allowed_pic_refs
|
|
2680
|
+
):
|
|
2681
|
+
continue
|
|
2682
|
+
my_stack[-1] = child_ind
|
|
2670
2683
|
|
|
2671
2684
|
if isinstance(child, NodeItem):
|
|
2672
2685
|
yield from self._iterate_items_with_stack(
|
|
@@ -3597,7 +3610,9 @@ class DoclingDocument(BaseModel):
|
|
|
3597
3610
|
rf"{DocumentToken.UNORDERED_LIST.value}|"
|
|
3598
3611
|
rf"{DocItemLabel.KEY_VALUE_REGION}|"
|
|
3599
3612
|
rf"{DocumentToken.CHART.value}|"
|
|
3600
|
-
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
|
|
3613
|
+
rf"{DocumentToken.OTSL.value})>"
|
|
3614
|
+
rf"(?P<content>.*?)"
|
|
3615
|
+
rf"(?:(?P<closed></(?P=tag)>)|(?P<eof>$))"
|
|
3601
3616
|
)
|
|
3602
3617
|
pattern = re.compile(tag_pattern, re.DOTALL)
|
|
3603
3618
|
|
|
@@ -3607,6 +3622,10 @@ class DoclingDocument(BaseModel):
|
|
|
3607
3622
|
tag_name = match.group("tag")
|
|
3608
3623
|
|
|
3609
3624
|
bbox = extract_bounding_box(full_chunk) # Extracts first bbox
|
|
3625
|
+
if not match.group("closed"):
|
|
3626
|
+
# no closing tag; only the existence of the item is recovered
|
|
3627
|
+
full_chunk = f"<{tag_name}></{tag_name}>"
|
|
3628
|
+
|
|
3610
3629
|
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
|
|
3611
3630
|
|
|
3612
3631
|
if tag_name == DocumentToken.OTSL.value:
|
docling_core/types/doc/labels.py
CHANGED
|
@@ -25,6 +25,8 @@ class DocItemLabel(str, Enum):
|
|
|
25
25
|
CHECKBOX_UNSELECTED = "checkbox_unselected"
|
|
26
26
|
FORM = "form"
|
|
27
27
|
KEY_VALUE_REGION = "key_value_region"
|
|
28
|
+
GRADING_SCALE = "grading_scale" # for elements in forms, questionaires representing a grading scale
|
|
29
|
+
# e.g. [strongly disagree | ... | ... | strongly agree]
|
|
28
30
|
|
|
29
31
|
# Additional labels for markup-based formats (e.g. HTML, Word)
|
|
30
32
|
PARAGRAPH = "paragraph"
|
|
@@ -78,6 +80,7 @@ class GroupLabel(str, Enum):
|
|
|
78
80
|
KEY_VALUE_AREA = "key_value_area"
|
|
79
81
|
COMMENT_SECTION = "comment_section"
|
|
80
82
|
INLINE = "inline"
|
|
83
|
+
PICTURE_AREA = "picture_area"
|
|
81
84
|
|
|
82
85
|
def __str__(self):
|
|
83
86
|
"""Get string value."""
|
|
@@ -143,17 +146,43 @@ class TableCellLabel(str, Enum):
|
|
|
143
146
|
"""Get string value."""
|
|
144
147
|
return str(self.value)
|
|
145
148
|
|
|
149
|
+
@staticmethod
|
|
150
|
+
def get_color(label: "TableCellLabel") -> Tuple[int, int, int]:
|
|
151
|
+
"""Return the RGB color associated with a given label."""
|
|
152
|
+
color_map = {
|
|
153
|
+
TableCellLabel.COLUMN_HEADER: (255, 0, 0),
|
|
154
|
+
TableCellLabel.ROW_HEADER: (0, 255, 0),
|
|
155
|
+
TableCellLabel.ROW_SECTION: (0, 0, 255),
|
|
156
|
+
TableCellLabel.BODY: (0, 255, 255),
|
|
157
|
+
}
|
|
158
|
+
return color_map.get(label, (0, 0, 0))
|
|
159
|
+
|
|
146
160
|
|
|
147
161
|
class GraphCellLabel(str, Enum):
|
|
148
162
|
"""GraphCellLabel."""
|
|
149
163
|
|
|
150
164
|
UNSPECIFIED = "unspecified"
|
|
151
165
|
|
|
152
|
-
KEY = "key"
|
|
153
|
-
VALUE = "value"
|
|
154
|
-
|
|
166
|
+
KEY = "key" # used to designate a key (label) of a key-value element
|
|
167
|
+
VALUE = "value" # Data value with or without explicit Key, but filled in,
|
|
168
|
+
# e.g. telephone number, address, quantity, name, date
|
|
169
|
+
EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms
|
|
155
170
|
CHECKBOX = "checkbox"
|
|
156
171
|
|
|
172
|
+
def __str__(self):
|
|
173
|
+
"""Get string value."""
|
|
174
|
+
return str(self.value)
|
|
175
|
+
|
|
176
|
+
@staticmethod
|
|
177
|
+
def get_color(label: "GraphCellLabel") -> Tuple[int, int, int]:
|
|
178
|
+
"""Return the RGB color associated with a given label."""
|
|
179
|
+
color_map = {
|
|
180
|
+
GraphCellLabel.KEY: (255, 0, 0),
|
|
181
|
+
GraphCellLabel.VALUE: (0, 255, 0),
|
|
182
|
+
GraphCellLabel.EMPTY_VALUE: (0, 0, 255),
|
|
183
|
+
}
|
|
184
|
+
return color_map.get(label, (0, 0, 0))
|
|
185
|
+
|
|
157
186
|
|
|
158
187
|
class GraphLinkLabel(str, Enum):
|
|
159
188
|
"""GraphLinkLabel."""
|
docling_core/types/doc/page.py
CHANGED
|
@@ -472,8 +472,27 @@ class SegmentedPage(BaseModel):
|
|
|
472
472
|
word_cells: List[TextCell] = []
|
|
473
473
|
textline_cells: List[TextCell] = []
|
|
474
474
|
|
|
475
|
+
# These flags are set to differentiate if above lists of this SegmentedPage
|
|
476
|
+
# are empty (page had no content) or if they have not been computed (i.e. textline_cells may be present
|
|
477
|
+
# but word_cells are not)
|
|
478
|
+
has_chars: bool = False
|
|
479
|
+
has_words: bool = False
|
|
480
|
+
has_lines: bool = False
|
|
481
|
+
|
|
475
482
|
image: Optional[ImageRef] = None
|
|
476
483
|
|
|
484
|
+
@model_validator(mode="after")
|
|
485
|
+
def validate_page(self) -> "SegmentedPage":
|
|
486
|
+
"""Validate page."""
|
|
487
|
+
if len(self.textline_cells) > 0:
|
|
488
|
+
self.has_lines = True
|
|
489
|
+
if len(self.word_cells) > 0:
|
|
490
|
+
self.has_words = True
|
|
491
|
+
if len(self.char_cells) > 0:
|
|
492
|
+
self.has_chars = True
|
|
493
|
+
|
|
494
|
+
return self
|
|
495
|
+
|
|
477
496
|
def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
|
|
478
497
|
"""Iterate through text cells of the specified unit type.
|
|
479
498
|
|
docling_core/utils/legacy.py
CHANGED
|
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
|
|
|
47
47
|
|
|
48
48
|
|
|
49
49
|
def _create_hash(string: str):
|
|
50
|
-
hasher = hashlib.sha256()
|
|
50
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
|
51
51
|
hasher.update(string.encode("utf-8"))
|
|
52
52
|
|
|
53
53
|
return hasher.hexdigest()
|
|
@@ -29,7 +29,7 @@ docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3
|
|
|
29
29
|
docling_core/transforms/serializer/base.py,sha256=9bgpWA0oMmZNRc3yIuZVnu5bJ1glClBsswtVF1vYwMI,6046
|
|
30
30
|
docling_core/transforms/serializer/common.py,sha256=xBwhsgDZbNWMpp6ExpyUWO8_NvHPfbPF1ak2z9API5M,17435
|
|
31
31
|
docling_core/transforms/serializer/doctags.py,sha256=mEmRWVuebcG5pZcR1_HX146cyUk0_FjaLQtMXSgh9hs,17870
|
|
32
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
32
|
+
docling_core/transforms/serializer/html.py,sha256=2zlV8B-xsXHfTM13sb2pRwtKaxLcNUlW0p0NCgbiatk,34088
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
34
|
docling_core/transforms/serializer/markdown.py,sha256=YqThAYMsOWSg6nZnnmrUHZohn0QvfZzRqpLrB-Keev8,17873
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
@@ -40,9 +40,9 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
40
40
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
41
41
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
42
42
|
docling_core/types/doc/base.py,sha256=sM3IyFXzVh2WT8IGh5nejXYh8sf39yBh8TBSlHeJ9CI,12611
|
|
43
|
-
docling_core/types/doc/document.py,sha256=
|
|
44
|
-
docling_core/types/doc/labels.py,sha256=
|
|
45
|
-
docling_core/types/doc/page.py,sha256=
|
|
43
|
+
docling_core/types/doc/document.py,sha256=eboNYL-QVnDNnw3vL7PPVdPosfs5oNfsrVofxmdBDHw,140884
|
|
44
|
+
docling_core/types/doc/labels.py,sha256=vp4h3e7AmBvezRmgrfuPehjAHTZOufphErLB4ENhdME,7171
|
|
45
|
+
docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
|
|
46
46
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
47
47
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
48
48
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
@@ -70,11 +70,11 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
|
|
|
70
70
|
docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
|
|
71
71
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
72
72
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
73
|
-
docling_core/utils/legacy.py,sha256=
|
|
73
|
+
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
74
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
75
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
76
|
-
docling_core-2.29.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.29.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
-
docling_core-2.29.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
-
docling_core-2.29.0.dist-info/RECORD,,
|
|
76
|
+
docling_core-2.30.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.30.1.dist-info/METADATA,sha256=qvcnzM33mlJCxj-5MIz4VjdfAGHk-xiYqbNPzsEx6GY,5976
|
|
78
|
+
docling_core-2.30.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
+
docling_core-2.30.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
+
docling_core-2.30.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|