docling-core: docling_core-2.30.0-py3-none-any.whl → docling_core-2.31.0-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/serializer/common.py +5 -5
- docling_core/transforms/serializer/html.py +24 -13
- docling_core/types/doc/labels.py +31 -3
- {docling_core-2.30.0.dist-info → docling_core-2.31.0.dist-info}/METADATA +1 -1
- {docling_core-2.30.0.dist-info → docling_core-2.31.0.dist-info}/RECORD +8 -8
- {docling_core-2.30.0.dist-info → docling_core-2.31.0.dist-info}/LICENSE +0 -0
- {docling_core-2.30.0.dist-info → docling_core-2.31.0.dist-info}/WHEEL +0 -0
- {docling_core-2.30.0.dist-info → docling_core-2.31.0.dist-info}/entry_points.txt +0 -0
|
@@ -169,7 +169,7 @@ class CommonParams(BaseModel):
|
|
|
169
169
|
|
|
170
170
|
def merge_with_patch(self, patch: dict[str, Any]) -> Self:
|
|
171
171
|
"""Create an instance by merging the provided patch dict on top of self."""
|
|
172
|
-
res = self.
|
|
172
|
+
res = self.model_copy(update=patch)
|
|
173
173
|
return res
|
|
174
174
|
|
|
175
175
|
|
|
@@ -260,10 +260,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
260
260
|
"""Serialize a document out of its pages."""
|
|
261
261
|
...
|
|
262
262
|
|
|
263
|
-
def _serialize_body(self) -> SerializationResult:
|
|
263
|
+
def _serialize_body(self, **kwargs) -> SerializationResult:
|
|
264
264
|
"""Serialize the document body."""
|
|
265
265
|
subparts = self.get_parts()
|
|
266
|
-
res = self.serialize_doc(parts=subparts)
|
|
266
|
+
res = self.serialize_doc(parts=subparts, **kwargs)
|
|
267
267
|
return res
|
|
268
268
|
|
|
269
269
|
@override
|
|
@@ -278,12 +278,12 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
278
278
|
) -> SerializationResult:
|
|
279
279
|
"""Serialize a given node."""
|
|
280
280
|
my_visited: set[str] = visited if visited is not None else set()
|
|
281
|
-
my_kwargs = self.params.
|
|
281
|
+
my_kwargs = {**self.params.model_dump(), **kwargs}
|
|
282
282
|
empty_res = create_ser_result()
|
|
283
283
|
if item is None or item == self.doc.body:
|
|
284
284
|
if self.doc.body.self_ref not in my_visited:
|
|
285
285
|
my_visited.add(self.doc.body.self_ref)
|
|
286
|
-
return self._serialize_body()
|
|
286
|
+
return self._serialize_body(**my_kwargs)
|
|
287
287
|
else:
|
|
288
288
|
return empty_res
|
|
289
289
|
|
|
@@ -16,6 +16,7 @@ from xml.etree.cElementTree import SubElement, tostring
|
|
|
16
16
|
from xml.sax.saxutils import unescape
|
|
17
17
|
|
|
18
18
|
import latex2mathml.converter
|
|
19
|
+
from PIL.Image import Image
|
|
19
20
|
from pydantic import AnyUrl, BaseModel
|
|
20
21
|
from typing_extensions import override
|
|
21
22
|
|
|
@@ -40,6 +41,7 @@ from docling_core.transforms.serializer.html_styles import (
|
|
|
40
41
|
_get_css_for_single_column,
|
|
41
42
|
_get_css_for_split_page,
|
|
42
43
|
)
|
|
44
|
+
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
43
45
|
from docling_core.types.doc.base import ImageRefMode
|
|
44
46
|
from docling_core.types.doc.document import (
|
|
45
47
|
CodeItem,
|
|
@@ -821,9 +823,22 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
821
823
|
def serialize_doc(
|
|
822
824
|
self,
|
|
823
825
|
parts: list[SerializationResult],
|
|
826
|
+
visualizer: Optional[BaseVisualizer] = None,
|
|
824
827
|
**kwargs: Any,
|
|
825
828
|
) -> SerializationResult:
|
|
826
829
|
"""Serialize a document out of its pages."""
|
|
830
|
+
|
|
831
|
+
def _serialize_page_img(page_img: Image):
|
|
832
|
+
buffered = BytesIO()
|
|
833
|
+
page_img.save(buffered, format="PNG") # Save the image to the byte stream
|
|
834
|
+
img_bytes = buffered.getvalue() # Get the byte data
|
|
835
|
+
|
|
836
|
+
# Encode to Base64 and decode to string
|
|
837
|
+
img_base64 = base64.b64encode(img_bytes).decode("utf-8")
|
|
838
|
+
img_text = f'<img src="data:image/png;base64,{img_base64}">'
|
|
839
|
+
|
|
840
|
+
return f"<figure>{img_text}</figure>"
|
|
841
|
+
|
|
827
842
|
# Create HTML structure
|
|
828
843
|
html_parts = [
|
|
829
844
|
"<!DOCTYPE html>",
|
|
@@ -853,19 +868,26 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
853
868
|
html_parts.append("<table>")
|
|
854
869
|
html_parts.append("<tbody>")
|
|
855
870
|
|
|
871
|
+
vized_pages_dict: dict[Optional[int], Image] = {}
|
|
872
|
+
if visualizer:
|
|
873
|
+
vized_pages_dict = visualizer.get_visualization(doc=self.doc)
|
|
874
|
+
|
|
856
875
|
for page_no, page in pages.items():
|
|
857
876
|
|
|
858
877
|
if isinstance(page_no, int):
|
|
859
878
|
if applicable_pages is not None and page_no not in applicable_pages:
|
|
860
879
|
continue
|
|
861
880
|
page_img = self.doc.pages[page_no].image
|
|
881
|
+
vized_page = vized_pages_dict.get(page_no)
|
|
862
882
|
|
|
863
883
|
html_parts.append("<tr>")
|
|
864
884
|
|
|
865
885
|
html_parts.append("<td>")
|
|
866
886
|
|
|
887
|
+
if vized_page:
|
|
888
|
+
html_parts.append(_serialize_page_img(page_img=vized_page))
|
|
867
889
|
# short-cut: we already have the image in base64
|
|
868
|
-
|
|
890
|
+
elif (
|
|
869
891
|
(page_img is not None)
|
|
870
892
|
and isinstance(page_img, ImageRef)
|
|
871
893
|
and isinstance(page_img.uri, AnyUrl)
|
|
@@ -875,18 +897,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
875
897
|
html_parts.append(f"<figure>{img_text}</figure>")
|
|
876
898
|
|
|
877
899
|
elif (page_img is not None) and (page_img._pil is not None):
|
|
878
|
-
|
|
879
|
-
buffered = BytesIO()
|
|
880
|
-
page_img._pil.save(
|
|
881
|
-
buffered, format="PNG"
|
|
882
|
-
) # Save the image to the byte stream
|
|
883
|
-
img_bytes = buffered.getvalue() # Get the byte data
|
|
884
|
-
|
|
885
|
-
# Encode to Base64 and decode to string
|
|
886
|
-
img_base64 = base64.b64encode(img_bytes).decode("utf-8")
|
|
887
|
-
img_text = f'<img src="data:image/png;base64,{img_base64}">'
|
|
888
|
-
|
|
889
|
-
html_parts.append(f"<figure>{img_text}</figure>")
|
|
900
|
+
html_parts.append(_serialize_page_img(page_img=page_img._pil))
|
|
890
901
|
else:
|
|
891
902
|
html_parts.append("<figure>no page-image found</figure>")
|
|
892
903
|
|
docling_core/types/doc/labels.py
CHANGED
|
@@ -25,6 +25,8 @@ class DocItemLabel(str, Enum):
|
|
|
25
25
|
CHECKBOX_UNSELECTED = "checkbox_unselected"
|
|
26
26
|
FORM = "form"
|
|
27
27
|
KEY_VALUE_REGION = "key_value_region"
|
|
28
|
+
GRADING_SCALE = "grading_scale" # for elements in forms, questionnaires representing a grading scale
|
|
29
|
+
# e.g. [strongly disagree | ... | ... | strongly agree]
|
|
28
30
|
|
|
29
31
|
# Additional labels for markup-based formats (e.g. HTML, Word)
|
|
30
32
|
PARAGRAPH = "paragraph"
|
|
@@ -144,17 +146,43 @@ class TableCellLabel(str, Enum):
|
|
|
144
146
|
"""Get string value."""
|
|
145
147
|
return str(self.value)
|
|
146
148
|
|
|
149
|
+
@staticmethod
|
|
150
|
+
def get_color(label: "TableCellLabel") -> Tuple[int, int, int]:
|
|
151
|
+
"""Return the RGB color associated with a given label."""
|
|
152
|
+
color_map = {
|
|
153
|
+
TableCellLabel.COLUMN_HEADER: (255, 0, 0),
|
|
154
|
+
TableCellLabel.ROW_HEADER: (0, 255, 0),
|
|
155
|
+
TableCellLabel.ROW_SECTION: (0, 0, 255),
|
|
156
|
+
TableCellLabel.BODY: (0, 255, 255),
|
|
157
|
+
}
|
|
158
|
+
return color_map.get(label, (0, 0, 0))
|
|
159
|
+
|
|
147
160
|
|
|
148
161
|
class GraphCellLabel(str, Enum):
|
|
149
162
|
"""GraphCellLabel."""
|
|
150
163
|
|
|
151
164
|
UNSPECIFIED = "unspecified"
|
|
152
165
|
|
|
153
|
-
KEY = "key"
|
|
154
|
-
VALUE = "value"
|
|
155
|
-
|
|
166
|
+
KEY = "key" # used to designate a key (label) of a key-value element
|
|
167
|
+
VALUE = "value" # Data value with or without explicit Key, but filled in,
|
|
168
|
+
# e.g. telephone number, address, quantity, name, date
|
|
169
|
+
EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms
|
|
156
170
|
CHECKBOX = "checkbox"
|
|
157
171
|
|
|
172
|
+
def __str__(self):
|
|
173
|
+
"""Get string value."""
|
|
174
|
+
return str(self.value)
|
|
175
|
+
|
|
176
|
+
@staticmethod
|
|
177
|
+
def get_color(label: "GraphCellLabel") -> Tuple[int, int, int]:
|
|
178
|
+
"""Return the RGB color associated with a given label."""
|
|
179
|
+
color_map = {
|
|
180
|
+
GraphCellLabel.KEY: (255, 0, 0),
|
|
181
|
+
GraphCellLabel.VALUE: (0, 255, 0),
|
|
182
|
+
GraphCellLabel.EMPTY_VALUE: (0, 0, 255),
|
|
183
|
+
}
|
|
184
|
+
return color_map.get(label, (0, 0, 0))
|
|
185
|
+
|
|
158
186
|
|
|
159
187
|
class GraphLinkLabel(str, Enum):
|
|
160
188
|
"""GraphLinkLabel."""
|
|
@@ -27,9 +27,9 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4H
|
|
|
27
27
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
28
28
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
29
29
|
docling_core/transforms/serializer/base.py,sha256=9bgpWA0oMmZNRc3yIuZVnu5bJ1glClBsswtVF1vYwMI,6046
|
|
30
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
30
|
+
docling_core/transforms/serializer/common.py,sha256=TC1EwHIp9PYcI8jeTKeavUAPtounYmS0V1bfS_wDKm0,17427
|
|
31
31
|
docling_core/transforms/serializer/doctags.py,sha256=mEmRWVuebcG5pZcR1_HX146cyUk0_FjaLQtMXSgh9hs,17870
|
|
32
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
32
|
+
docling_core/transforms/serializer/html.py,sha256=Xq9CU5qZTDdwstizYqWNL_TFNDs9NHK_6JvvZk0TP98,34571
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
34
|
docling_core/transforms/serializer/markdown.py,sha256=YqThAYMsOWSg6nZnnmrUHZohn0QvfZzRqpLrB-Keev8,17873
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
@@ -41,7 +41,7 @@ docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,83
|
|
|
41
41
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
42
42
|
docling_core/types/doc/base.py,sha256=sM3IyFXzVh2WT8IGh5nejXYh8sf39yBh8TBSlHeJ9CI,12611
|
|
43
43
|
docling_core/types/doc/document.py,sha256=eboNYL-QVnDNnw3vL7PPVdPosfs5oNfsrVofxmdBDHw,140884
|
|
44
|
-
docling_core/types/doc/labels.py,sha256=
|
|
44
|
+
docling_core/types/doc/labels.py,sha256=vp4h3e7AmBvezRmgrfuPehjAHTZOufphErLB4ENhdME,7171
|
|
45
45
|
docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
|
|
46
46
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
47
47
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
@@ -73,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
73
73
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
74
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
75
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
76
|
-
docling_core-2.
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
76
|
+
docling_core-2.31.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.31.0.dist-info/METADATA,sha256=OohNxPwKcbRVVKm_kpa3HRFhYh9ZMVyBIlf3apF9hm4,5976
|
|
78
|
+
docling_core-2.31.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
+
docling_core-2.31.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
+
docling_core-2.31.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|