docling-core 2.44.0__py3-none-any.whl → 2.44.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/serializer/html.py +32 -72
- docling_core/types/doc/document.py +14 -12
- {docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/METADATA +1 -1
- {docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/RECORD +8 -8
- {docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/WHEEL +0 -0
- {docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/entry_points.txt +0 -0
- {docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/top_level.txt +0 -0
|
@@ -130,11 +130,14 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
130
130
|
doc_serializer: BaseDocSerializer,
|
|
131
131
|
doc: DoclingDocument,
|
|
132
132
|
is_inline_scope: bool = False,
|
|
133
|
+
visited: Optional[set[str]] = None,
|
|
133
134
|
**kwargs: Any,
|
|
134
135
|
) -> SerializationResult:
|
|
135
136
|
"""Serializes the passed text item to HTML."""
|
|
136
137
|
params = HTMLParams(**kwargs)
|
|
138
|
+
my_visited: set[str] = visited if visited is not None else set()
|
|
137
139
|
res_parts: list[SerializationResult] = []
|
|
140
|
+
post_processed = False
|
|
138
141
|
|
|
139
142
|
# Prepare the HTML based on item type
|
|
140
143
|
if isinstance(item, TitleItem):
|
|
@@ -162,7 +165,28 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
162
165
|
|
|
163
166
|
elif isinstance(item, ListItem):
|
|
164
167
|
# List items are handled by list serializer
|
|
165
|
-
|
|
168
|
+
text_parts: list[str] = []
|
|
169
|
+
if item_text := self._prepare_content(item.text):
|
|
170
|
+
item_text = doc_serializer.post_process(
|
|
171
|
+
text=item_text,
|
|
172
|
+
formatting=item.formatting,
|
|
173
|
+
hyperlink=item.hyperlink,
|
|
174
|
+
)
|
|
175
|
+
post_processed = True
|
|
176
|
+
text_parts.append(item_text)
|
|
177
|
+
nested_parts = [
|
|
178
|
+
r.text
|
|
179
|
+
for r in doc_serializer.get_parts(
|
|
180
|
+
item=item,
|
|
181
|
+
is_inline_scope=is_inline_scope,
|
|
182
|
+
visited=my_visited,
|
|
183
|
+
**kwargs,
|
|
184
|
+
)
|
|
185
|
+
]
|
|
186
|
+
text_parts.extend(nested_parts)
|
|
187
|
+
text_inner = "\n".join(text_parts)
|
|
188
|
+
if nested_parts:
|
|
189
|
+
text_inner = f"\n{text_inner}\n"
|
|
166
190
|
text = (
|
|
167
191
|
get_html_tag_with_text_direction(
|
|
168
192
|
html_tag="li",
|
|
@@ -185,11 +209,12 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
185
209
|
text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
|
|
186
210
|
|
|
187
211
|
# Apply formatting and hyperlinks
|
|
188
|
-
|
|
189
|
-
text=
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
212
|
+
if not post_processed:
|
|
213
|
+
text = doc_serializer.post_process(
|
|
214
|
+
text=text,
|
|
215
|
+
formatting=item.formatting,
|
|
216
|
+
hyperlink=item.hyperlink,
|
|
217
|
+
)
|
|
193
218
|
|
|
194
219
|
if text:
|
|
195
220
|
text_res = create_ser_result(text=text, span_source=item)
|
|
@@ -703,7 +728,6 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
703
728
|
) -> SerializationResult:
|
|
704
729
|
"""Serializes a list to HTML."""
|
|
705
730
|
my_visited: set[str] = visited if visited is not None else set()
|
|
706
|
-
params = HTMLParams(**kwargs)
|
|
707
731
|
# Get all child parts
|
|
708
732
|
parts = doc_serializer.get_parts(
|
|
709
733
|
item=item,
|
|
@@ -713,72 +737,8 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
713
737
|
**kwargs,
|
|
714
738
|
)
|
|
715
739
|
|
|
716
|
-
# Append nested list to parent list item:
|
|
717
|
-
i = 0
|
|
718
|
-
while i < len(parts):
|
|
719
|
-
prt = parts[i]
|
|
720
|
-
if prt.text.startswith(("<ul>", "<ol>")):
|
|
721
|
-
for j in range(i - 1, -1, -1):
|
|
722
|
-
if parts[j].text.startswith(("<li>", "<li ")) and parts[
|
|
723
|
-
j
|
|
724
|
-
].text.endswith("</li>"):
|
|
725
|
-
before, _, _ = parts[j].text.rpartition("</li>")
|
|
726
|
-
parts[j].text = f"{before}\n{prt.text}\n</li>"
|
|
727
|
-
break
|
|
728
|
-
if j > -1:
|
|
729
|
-
parts.pop(i)
|
|
730
|
-
else:
|
|
731
|
-
i += 1
|
|
732
|
-
|
|
733
740
|
# Add all child parts
|
|
734
|
-
text_res = "\n".join(
|
|
735
|
-
[
|
|
736
|
-
(
|
|
737
|
-
p.text
|
|
738
|
-
if (
|
|
739
|
-
(
|
|
740
|
-
p.text.startswith(("<li>", "<li "))
|
|
741
|
-
and p.text.endswith("</li>")
|
|
742
|
-
)
|
|
743
|
-
or (
|
|
744
|
-
p.text.startswith(("<ol>", "<ol "))
|
|
745
|
-
and p.text.endswith("</ol>")
|
|
746
|
-
)
|
|
747
|
-
or (
|
|
748
|
-
p.text.startswith(("<ul>", "<ul "))
|
|
749
|
-
and p.text.endswith("</ul>")
|
|
750
|
-
)
|
|
751
|
-
)
|
|
752
|
-
else (
|
|
753
|
-
get_html_tag_with_text_direction(
|
|
754
|
-
html_tag="li",
|
|
755
|
-
text=p.text,
|
|
756
|
-
attrs=(
|
|
757
|
-
{
|
|
758
|
-
"style": f"list-style-type: '{grandparent_item.marker} ';"
|
|
759
|
-
}
|
|
760
|
-
if params.show_original_list_item_marker
|
|
761
|
-
and grandparent_item.marker
|
|
762
|
-
else {}
|
|
763
|
-
),
|
|
764
|
-
)
|
|
765
|
-
if p.spans
|
|
766
|
-
and p.spans[0].item.parent
|
|
767
|
-
and isinstance(
|
|
768
|
-
(parent_item := p.spans[0].item.parent.resolve(doc)),
|
|
769
|
-
InlineGroup,
|
|
770
|
-
)
|
|
771
|
-
and parent_item.parent
|
|
772
|
-
and isinstance(
|
|
773
|
-
(grandparent_item := parent_item.parent.resolve(doc)),
|
|
774
|
-
ListItem,
|
|
775
|
-
)
|
|
776
|
-
else f"<li>{p.text}</li>"
|
|
777
|
-
)
|
|
778
|
-
)
|
|
779
|
-
for p in parts
|
|
780
|
-
]
|
|
781
|
-
)
|
|
741
|
+
text_res = "\n".join(p.text for p in parts if p.text)
|
|
782
742
|
if text_res:
|
|
783
743
|
tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
|
|
784
744
|
text_res = f"<{tag}>\n{text_res}\n</{tag}>"
|
|
@@ -1373,11 +1373,12 @@ class PictureItem(FloatingItem):
|
|
|
1373
1373
|
) # Encode to Base64 and decode to string
|
|
1374
1374
|
return img_base64
|
|
1375
1375
|
|
|
1376
|
-
|
|
1376
|
+
@staticmethod
|
|
1377
|
+
def _image_to_hexhash(img: Optional[PILImage.Image]) -> Optional[str]:
|
|
1377
1378
|
"""Hexash from the image."""
|
|
1378
|
-
if
|
|
1379
|
+
if img is not None:
|
|
1379
1380
|
# Convert the image to raw bytes
|
|
1380
|
-
image_bytes =
|
|
1381
|
+
image_bytes = img.tobytes()
|
|
1381
1382
|
|
|
1382
1383
|
# Create a hash object (e.g., SHA-256)
|
|
1383
1384
|
hasher = hashlib.sha256(usedforsecurity=False)
|
|
@@ -4116,16 +4117,10 @@ class DoclingDocument(BaseModel):
|
|
|
4116
4117
|
if image_dir.is_dir():
|
|
4117
4118
|
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
|
|
4118
4119
|
if isinstance(item, PictureItem):
|
|
4120
|
+
img = item.get_image(doc=self)
|
|
4121
|
+
if img is not None:
|
|
4119
4122
|
|
|
4120
|
-
|
|
4121
|
-
item.image is not None
|
|
4122
|
-
and isinstance(item.image.uri, AnyUrl)
|
|
4123
|
-
and item.image.uri.scheme == "data"
|
|
4124
|
-
and item.image.pil_image is not None
|
|
4125
|
-
):
|
|
4126
|
-
img = item.image.pil_image
|
|
4127
|
-
|
|
4128
|
-
hexhash = item._image_to_hexhash()
|
|
4123
|
+
hexhash = PictureItem._image_to_hexhash(img)
|
|
4129
4124
|
|
|
4130
4125
|
# loc_path = image_dir / f"image_{img_count:06}.png"
|
|
4131
4126
|
if hexhash is not None:
|
|
@@ -4140,6 +4135,11 @@ class DoclingDocument(BaseModel):
|
|
|
4140
4135
|
else:
|
|
4141
4136
|
obj_path = loc_path
|
|
4142
4137
|
|
|
4138
|
+
if item.image is None:
|
|
4139
|
+
scale = img.size[0] / item.prov[0].bbox.width
|
|
4140
|
+
item.image = ImageRef.from_pil(
|
|
4141
|
+
image=img, dpi=round(72 * scale)
|
|
4142
|
+
)
|
|
4143
4143
|
item.image.uri = Path(obj_path)
|
|
4144
4144
|
|
|
4145
4145
|
# if item.image._pil is not None:
|
|
@@ -4539,6 +4539,8 @@ class DoclingDocument(BaseModel):
|
|
|
4539
4539
|
reference_path = None
|
|
4540
4540
|
else:
|
|
4541
4541
|
reference_path = filename.parent
|
|
4542
|
+
artifacts_dir = reference_path / artifacts_dir
|
|
4543
|
+
|
|
4542
4544
|
return artifacts_dir, reference_path
|
|
4543
4545
|
|
|
4544
4546
|
def _make_copy_with_refmode(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.44.0
|
|
3
|
+
Version: 2.44.2
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -30,7 +30,7 @@ docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3
|
|
|
30
30
|
docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
|
|
31
31
|
docling_core/transforms/serializer/common.py,sha256=0TNEGoA_rJ-qkVYp-X8SMUr3jTrbf6TRzPzwufYh5JM,19114
|
|
32
32
|
docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
|
|
33
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
33
|
+
docling_core/transforms/serializer/html.py,sha256=KnSMjtNZlBMfkuhtgB8T70iQSTfG_E8FFDfVRRo9WNs,38087
|
|
34
34
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
35
35
|
docling_core/transforms/serializer/markdown.py,sha256=VwonuAkuOPmQM7ibDIGvQBHOqhTcTJ_t187fLQQiNPo,23951
|
|
36
36
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
@@ -43,7 +43,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
43
43
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
44
44
|
docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
|
|
45
45
|
docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
|
|
46
|
-
docling_core/types/doc/document.py,sha256
|
|
46
|
+
docling_core/types/doc/document.py,sha256=-cL4eGFRbQHgXAsCG8zALxAx-IoanvkqG5E1zvKOMxI,201012
|
|
47
47
|
docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
|
|
48
48
|
docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
|
|
49
49
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -76,9 +76,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
76
76
|
docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
|
|
77
77
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
78
78
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
79
|
-
docling_core-2.44.
|
|
80
|
-
docling_core-2.44.
|
|
81
|
-
docling_core-2.44.
|
|
82
|
-
docling_core-2.44.
|
|
83
|
-
docling_core-2.44.
|
|
84
|
-
docling_core-2.44.
|
|
79
|
+
docling_core-2.44.2.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
80
|
+
docling_core-2.44.2.dist-info/METADATA,sha256=IZWVMKuPPpzd3ksiFXTPUu3FSw13zuwa5qyaLWlBEyY,6453
|
|
81
|
+
docling_core-2.44.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
82
|
+
docling_core-2.44.2.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
83
|
+
docling_core-2.44.2.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
84
|
+
docling_core-2.44.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|