docling-2.53.0-py3-none-any.whl → docling-2.54.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. (The original page links to more details here; the link target was not preserved in this text.)
- docling/backend/msword_backend.py +176 -75
- docling/backend/webvtt_backend.py +572 -0
- docling/datamodel/base_models.py +23 -23
- docling/datamodel/document.py +2 -0
- docling/document_converter.py +4 -0
- docling/models/table_structure_model.py +3 -3
- {docling-2.53.0.dist-info → docling-2.54.0.dist-info}/METADATA +4 -4
- {docling-2.53.0.dist-info → docling-2.54.0.dist-info}/RECORD +12 -11
- {docling-2.53.0.dist-info → docling-2.54.0.dist-info}/WHEEL +0 -0
- {docling-2.53.0.dist-info → docling-2.54.0.dist-info}/entry_points.txt +0 -0
- {docling-2.53.0.dist-info → docling-2.54.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.53.0.dist-info → docling-2.54.0.dist-info}/top_level.txt +0 -0
|
@@ -12,8 +12,11 @@ from docling_core.types.doc import (
|
|
|
12
12
|
ImageRef,
|
|
13
13
|
ListGroup,
|
|
14
14
|
NodeItem,
|
|
15
|
+
RefItem,
|
|
16
|
+
RichTableCell,
|
|
15
17
|
TableCell,
|
|
16
18
|
TableData,
|
|
19
|
+
TextItem,
|
|
17
20
|
)
|
|
18
21
|
from docling_core.types.doc.document import Formatting
|
|
19
22
|
from docx import Document
|
|
@@ -128,7 +131,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
128
131
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
|
129
132
|
if self.is_valid():
|
|
130
133
|
assert self.docx_obj is not None
|
|
131
|
-
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
|
134
|
+
doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
|
135
|
+
# doc, _ = doc_info
|
|
132
136
|
return doc
|
|
133
137
|
else:
|
|
134
138
|
raise RuntimeError(
|
|
@@ -172,7 +176,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
172
176
|
body: BaseOxmlElement,
|
|
173
177
|
docx_obj: DocxDocument,
|
|
174
178
|
doc: DoclingDocument,
|
|
175
|
-
|
|
179
|
+
# parent:
|
|
180
|
+
) -> tuple[DoclingDocument, list[RefItem]]:
|
|
181
|
+
added_elements = []
|
|
176
182
|
for element in body:
|
|
177
183
|
tag_name = etree.QName(element).localname
|
|
178
184
|
# Check for Inline Images (blip elements)
|
|
@@ -230,8 +236,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
230
236
|
parent=self.parents[level - 1],
|
|
231
237
|
name="shape-text",
|
|
232
238
|
)
|
|
239
|
+
added_elements.append(shape_group.get_ref())
|
|
233
240
|
doc.add_text(
|
|
234
|
-
label=DocItemLabel.
|
|
241
|
+
label=DocItemLabel.TEXT,
|
|
235
242
|
parent=shape_group,
|
|
236
243
|
text=text_content,
|
|
237
244
|
)
|
|
@@ -246,23 +253,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
246
253
|
_log.debug(
|
|
247
254
|
f"Found textbox content with {len(textbox_elements)} elements"
|
|
248
255
|
)
|
|
249
|
-
self._handle_textbox_content(textbox_elements, docx_obj, doc)
|
|
256
|
+
tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
|
|
257
|
+
added_elements.extend(tbc)
|
|
250
258
|
|
|
251
259
|
# Check for Tables
|
|
252
260
|
if element.tag.endswith("tbl"):
|
|
253
261
|
try:
|
|
254
|
-
self._handle_tables(element, docx_obj, doc)
|
|
262
|
+
t = self._handle_tables(element, docx_obj, doc)
|
|
263
|
+
added_elements.extend(t)
|
|
255
264
|
except Exception:
|
|
256
265
|
_log.debug("could not parse a table, broken docx table")
|
|
257
266
|
# Check for Image
|
|
258
267
|
elif drawing_blip:
|
|
259
|
-
self._handle_pictures(docx_obj, drawing_blip, doc)
|
|
268
|
+
pics = self._handle_pictures(docx_obj, drawing_blip, doc)
|
|
269
|
+
added_elements.extend(pics)
|
|
260
270
|
# Check for Text after the Image
|
|
261
271
|
if (
|
|
262
272
|
tag_name in ["p"]
|
|
263
273
|
and element.find(".//w:t", namespaces=namespaces) is not None
|
|
264
274
|
):
|
|
265
|
-
self._handle_text_elements(element, docx_obj, doc)
|
|
275
|
+
te1 = self._handle_text_elements(element, docx_obj, doc)
|
|
276
|
+
added_elements.extend(te1)
|
|
266
277
|
# Check for the sdt containers, like table of contents
|
|
267
278
|
elif tag_name in ["sdt"]:
|
|
268
279
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
|
@@ -270,15 +281,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
270
281
|
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
|
271
282
|
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
|
272
283
|
for p in paragraphs:
|
|
273
|
-
self._handle_text_elements(p, docx_obj, doc)
|
|
284
|
+
te = self._handle_text_elements(p, docx_obj, doc)
|
|
285
|
+
added_elements.extend(te)
|
|
274
286
|
# Check for Text
|
|
275
287
|
elif tag_name in ["p"]:
|
|
276
288
|
# "tcPr", "sectPr"
|
|
277
|
-
self._handle_text_elements(element, docx_obj, doc)
|
|
289
|
+
te = self._handle_text_elements(element, docx_obj, doc)
|
|
290
|
+
added_elements.extend(te)
|
|
278
291
|
else:
|
|
279
292
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
|
280
293
|
|
|
281
|
-
return doc
|
|
294
|
+
return doc, added_elements
|
|
282
295
|
|
|
283
296
|
def _str_to_int(
|
|
284
297
|
self, s: Optional[str], default: Optional[int] = 0
|
|
@@ -674,14 +687,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
674
687
|
textbox_elements: list,
|
|
675
688
|
docx_obj: DocxDocument,
|
|
676
689
|
doc: DoclingDocument,
|
|
677
|
-
) ->
|
|
690
|
+
) -> List[RefItem]:
|
|
691
|
+
elem_ref: List[RefItem] = []
|
|
678
692
|
"""Process textbox content and add it to the document structure."""
|
|
679
693
|
level = self._get_level()
|
|
680
694
|
# Create a textbox group to contain all text from the textbox
|
|
681
695
|
textbox_group = doc.add_group(
|
|
682
696
|
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
|
|
683
697
|
)
|
|
684
|
-
|
|
698
|
+
elem_ref.append(textbox_group.get_ref())
|
|
685
699
|
# Set this as the current parent to ensure textbox content
|
|
686
700
|
# is properly nested in document structure
|
|
687
701
|
original_parent = self.parents[level]
|
|
@@ -729,11 +743,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
729
743
|
# Mark this paragraph as processed
|
|
730
744
|
processed_paragraphs.add(paragraph_id)
|
|
731
745
|
|
|
732
|
-
self._handle_text_elements(p, docx_obj, doc)
|
|
746
|
+
elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
|
|
733
747
|
|
|
734
748
|
# Restore original parent
|
|
735
749
|
self.parents[level] = original_parent
|
|
736
|
-
return
|
|
750
|
+
return elem_ref
|
|
737
751
|
|
|
738
752
|
def _handle_equations_in_text(self, element, text):
|
|
739
753
|
only_texts = []
|
|
@@ -803,7 +817,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
803
817
|
element: BaseOxmlElement,
|
|
804
818
|
docx_obj: DocxDocument,
|
|
805
819
|
doc: DoclingDocument,
|
|
806
|
-
) ->
|
|
820
|
+
) -> List[RefItem]:
|
|
821
|
+
elem_ref: List[RefItem] = []
|
|
807
822
|
paragraph = Paragraph(element, docx_obj)
|
|
808
823
|
paragraph_elements = self._get_paragraph_elements(paragraph)
|
|
809
824
|
text, equations = self._handle_equations_in_text(
|
|
@@ -811,7 +826,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
811
826
|
)
|
|
812
827
|
|
|
813
828
|
if text is None:
|
|
814
|
-
return
|
|
829
|
+
return elem_ref
|
|
815
830
|
text = text.strip()
|
|
816
831
|
|
|
817
832
|
# Common styles for bullet and numbered lists.
|
|
@@ -832,15 +847,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
832
847
|
# Check if this is actually a numbered list by examining the numFmt
|
|
833
848
|
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
|
|
834
849
|
|
|
835
|
-
self._add_list_item(
|
|
850
|
+
li = self._add_list_item(
|
|
836
851
|
doc=doc,
|
|
837
852
|
numid=numid,
|
|
838
853
|
ilevel=ilevel,
|
|
839
854
|
elements=paragraph_elements,
|
|
840
855
|
is_numbered=is_numbered,
|
|
841
856
|
)
|
|
857
|
+
elem_ref.extend(li) # MUST BE REF!!!
|
|
842
858
|
self._update_history(p_style_id, p_level, numid, ilevel)
|
|
843
|
-
return
|
|
859
|
+
return elem_ref
|
|
844
860
|
elif (
|
|
845
861
|
numid is None
|
|
846
862
|
and self._prev_numid() is not None
|
|
@@ -860,9 +876,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
860
876
|
if p_style_id in ["Title"]:
|
|
861
877
|
for key in range(len(self.parents)):
|
|
862
878
|
self.parents[key] = None
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
)
|
|
879
|
+
te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
|
|
880
|
+
self.parents[0] = te
|
|
881
|
+
elem_ref.append(te.get_ref())
|
|
866
882
|
elif "Heading" in p_style_id:
|
|
867
883
|
style_element = getattr(paragraph.style, "element", None)
|
|
868
884
|
if style_element is not None:
|
|
@@ -871,7 +887,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
871
887
|
)
|
|
872
888
|
else:
|
|
873
889
|
is_numbered_style = False
|
|
874
|
-
self._add_header(doc, p_level, text, is_numbered_style)
|
|
890
|
+
h1 = self._add_header(doc, p_level, text, is_numbered_style)
|
|
891
|
+
elem_ref.extend(h1)
|
|
875
892
|
|
|
876
893
|
elif len(equations) > 0:
|
|
877
894
|
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
|
|
@@ -879,15 +896,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
879
896
|
) > 0:
|
|
880
897
|
# Standalone equation
|
|
881
898
|
level = self._get_level()
|
|
882
|
-
doc.add_text(
|
|
899
|
+
t1 = doc.add_text(
|
|
883
900
|
label=DocItemLabel.FORMULA,
|
|
884
901
|
parent=self.parents[level - 1],
|
|
885
902
|
text=text.replace("<eq>", "").replace("</eq>", ""),
|
|
886
903
|
)
|
|
904
|
+
elem_ref.append(t1.get_ref())
|
|
887
905
|
else:
|
|
888
906
|
# Inline equation
|
|
889
907
|
level = self._get_level()
|
|
890
908
|
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
|
|
909
|
+
elem_ref.append(inline_equation.get_ref())
|
|
891
910
|
text_tmp = text
|
|
892
911
|
for eq in equations:
|
|
893
912
|
if len(text_tmp) == 0:
|
|
@@ -899,23 +918,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
899
918
|
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
|
|
900
919
|
|
|
901
920
|
if len(pre_eq_text) > 0:
|
|
902
|
-
doc.add_text(
|
|
903
|
-
label=DocItemLabel.
|
|
921
|
+
e1 = doc.add_text(
|
|
922
|
+
label=DocItemLabel.TEXT,
|
|
904
923
|
parent=inline_equation,
|
|
905
924
|
text=pre_eq_text,
|
|
906
925
|
)
|
|
907
|
-
|
|
926
|
+
elem_ref.append(e1.get_ref())
|
|
927
|
+
e2 = doc.add_text(
|
|
908
928
|
label=DocItemLabel.FORMULA,
|
|
909
929
|
parent=inline_equation,
|
|
910
930
|
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
|
911
931
|
)
|
|
932
|
+
elem_ref.append(e2.get_ref())
|
|
912
933
|
|
|
913
934
|
if len(text_tmp) > 0:
|
|
914
|
-
doc.add_text(
|
|
915
|
-
label=DocItemLabel.
|
|
935
|
+
e3 = doc.add_text(
|
|
936
|
+
label=DocItemLabel.TEXT,
|
|
916
937
|
parent=inline_equation,
|
|
917
938
|
text=text_tmp.strip(),
|
|
918
939
|
)
|
|
940
|
+
elem_ref.append(e3.get_ref())
|
|
919
941
|
|
|
920
942
|
elif p_style_id in [
|
|
921
943
|
"Paragraph",
|
|
@@ -934,13 +956,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
934
956
|
paragraph_elements=paragraph_elements,
|
|
935
957
|
)
|
|
936
958
|
for text, format, hyperlink in paragraph_elements:
|
|
937
|
-
doc.add_text(
|
|
938
|
-
label=DocItemLabel.
|
|
959
|
+
t2 = doc.add_text(
|
|
960
|
+
label=DocItemLabel.TEXT,
|
|
939
961
|
parent=parent,
|
|
940
962
|
text=text,
|
|
941
963
|
formatting=format,
|
|
942
964
|
hyperlink=hyperlink,
|
|
943
965
|
)
|
|
966
|
+
elem_ref.append(t2.get_ref())
|
|
944
967
|
|
|
945
968
|
else:
|
|
946
969
|
# Text style names can, and will have, not only default values but user values too
|
|
@@ -952,16 +975,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
952
975
|
paragraph_elements=paragraph_elements,
|
|
953
976
|
)
|
|
954
977
|
for text, format, hyperlink in paragraph_elements:
|
|
955
|
-
doc.add_text(
|
|
956
|
-
label=DocItemLabel.
|
|
978
|
+
t3 = doc.add_text(
|
|
979
|
+
label=DocItemLabel.TEXT,
|
|
957
980
|
parent=parent,
|
|
958
981
|
text=text,
|
|
959
982
|
formatting=format,
|
|
960
983
|
hyperlink=hyperlink,
|
|
961
984
|
)
|
|
985
|
+
elem_ref.append(t3.get_ref())
|
|
962
986
|
|
|
963
987
|
self._update_history(p_style_id, p_level, numid, ilevel)
|
|
964
|
-
return
|
|
988
|
+
return elem_ref
|
|
965
989
|
|
|
966
990
|
def _add_header(
|
|
967
991
|
self,
|
|
@@ -969,17 +993,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
969
993
|
curr_level: Optional[int],
|
|
970
994
|
text: str,
|
|
971
995
|
is_numbered_style: bool = False,
|
|
972
|
-
) ->
|
|
996
|
+
) -> List[RefItem]:
|
|
997
|
+
elem_ref: List[RefItem] = []
|
|
973
998
|
level = self._get_level()
|
|
974
999
|
if isinstance(curr_level, int):
|
|
975
1000
|
if curr_level > level:
|
|
976
1001
|
# add invisible group
|
|
977
1002
|
for i in range(level, curr_level):
|
|
978
|
-
|
|
1003
|
+
gr1 = doc.add_group(
|
|
979
1004
|
parent=self.parents[i - 1],
|
|
980
1005
|
label=GroupLabel.SECTION,
|
|
981
1006
|
name=f"header-{i}",
|
|
982
1007
|
)
|
|
1008
|
+
elem_ref.append(gr1.get_ref())
|
|
1009
|
+
self.parents[i] = gr1
|
|
1010
|
+
|
|
983
1011
|
elif curr_level < level:
|
|
984
1012
|
# remove the tail
|
|
985
1013
|
for key in range(len(self.parents)):
|
|
@@ -1019,12 +1047,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1019
1047
|
text = f"{self.numbered_headers[previous_level]}.{text}"
|
|
1020
1048
|
previous_level -= 1
|
|
1021
1049
|
|
|
1022
|
-
|
|
1050
|
+
hd = doc.add_heading(
|
|
1023
1051
|
parent=self.parents[parent_level],
|
|
1024
1052
|
text=text,
|
|
1025
1053
|
level=add_level,
|
|
1026
1054
|
)
|
|
1027
|
-
|
|
1055
|
+
self.parents[current_level] = hd
|
|
1056
|
+
elem_ref.append(hd.get_ref())
|
|
1057
|
+
return elem_ref
|
|
1028
1058
|
|
|
1029
1059
|
def _add_formatted_list_item(
|
|
1030
1060
|
self,
|
|
@@ -1033,12 +1063,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1033
1063
|
marker: str,
|
|
1034
1064
|
enumerated: bool,
|
|
1035
1065
|
level: int,
|
|
1036
|
-
) ->
|
|
1066
|
+
) -> List[RefItem]:
|
|
1067
|
+
elem_ref: List[RefItem] = []
|
|
1037
1068
|
# This should not happen by construction
|
|
1038
1069
|
if not isinstance(self.parents[level], ListGroup):
|
|
1039
|
-
return
|
|
1070
|
+
return elem_ref
|
|
1040
1071
|
if not elements:
|
|
1041
|
-
return
|
|
1072
|
+
return elem_ref
|
|
1042
1073
|
|
|
1043
1074
|
if len(elements) == 1:
|
|
1044
1075
|
text, format, hyperlink = elements[0]
|
|
@@ -1068,6 +1099,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1068
1099
|
formatting=format,
|
|
1069
1100
|
hyperlink=hyperlink,
|
|
1070
1101
|
)
|
|
1102
|
+
return elem_ref
|
|
1071
1103
|
|
|
1072
1104
|
def _add_list_item(
|
|
1073
1105
|
self,
|
|
@@ -1077,10 +1109,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1077
1109
|
ilevel: int,
|
|
1078
1110
|
elements: list,
|
|
1079
1111
|
is_numbered: bool = False,
|
|
1080
|
-
) ->
|
|
1081
|
-
|
|
1112
|
+
) -> List[RefItem]:
|
|
1113
|
+
elem_ref: List[RefItem] = []
|
|
1114
|
+
# this method is always called with is_numbered. Numbered lists should be properly addressed.
|
|
1082
1115
|
if not elements:
|
|
1083
|
-
return
|
|
1116
|
+
return elem_ref
|
|
1084
1117
|
enum_marker = ""
|
|
1085
1118
|
|
|
1086
1119
|
level = self._get_level()
|
|
@@ -1091,9 +1124,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1091
1124
|
# Reset counters for the new numbering sequence
|
|
1092
1125
|
self._reset_list_counters_for_new_sequence(numid)
|
|
1093
1126
|
|
|
1094
|
-
self.parents[level
|
|
1095
|
-
|
|
1096
|
-
)
|
|
1127
|
+
list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
|
|
1128
|
+
self.parents[level] = list_gr
|
|
1129
|
+
elem_ref.append(list_gr.get_ref())
|
|
1097
1130
|
|
|
1098
1131
|
# Set marker and enumerated arguments if this is an enumeration element.
|
|
1099
1132
|
if is_numbered:
|
|
@@ -1114,9 +1147,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1114
1147
|
self.level_at_new_list + prev_indent + 1,
|
|
1115
1148
|
self.level_at_new_list + ilevel + 1,
|
|
1116
1149
|
):
|
|
1117
|
-
self.parents[i
|
|
1118
|
-
|
|
1119
|
-
)
|
|
1150
|
+
list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
|
|
1151
|
+
self.parents[i] = list_gr1
|
|
1152
|
+
elem_ref.append(list_gr1.get_ref())
|
|
1120
1153
|
|
|
1121
1154
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
|
1122
1155
|
if is_numbered:
|
|
@@ -1156,7 +1189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1156
1189
|
)
|
|
1157
1190
|
|
|
1158
1191
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
|
1159
|
-
#
|
|
1192
|
+
# Set marker and enumerated arguments if this is an enumeration element.
|
|
1160
1193
|
if is_numbered:
|
|
1161
1194
|
counter = self._get_list_counter(numid, ilevel)
|
|
1162
1195
|
enum_marker = str(counter) + "."
|
|
@@ -1165,15 +1198,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1165
1198
|
self._add_formatted_list_item(
|
|
1166
1199
|
doc, elements, enum_marker, is_numbered, level - 1
|
|
1167
1200
|
)
|
|
1168
|
-
|
|
1169
|
-
return
|
|
1201
|
+
return elem_ref
|
|
1170
1202
|
|
|
1171
1203
|
def _handle_tables(
|
|
1172
1204
|
self,
|
|
1173
1205
|
element: BaseOxmlElement,
|
|
1174
1206
|
docx_obj: DocxDocument,
|
|
1175
1207
|
doc: DoclingDocument,
|
|
1176
|
-
) ->
|
|
1208
|
+
) -> List[RefItem]:
|
|
1209
|
+
elem_ref: List[RefItem] = []
|
|
1177
1210
|
table: Table = Table(element, docx_obj)
|
|
1178
1211
|
num_rows = len(table.rows)
|
|
1179
1212
|
num_cols = len(table.columns)
|
|
@@ -1184,9 +1217,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1184
1217
|
# In case we have a table of only 1 cell, we consider it furniture
|
|
1185
1218
|
# And proceed processing the content of the cell as though it's in the document body
|
|
1186
1219
|
self._walk_linear(cell_element._element, docx_obj, doc)
|
|
1187
|
-
return
|
|
1220
|
+
return elem_ref
|
|
1188
1221
|
|
|
1189
1222
|
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
|
1223
|
+
level = self._get_level()
|
|
1224
|
+
docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
|
|
1225
|
+
elem_ref.append(docling_table.get_ref())
|
|
1226
|
+
|
|
1190
1227
|
cell_set: set[CT_Tc] = set()
|
|
1191
1228
|
for row_idx, row in enumerate(table.rows):
|
|
1192
1229
|
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
|
@@ -1223,27 +1260,87 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1223
1260
|
else:
|
|
1224
1261
|
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
|
1225
1262
|
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1263
|
+
provs_in_cell: List[RefItem] = []
|
|
1264
|
+
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
|
|
1265
|
+
ref_for_rich_cell = provs_in_cell[0]
|
|
1266
|
+
rich_table_cell = False
|
|
1267
|
+
|
|
1268
|
+
def group_cell_elements(
|
|
1269
|
+
group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
|
|
1270
|
+
) -> RefItem:
|
|
1271
|
+
group_element = doc.add_group(
|
|
1272
|
+
label=GroupLabel.UNSPECIFIED,
|
|
1273
|
+
name=group_name,
|
|
1274
|
+
parent=docling_table,
|
|
1275
|
+
)
|
|
1276
|
+
for prov in provs_in_cell:
|
|
1277
|
+
group_element.children.append(prov)
|
|
1278
|
+
pr_item = prov.resolve(doc)
|
|
1279
|
+
item_parent = pr_item.parent.resolve(doc)
|
|
1280
|
+
if pr_item.get_ref() in item_parent.children:
|
|
1281
|
+
item_parent.children.remove(pr_item.get_ref())
|
|
1282
|
+
pr_item.parent = group_element.get_ref()
|
|
1283
|
+
ref_for_rich_cell = group_element.get_ref()
|
|
1284
|
+
return ref_for_rich_cell
|
|
1285
|
+
|
|
1286
|
+
if len(provs_in_cell) > 1:
|
|
1287
|
+
# Cell has multiple elements, we need to group them
|
|
1288
|
+
rich_table_cell = True
|
|
1289
|
+
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
|
|
1290
|
+
ref_for_rich_cell = group_cell_elements(
|
|
1291
|
+
group_name, doc, provs_in_cell
|
|
1292
|
+
)
|
|
1239
1293
|
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1294
|
+
elif len(provs_in_cell) == 1:
|
|
1295
|
+
item_ref = provs_in_cell[0]
|
|
1296
|
+
pr_item = item_ref.resolve(doc)
|
|
1297
|
+
if isinstance(pr_item, TextItem):
|
|
1298
|
+
# Cell has only one element and it's just a text
|
|
1299
|
+
rich_table_cell = False
|
|
1300
|
+
doc.delete_items(node_items=[pr_item])
|
|
1301
|
+
else:
|
|
1302
|
+
rich_table_cell = True
|
|
1303
|
+
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
|
|
1304
|
+
ref_for_rich_cell = group_cell_elements(
|
|
1305
|
+
group_name, doc, provs_in_cell
|
|
1306
|
+
)
|
|
1307
|
+
else:
|
|
1308
|
+
rich_table_cell = False
|
|
1309
|
+
|
|
1310
|
+
if rich_table_cell:
|
|
1311
|
+
rich_cell = RichTableCell(
|
|
1312
|
+
text=text,
|
|
1313
|
+
row_span=spanned_idx - row_idx,
|
|
1314
|
+
col_span=cell.grid_span,
|
|
1315
|
+
start_row_offset_idx=row.grid_cols_before + row_idx,
|
|
1316
|
+
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
|
1317
|
+
start_col_offset_idx=col_idx,
|
|
1318
|
+
end_col_offset_idx=col_idx + cell.grid_span,
|
|
1319
|
+
column_header=row.grid_cols_before + row_idx == 0,
|
|
1320
|
+
row_header=False,
|
|
1321
|
+
ref=ref_for_rich_cell, # points to an artificial group around children
|
|
1322
|
+
)
|
|
1323
|
+
doc.add_table_cell(table_item=docling_table, cell=rich_cell)
|
|
1324
|
+
col_idx += cell.grid_span
|
|
1325
|
+
else:
|
|
1326
|
+
simple_cell = TableCell(
|
|
1327
|
+
text=text,
|
|
1328
|
+
row_span=spanned_idx - row_idx,
|
|
1329
|
+
col_span=cell.grid_span,
|
|
1330
|
+
start_row_offset_idx=row.grid_cols_before + row_idx,
|
|
1331
|
+
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
|
1332
|
+
start_col_offset_idx=col_idx,
|
|
1333
|
+
end_col_offset_idx=col_idx + cell.grid_span,
|
|
1334
|
+
column_header=row.grid_cols_before + row_idx == 0,
|
|
1335
|
+
row_header=False,
|
|
1336
|
+
)
|
|
1337
|
+
doc.add_table_cell(table_item=docling_table, cell=simple_cell)
|
|
1338
|
+
col_idx += cell.grid_span
|
|
1339
|
+
return elem_ref
|
|
1243
1340
|
|
|
1244
1341
|
def _handle_pictures(
|
|
1245
1342
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
|
1246
|
-
) ->
|
|
1343
|
+
) -> List[RefItem]:
|
|
1247
1344
|
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
|
1248
1345
|
image_data: Optional[bytes] = None
|
|
1249
1346
|
rId = drawing_blip[0].get(
|
|
@@ -1255,28 +1352,32 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1255
1352
|
image_data = image_part.blob # Get the binary image data
|
|
1256
1353
|
return image_data
|
|
1257
1354
|
|
|
1355
|
+
elem_ref: List[RefItem] = []
|
|
1258
1356
|
level = self._get_level()
|
|
1259
1357
|
# Open the BytesIO object with PIL to create an Image
|
|
1260
1358
|
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
|
1261
1359
|
if image_data is None:
|
|
1262
1360
|
_log.warning("Warning: image cannot be found")
|
|
1263
|
-
doc.add_picture(
|
|
1361
|
+
p1 = doc.add_picture(
|
|
1264
1362
|
parent=self.parents[level - 1],
|
|
1265
1363
|
caption=None,
|
|
1266
1364
|
)
|
|
1365
|
+
elem_ref.append(p1.get_ref())
|
|
1267
1366
|
else:
|
|
1268
1367
|
try:
|
|
1269
1368
|
image_bytes = BytesIO(image_data)
|
|
1270
1369
|
pil_image = Image.open(image_bytes)
|
|
1271
|
-
doc.add_picture(
|
|
1370
|
+
p2 = doc.add_picture(
|
|
1272
1371
|
parent=self.parents[level - 1],
|
|
1273
1372
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
|
1274
1373
|
caption=None,
|
|
1275
1374
|
)
|
|
1375
|
+
elem_ref.append(p2.get_ref())
|
|
1276
1376
|
except (UnidentifiedImageError, OSError):
|
|
1277
1377
|
_log.warning("Warning: image cannot be loaded by Pillow")
|
|
1278
|
-
doc.add_picture(
|
|
1378
|
+
p3 = doc.add_picture(
|
|
1279
1379
|
parent=self.parents[level - 1],
|
|
1280
1380
|
caption=None,
|
|
1281
1381
|
)
|
|
1282
|
-
|
|
1382
|
+
elem_ref.append(p3.get_ref())
|
|
1383
|
+
return elem_ref
|