docling 2.25.2__py3-none-any.whl → 2.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +21 -13
- docling/backend/docling_parse_v2_backend.py +20 -12
- docling/backend/docling_parse_v4_backend.py +185 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +271 -0
- docling/backend/docx/latex/omml.py +453 -0
- docling/backend/html_backend.py +7 -7
- docling/backend/md_backend.py +1 -1
- docling/backend/msexcel_backend.py +2 -45
- docling/backend/mspowerpoint_backend.py +1 -1
- docling/backend/msword_backend.py +65 -3
- docling/backend/pdf_backend.py +7 -2
- docling/backend/pypdfium2_backend.py +52 -30
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +62 -23
- docling/cli/models.py +1 -1
- docling/datamodel/base_models.py +8 -10
- docling/datamodel/pipeline_options.py +27 -31
- docling/document_converter.py +5 -5
- docling/models/base_model.py +9 -1
- docling/models/base_ocr_model.py +27 -16
- docling/models/code_formula_model.py +84 -5
- docling/models/document_picture_classifier.py +1 -1
- docling/models/easyocr_model.py +28 -13
- docling/models/factories/__init__.py +27 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/ocr_mac_model.py +39 -11
- docling/models/page_preprocessing_model.py +4 -0
- docling/models/picture_description_api_model.py +20 -3
- docling/models/picture_description_base_model.py +19 -3
- docling/models/picture_description_vlm_model.py +14 -2
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +28 -0
- docling/models/rapid_ocr_model.py +34 -13
- docling/models/table_structure_model.py +14 -5
- docling/models/tesseract_ocr_cli_model.py +40 -15
- docling/models/tesseract_ocr_model.py +37 -12
- docling/pipeline/standard_pdf_pipeline.py +25 -78
- docling/utils/export.py +8 -6
- docling/utils/layout_postprocessor.py +26 -23
- docling/utils/visualization.py +1 -1
- {docling-2.25.2.dist-info → docling-2.27.0.dist-info}/METADATA +48 -19
- docling-2.27.0.dist-info/RECORD +83 -0
- {docling-2.25.2.dist-info → docling-2.27.0.dist-info}/entry_points.txt +3 -0
- docling-2.25.2.dist-info/RECORD +0 -72
- {docling-2.25.2.dist-info → docling-2.27.0.dist-info}/LICENSE +0 -0
- {docling-2.25.2.dist-info → docling-2.27.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,453 @@
|
|
1
|
+
"""
|
2
|
+
Office Math Markup Language (OMML)
|
3
|
+
|
4
|
+
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
5
|
+
On 23/01/2025
|
6
|
+
"""
|
7
|
+
|
8
|
+
import lxml.etree as ET
|
9
|
+
from pylatexenc.latexencode import UnicodeToLatexEncoder
|
10
|
+
|
11
|
+
from docling.backend.docx.latex.latex_dict import (
|
12
|
+
ALN,
|
13
|
+
ARR,
|
14
|
+
BACKSLASH,
|
15
|
+
BLANK,
|
16
|
+
BRK,
|
17
|
+
CHARS,
|
18
|
+
CHR,
|
19
|
+
CHR_BO,
|
20
|
+
CHR_DEFAULT,
|
21
|
+
D_DEFAULT,
|
22
|
+
F_DEFAULT,
|
23
|
+
FUNC,
|
24
|
+
FUNC_PLACE,
|
25
|
+
LIM_FUNC,
|
26
|
+
LIM_TO,
|
27
|
+
LIM_UPP,
|
28
|
+
POS,
|
29
|
+
POS_DEFAULT,
|
30
|
+
RAD,
|
31
|
+
RAD_DEFAULT,
|
32
|
+
SUB,
|
33
|
+
SUP,
|
34
|
+
D,
|
35
|
+
F,
|
36
|
+
M,
|
37
|
+
T,
|
38
|
+
)
|
39
|
+
|
40
|
+
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
41
|
+
|
42
|
+
|
43
|
+
def load(stream):
    """Parse an OMML document from *stream* and yield its equations.

    Only ``oMath`` elements that are direct children of the document root
    are visited; each one is wrapped in an ``oMath2Latex`` converter.
    Being a generator, nothing is parsed until the first item is requested.
    """
    document = ET.parse(stream)
    yield from (
        oMath2Latex(node) for node in document.findall(OMML_NS + "oMath")
    )
|
47
|
+
|
48
|
+
|
49
|
+
def load_string(string):
    """Parse OMML markup held in *string* and yield its equations.

    Mirrors ``load`` but starts from an in-memory string: every ``oMath``
    element that is a direct child of the parsed root is wrapped in an
    ``oMath2Latex`` converter and yielded lazily.
    """
    parsed_root = ET.fromstring(string)
    yield from (
        oMath2Latex(node) for node in parsed_root.findall(OMML_NS + "oMath")
    )
|
53
|
+
|
54
|
+
|
55
|
+
def escape_latex(strs):
    """Backslash-escape the characters listed in ``CHARS``.

    Doubled backslashes are first collapsed to a single one.  A character
    from ``CHARS`` is then prefixed with ``BACKSLASH`` unless the character
    immediately before it is already a backslash.
    """
    text = strs.replace(r"\\", "\\")
    # Pair every character with its predecessor (None for the first one).
    return BLANK.join(
        BACKSLASH + ch if (ch in CHARS) and (prev != BACKSLASH) else ch
        for prev, ch in zip([None, *text], text)
    )
|
66
|
+
|
67
|
+
|
68
|
+
def get_val(key, default=None, store=CHR):
    """Translate *key* through the lookup table *store*.

    Returns *default* when *key* is ``None``.  Otherwise returns the entry
    of *store* for *key*, falling back to *key* itself when there is no
    entry or when *store* is empty/falsy.
    """
    if key is None:
        return default
    if store:
        return store.get(key, key)
    return key
|
73
|
+
|
74
|
+
|
75
|
+
class Tag2Method(object):
    """Base class that dispatches OMML child elements to handler methods.

    Subclasses supply a ``tag2meth`` mapping from a tag name (namespace
    removed) to an unbound handler taking ``(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """Run the handler registered for *elm* and return its result.

        *stag* is the tag name without namespace; it is derived from
        ``elm.tag`` when omitted.  Returns ``None`` when no handler is
        registered for the tag.
        """
        registry = self.tag2meth
        tag = elm.tag.replace(OMML_NS, "") if stag is None else stag
        handler = registry.get(tag)
        return handler(self, elm) if handler else None

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(tag, result, child)`` for every OMML child that produced
        a result; *include*, when given, restricts the tags considered.
        """
        for child in tuple(elm):
            if OMML_NS in child.tag:
                short_tag = child.tag.replace(OMML_NS, "")
                if not include or short_tag in include:
                    result = self.call_method(child, stag=short_tag)
                    if result is None:
                        # No registered handler (or it returned nothing):
                        # give the subclass fallback a chance.
                        result = self.process_unknow(child, short_tag)
                    if result is not None:
                        yield (short_tag, result, child)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Later children overwrite earlier ones that share the same tag.
        """
        return {
            tag: result
            for tag, result, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        ``Tag2Method`` results are converted with ``str`` before joining.
        """
        pieces = []
        for _tag, result, _child in self.process_children_list(elm, include):
            pieces.append(str(result) if isinstance(result, Tag2Method) else result)
        return BLANK.join(pieces)

    def process_unknow(self, elm, stag):
        """Fallback for tags without a handler; the base class ignores them."""
        return None
|
126
|
+
|
127
|
+
|
128
|
+
class Pr(Tag2Method):
    """Common properties of an element (an OMML ``*Pr`` node).

    The ``val`` attribute of each child listed in ``__val_tags`` is stored
    and exposed as an instance attribute (``pr.chr``, ``pr.pos``, ...).
    Properties that were not present read as ``None``.  ``text`` holds the
    LaTeX emitted by the children, which is only the break marker produced
    by ``brk`` children.
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        # The store must exist before the children are processed, because
        # the handlers write into it.
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Python 2 leftover; previously passed ``self`` twice and raised
        # TypeError when called.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached for names that normal lookup did not find, i.e.
        # the collected properties; unknown names yield None.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record a break property and return the break marker."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """Store the ``val`` attribute of a known property tag.

        Always returns ``None`` so the property adds nothing to ``text``.
        """
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get("{0}val".format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
|
170
|
+
|
171
|
+
|
172
|
+
class NotSupport(Exception):
    """Raised when an OMML construct has no LaTeX mapping."""


class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The conversion runs in the constructor; read the result through the
    ``latex`` property or ``str(instance)``.
    """

    _t_dict = T

    # Container tags whose content is simply the concatenation of their
    # converted children.
    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
    u = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-all",
        unknown_char_policy="keep",
        unknown_char_warning=False,
    )

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        # process_unicode pads converted symbols with a space on each side,
        # so neighbouring symbols produce doubled spaces; collapse them.
        return self.latex.replace("  ", " ")

    def __unicode__(self):
        # Python 2 leftover; previously passed ``self`` twice and raised
        # TypeError when called.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """Handle tags that have no entry in ``tag2meth``.

        Direct container tags are converted through their children, tags
        ending in ``Pr`` become a ``Pr`` property object, and anything else
        is ignored (``None``).
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == "Pr":
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        """The LaTeX string produced for the element given to ``__init__``."""
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(
            c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["barPr"]
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return pr.text + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["dPr"]
        null = D_DEFAULT.get("null")

        s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
        # An empty delimiter character is replaced by the "null" delimiter.
        delim = pr.text + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )
        return delim

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        """Convert a subscript argument."""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """Convert a superscript argument."""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["fPr"]
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        # do_fname leaves FUNC_PLACE where the argument belongs.
        return func_name.replace(FUNC_PLACE, c_dict.get("e"))

    def do_fname(self, elm):
        """
        the func name

        Raises NotSupport when a plain-text function name is not in FUNC.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    raise NotSupport("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["groupChrPr"]
        latex_s = get_val(pr.chr)
        return pr.text + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises NotSupport when the base expression is not in LIM_FUNC.
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            raise NotSupport("Not support lim %s" % t_dict["e"])
        else:
            return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        rows = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "mPr":
                pass
            elif stag == "mr":
                rows.append(t)
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                # Without a default, a naryPr lacking <m:chr> made get_val
                # return None and the concatenation below raised TypeError.
                # NOTE(review): ECMA-376 appears to define the integral sign
                # as the operator when chr is omitted -- confirm.
                bo = get_val(
                    t.chr, default=CHR_BO.get("\u222b", ""), store=CHR_BO
                )
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def process_unicode(self, s):
        """Convert the text *s* to LaTeX and normalise the encoder output."""
        out_latex_str = self.u.unicode_to_latex(s)

        # Braces that the encoder added around a replacement (they were not
        # in the input) are swapped for surrounding spaces.
        if (
            not s.startswith("{")
            and out_latex_str.startswith("{")
            and not s.endswith("}")
            and out_latex_str.endswith("}")
        ):
            out_latex_str = f" {out_latex_str[1:-1]} "

        # Strip the \ensuremath{...} wrapper; every closing brace in the
        # string is turned into a space along with it.
        if "ensuremath" in out_latex_str:
            out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
            out_latex_str = out_latex_str.replace("}", " ")

        if out_latex_str.strip().startswith("\\text"):
            out_latex_str = f" \\text{{{out_latex_str}}} "

        return out_latex_str

    def do_r(self, elm):
        r"""
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \text (latex pure text support)
        """
        _str = []
        _base_str = []
        # findtext returns None for a run without an <m:t> child; treat that
        # as empty text instead of failing on iteration.
        run_text = elm.findtext("./{0}t".format(OMML_NS)) or ""
        for s in run_text:
            out_latex_str = self.process_unicode(s)
            _str.append(out_latex_str)
            _base_str.append(s)

        proc_str = escape_latex(BLANK.join(_str))
        base_proc_str = BLANK.join(_base_str)

        # Braces introduced by the conversion (absent from the source text)
        # are LaTeX syntax and must not stay escaped.
        if "{" not in base_proc_str and "\\{" in proc_str:
            proc_str = proc_str.replace("\\{", "{")

        if "}" not in base_proc_str and "\\}" in proc_str:
            proc_str = proc_str.replace("\\}", "}")

        return proc_str

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }
|
docling/backend/html_backend.py
CHANGED
@@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
134
134
|
self.analyze_tag(cast(Tag, element), doc)
|
135
135
|
except Exception as exc_child:
|
136
136
|
_log.error(
|
137
|
-
f"Error processing child from tag{tag.name}: {exc_child}"
|
137
|
+
f"Error processing child from tag {tag.name}: {repr(exc_child)}"
|
138
138
|
)
|
139
139
|
raise exc_child
|
140
140
|
elif isinstance(element, NavigableString) and not isinstance(
|
@@ -347,11 +347,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
347
347
|
content_layer=self.content_layer,
|
348
348
|
)
|
349
349
|
self.level += 1
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
350
|
+
self.walk(element, doc)
|
351
|
+
self.parents[self.level + 1] = None
|
352
|
+
self.level -= 1
|
353
|
+
else:
|
354
|
+
self.walk(element, doc)
|
355
355
|
|
356
356
|
elif element.text.strip():
|
357
357
|
text = element.text.strip()
|
@@ -457,7 +457,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
457
457
|
end_row_offset_idx=row_idx + row_span,
|
458
458
|
start_col_offset_idx=col_idx,
|
459
459
|
end_col_offset_idx=col_idx + col_span,
|
460
|
-
|
460
|
+
column_header=col_header,
|
461
461
|
row_header=((not col_header) and html_cell.name == "th"),
|
462
462
|
)
|
463
463
|
data.table_cells.append(table_cell)
|
docling/backend/md_backend.py
CHANGED
@@ -136,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
136
136
|
end_row_offset_idx=trow_ind + row_span,
|
137
137
|
start_col_offset_idx=tcol_ind,
|
138
138
|
end_col_offset_idx=tcol_ind + col_span,
|
139
|
-
|
139
|
+
column_header=trow_ind == 0,
|
140
140
|
row_header=False,
|
141
141
|
)
|
142
142
|
tcells.append(icell)
|
@@ -164,7 +164,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
164
164
|
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
|
165
165
|
start_col_offset_idx=excel_cell.col,
|
166
166
|
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
|
167
|
-
|
167
|
+
column_header=excel_cell.row == 0,
|
168
168
|
row_header=False,
|
169
169
|
)
|
170
170
|
table_data.table_cells.append(cell)
|
@@ -173,7 +173,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
173
173
|
|
174
174
|
return doc
|
175
175
|
|
176
|
-
def _find_data_tables(self, sheet: Worksheet):
|
176
|
+
def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
|
177
177
|
"""
|
178
178
|
Find all compact rectangular data tables in a sheet.
|
179
179
|
"""
|
@@ -340,47 +340,4 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
340
340
|
except:
|
341
341
|
_log.error("could not extract the image from excel sheets")
|
342
342
|
|
343
|
-
"""
|
344
|
-
for idx, chart in enumerate(sheet._charts): # type: ignore
|
345
|
-
try:
|
346
|
-
chart_path = f"chart_{idx + 1}.png"
|
347
|
-
_log.info(
|
348
|
-
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
349
|
-
)
|
350
|
-
|
351
|
-
_log.info(f"Chart {idx + 1}:")
|
352
|
-
|
353
|
-
# Chart type
|
354
|
-
# _log.info(f"Type: {type(chart).__name__}")
|
355
|
-
print(f"Type: {type(chart).__name__}")
|
356
|
-
|
357
|
-
# Extract series data
|
358
|
-
for series_idx, series in enumerate(chart.series):
|
359
|
-
#_log.info(f"Series {series_idx + 1}:")
|
360
|
-
print(f"Series {series_idx + 1} type: {type(series).__name__}")
|
361
|
-
#print(f"x-values: {series.xVal}")
|
362
|
-
#print(f"y-values: {series.yVal}")
|
363
|
-
|
364
|
-
print(f"xval type: {type(series.xVal).__name__}")
|
365
|
-
|
366
|
-
xvals = []
|
367
|
-
for _ in series.xVal.numLit.pt:
|
368
|
-
print(f"xval type: {type(_).__name__}")
|
369
|
-
if hasattr(_, 'v'):
|
370
|
-
xvals.append(_.v)
|
371
|
-
|
372
|
-
print(f"x-values: {xvals}")
|
373
|
-
|
374
|
-
yvals = []
|
375
|
-
for _ in series.yVal:
|
376
|
-
if hasattr(_, 'v'):
|
377
|
-
yvals.append(_.v)
|
378
|
-
|
379
|
-
print(f"y-values: {yvals}")
|
380
|
-
|
381
|
-
except Exception as exc:
|
382
|
-
print(exc)
|
383
|
-
continue
|
384
|
-
"""
|
385
|
-
|
386
343
|
return doc
|
@@ -346,7 +346,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
346
346
|
end_row_offset_idx=row_idx + row_span,
|
347
347
|
start_col_offset_idx=col_idx,
|
348
348
|
end_col_offset_idx=col_idx + col_span,
|
349
|
-
|
349
|
+
column_header=row_idx == 0,
|
350
350
|
row_header=False,
|
351
351
|
)
|
352
352
|
if len(cell.text.strip()) > 0:
|
@@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError
|
|
26
26
|
from typing_extensions import override
|
27
27
|
|
28
28
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
29
|
+
from docling.backend.docx.latex.omml import oMath2Latex
|
29
30
|
from docling.datamodel.base_models import InputFormat
|
30
31
|
from docling.datamodel.document import InputDocument
|
31
32
|
|
@@ -260,6 +261,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
260
261
|
else:
|
261
262
|
return label, None
|
262
263
|
|
264
|
+
def handle_equations_in_text(self, element, text):
    """Rebuild a paragraph's text with its OMML equations as LaTeX.

    Walks every descendant of *element*, collecting the text of non-math
    ``t`` nodes and the LaTeX conversion of each ``oMath`` node in
    document order.

    Returns a ``(text, equations)`` pair.  When the collected plain text
    does not reproduce *text*, the original *text* is returned unchanged
    together with an empty equation list.
    """
    only_texts = []
    only_equations = []
    texts_and_equations = []
    for subt in element.iter():
        tag_name = etree.QName(subt).localname
        if tag_name == "t" and "math" not in subt.tag:
            # An empty text node has ``text`` None, which would make the
            # joins below raise TypeError.
            run_text = subt.text or ""
            only_texts.append(run_text)
            texts_and_equations.append(run_text)
        elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
            latex_equation = str(oMath2Latex(subt))
            only_equations.append(latex_equation)
            texts_and_equations.append(latex_equation)

    if "".join(only_texts) != text:
        # The caller unpacks two values; returning the bare string here
        # used to fail (or mis-unpack a two-character string).
        return text, []

    return "".join(texts_and_equations), only_equations
|
282
|
+
|
263
283
|
def handle_text_elements(
|
264
284
|
self,
|
265
285
|
element: BaseOxmlElement,
|
@@ -268,9 +288,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
268
288
|
) -> None:
|
269
289
|
paragraph = Paragraph(element, docx_obj)
|
270
290
|
|
271
|
-
|
291
|
+
raw_text = paragraph.text
|
292
|
+
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
|
293
|
+
|
294
|
+
if text is None:
|
272
295
|
return
|
273
|
-
text =
|
296
|
+
text = text.strip()
|
274
297
|
|
275
298
|
# Common styles for bullet and numbered lists.
|
276
299
|
# "List Bullet", "List Number", "List Paragraph"
|
@@ -323,6 +346,45 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
323
346
|
elif "Heading" in p_style_id:
|
324
347
|
self.add_header(doc, p_level, text)
|
325
348
|
|
349
|
+
elif len(equations) > 0:
|
350
|
+
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
351
|
+
# Standalone equation
|
352
|
+
level = self.get_level()
|
353
|
+
doc.add_text(
|
354
|
+
label=DocItemLabel.FORMULA,
|
355
|
+
parent=self.parents[level - 1],
|
356
|
+
text=text,
|
357
|
+
)
|
358
|
+
else:
|
359
|
+
# Inline equation
|
360
|
+
level = self.get_level()
|
361
|
+
inline_equation = doc.add_group(
|
362
|
+
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
363
|
+
)
|
364
|
+
text_tmp = text
|
365
|
+
for eq in equations:
|
366
|
+
if len(text_tmp) == 0:
|
367
|
+
break
|
368
|
+
pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
|
369
|
+
text_tmp = text_tmp.split(eq, maxsplit=1)[1]
|
370
|
+
if len(pre_eq_text) > 0:
|
371
|
+
doc.add_text(
|
372
|
+
label=DocItemLabel.PARAGRAPH,
|
373
|
+
parent=inline_equation,
|
374
|
+
text=pre_eq_text,
|
375
|
+
)
|
376
|
+
doc.add_text(
|
377
|
+
label=DocItemLabel.FORMULA,
|
378
|
+
parent=inline_equation,
|
379
|
+
text=eq,
|
380
|
+
)
|
381
|
+
if len(text_tmp) > 0:
|
382
|
+
doc.add_text(
|
383
|
+
label=DocItemLabel.PARAGRAPH,
|
384
|
+
parent=inline_equation,
|
385
|
+
text=text_tmp,
|
386
|
+
)
|
387
|
+
|
326
388
|
elif p_style_id in [
|
327
389
|
"Paragraph",
|
328
390
|
"Normal",
|
@@ -539,7 +601,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
539
601
|
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
540
602
|
start_col_offset_idx=col_idx,
|
541
603
|
end_col_offset_idx=col_idx + cell.grid_span,
|
542
|
-
|
604
|
+
column_header=row.grid_cols_before + row_idx == 0,
|
543
605
|
row_header=False,
|
544
606
|
)
|
545
607
|
data.table_cells.append(table_cell)
|
docling/backend/pdf_backend.py
CHANGED
@@ -4,10 +4,11 @@ from pathlib import Path
|
|
4
4
|
from typing import Iterable, Optional, Set, Union
|
5
5
|
|
6
6
|
from docling_core.types.doc import BoundingBox, Size
|
7
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
7
8
|
from PIL import Image
|
8
9
|
|
9
10
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
10
|
-
from docling.datamodel.base_models import
|
11
|
+
from docling.datamodel.base_models import InputFormat
|
11
12
|
from docling.datamodel.document import InputDocument
|
12
13
|
|
13
14
|
|
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
|
|
17
18
|
pass
|
18
19
|
|
19
20
|
@abstractmethod
|
20
|
-
def
|
21
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return the page as a ``SegmentedPdfPage``, or ``None``.

    Abstract hook with no behaviour of its own.
    """
|
23
|
+
|
24
|
+
@abstractmethod
|
25
|
+
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the text cells of this page.

    Abstract hook with no behaviour of its own.
    """
|
22
27
|
|
23
28
|
@abstractmethod
|