docling 2.26.0__py3-none-any.whl → 2.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +185 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +1 -1
  14. docling/backend/msword_backend.py +65 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +60 -21
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +26 -30
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/ocr_mac_model.py +39 -11
  31. docling/models/page_preprocessing_model.py +4 -0
  32. docling/models/picture_description_api_model.py +20 -3
  33. docling/models/picture_description_base_model.py +19 -3
  34. docling/models/picture_description_vlm_model.py +14 -2
  35. docling/models/plugins/__init__.py +0 -0
  36. docling/models/plugins/defaults.py +28 -0
  37. docling/models/rapid_ocr_model.py +34 -13
  38. docling/models/table_structure_model.py +13 -4
  39. docling/models/tesseract_ocr_cli_model.py +40 -15
  40. docling/models/tesseract_ocr_model.py +37 -12
  41. docling/pipeline/standard_pdf_pipeline.py +25 -78
  42. docling/utils/export.py +8 -6
  43. docling/utils/layout_postprocessor.py +26 -23
  44. docling/utils/visualization.py +1 -1
  45. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/METADATA +48 -19
  46. docling-2.27.0.dist-info/RECORD +83 -0
  47. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/entry_points.txt +3 -0
  48. docling-2.26.0.dist-info/RECORD +0 -72
  49. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/LICENSE +0 -0
  50. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,453 @@
1
+ """
2
+ Office Math Markup Language (OMML)
3
+
4
+ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
5
+ On 23/01/2025
6
+ """
7
+
8
+ import lxml.etree as ET
9
+ from pylatexenc.latexencode import UnicodeToLatexEncoder
10
+
11
+ from docling.backend.docx.latex.latex_dict import (
12
+ ALN,
13
+ ARR,
14
+ BACKSLASH,
15
+ BLANK,
16
+ BRK,
17
+ CHARS,
18
+ CHR,
19
+ CHR_BO,
20
+ CHR_DEFAULT,
21
+ D_DEFAULT,
22
+ F_DEFAULT,
23
+ FUNC,
24
+ FUNC_PLACE,
25
+ LIM_FUNC,
26
+ LIM_TO,
27
+ LIM_UPP,
28
+ POS,
29
+ POS_DEFAULT,
30
+ RAD,
31
+ RAD_DEFAULT,
32
+ SUB,
33
+ SUP,
34
+ D,
35
+ F,
36
+ M,
37
+ T,
38
+ )
39
+
40
+ OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
41
+
42
+
43
def load(stream):
    """Lazily convert the ``oMath`` elements found in an OMML document.

    *stream* is parsed with ``ET.parse``; every element returned by
    ``findall(OMML_NS + "oMath")`` on the parsed tree is wrapped in an
    ``oMath2Latex`` converter.

    Yields:
        One ``oMath2Latex`` instance per matched element, in the order
        ``findall`` returns them.
    """
    document = ET.parse(stream)
    yield from map(oMath2Latex, document.findall(OMML_NS + "oMath"))
47
+
48
+
49
def load_string(string):
    """Lazily convert the ``oMath`` elements found in an OMML string.

    *string* is parsed with ``ET.fromstring``; every element returned by
    ``findall(OMML_NS + "oMath")`` on the resulting root is wrapped in an
    ``oMath2Latex`` converter.

    Yields:
        One ``oMath2Latex`` instance per matched element, in the order
        ``findall`` returns them.
    """
    document_root = ET.fromstring(string)
    yield from map(oMath2Latex, document_root.findall(OMML_NS + "oMath"))
53
+
54
+
55
def escape_latex(strs):
    """Prefix the characters listed in ``CHARS`` with ``BACKSLASH``.

    Every pair of backslashes in *strs* is first collapsed into a single
    backslash. A character from ``CHARS`` is then escaped unless the character
    immediately before it is already ``BACKSLASH``.

    Returns:
        The processed characters joined with ``BLANK``.
    """
    source = strs.replace(r"\\", "\\")
    # Pair every character with its predecessor; the first one has none.
    escaped = [
        BACKSLASH + current
        if current in CHARS and previous != BACKSLASH
        else current
        for previous, current in zip([None, *source], source)
    ]
    return BLANK.join(escaped)
66
+
67
+
68
def get_val(key, default=None, store=CHR):
    """Translate *key* through the lookup table *store*.

    Returns:
        *default* when *key* is ``None``; *key* itself when *store* is falsy
        or has no entry for it; otherwise the entry stored under *key*.
    """
    if key is None:
        return default
    if store:
        return store.get(key, key)
    return key
73
+
74
+
75
class Tag2Method:
    """Dispatch OMML elements to handler functions by tag name.

    Subclasses provide a ``tag2meth`` mapping from the tag name (with the
    ``OMML_NS`` prefix removed) to a plain function taking ``(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """Run the handler registered for *elm*.

        *stag* is the tag name without namespace; it is derived from
        ``elm.tag`` when omitted. Returns the handler's result, or ``None``
        when no handler is registered for the tag.
        """
        lookup = self.tag2meth.get
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        handler = lookup(stag)
        if not handler:
            return None
        # Handlers are stored as plain functions, so ``self`` is passed by hand.
        return handler(self, elm)

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(tag name, converted value, child element)`` for each child
        in the OMML namespace. Children whose tag name is not in *include*
        (when given) and children that convert to ``None`` are skipped.
        """
        for child in list(elm):
            if OMML_NS not in child.tag:
                continue
            name = child.tag.replace(OMML_NS, "")
            if include and (name not in include):
                continue
            converted = self.call_method(child, stag=name)
            if converted is None:
                # No registered handler produced a value; try the fallback.
                converted = self.process_unknow(child, name)
            if converted is not None:
                yield (name, converted, child)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are tag names; a later child with the same tag name replaces an
        earlier one.
        """
        return {
            name: converted
            for name, converted, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        Converted values that are ``Tag2Method`` instances are turned into
        text with ``str`` before everything is joined with ``BLANK``.
        """
        parts = (
            str(converted) if isinstance(converted, Tag2Method) else converted
            for _name, converted, _child in self.process_children_list(elm, include)
        )
        return BLANK.join(parts)

    def process_unknow(self, elm, stag):
        """Fallback for tags without a handler; this base version returns ``None``."""
        return None
126
+
127
+
128
class Pr(Tag2Method):
    """Common properties of an OMML element.

    The property children named in ``__val_tags`` are read from the element's
    ``val`` attribute and stored in a private dict. They are then available as
    attributes (``pr.chr``, ``pr.pos``, ...); any attribute that was not
    collected reads as ``None``. ``text`` holds the concatenated output of the
    handlers that return text (only ``brk`` does).
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        """Collect the property children of *elm*."""
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # ``__str__`` is already bound; passing ``self`` again raised TypeError.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record the ``brk`` property and contribute ``BRK`` to ``text``."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """Store the ``val`` attribute of a supported property tag.

        Always returns ``None`` so the tag adds nothing to ``text``.
        """
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get("{0}val".format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
170
+
171
+
172
class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The conversion runs once in ``__init__``; the result is available through
    the ``latex`` property and through ``str()``.
    """

    _t_dict = T

    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
    u = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-all",
        unknown_char_policy="keep",
        unknown_char_warning=False,
    )

    def __init__(self, element):
        """Convert the children of *element* and keep the resulting string."""
        self._latex = self.process_children(element)

    def __str__(self):
        # NOTE(review): as it appears in this source the replacement maps a
        # single space to a single space, i.e. it changes nothing. The first
        # argument may have lost whitespace in extraction; confirm against
        # the upstream file.
        return self.latex.replace(" ", " ")

    def __unicode__(self):
        # ``__str__`` is already bound; passing ``self`` again raised TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """Fallback for tags without a dedicated handler.

        Tags listed in ``__direct_tags`` are converted through their children,
        tags ending in ``Pr`` become a ``Pr`` object, anything else is dropped.
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == "Pr":
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        """The LaTeX string produced for the wrapped element."""
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(
            c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["barPr"]
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return pr.text + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["dPr"]
        null = D_DEFAULT.get("null")

        s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
        # A falsy delimiter is rendered with the ``null`` placeholder.
        delim = pr.text + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )
        return delim

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        """Wrap the converted children in the ``SUB`` template."""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """Wrap the converted children in the ``SUP`` template."""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["fPr"]
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        # ``do_fname`` leaves a ``FUNC_PLACE`` marker where the argument goes.
        return func_name.replace(FUNC_PLACE, c_dict.get("e"))

    def do_fname(self, elm):
        """
        the func name

        Raises:
            NotImplementedError: when a run holds a function name that has no
                entry in ``FUNC``.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    # ``NotSupport`` is not defined or imported in this
                    # module, so raising it produced a NameError instead.
                    raise NotImplementedError("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["groupChrPr"]
        latex_s = get_val(pr.chr)
        return pr.text + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises:
            NotImplementedError: when the base expression has no entry in
                ``LIM_FUNC``.
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            # ``NotSupport`` is not defined or imported in this module, so
            # raising it produced a NameError instead.
            raise NotImplementedError("Not support lim %s" % t_dict["e"])
        else:
            return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        # Only the ``mr`` rows contribute; ``mPr`` and other children are ignored.
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == "mr"]
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                bo = get_val(t.chr, store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def process_unicode(self, s):
        """Encode *s* with the class-level encoder and tidy the result.

        Three clean-ups are applied in order: an outer brace pair that was
        added by the encoder is replaced by surrounding spaces, an
        ``\\ensuremath{`` wrapper is replaced by spaces (together with every
        closing brace), and output starting with ``\\text`` is wrapped in
        ``\\text{...}``.
        """
        out_latex_str = self.u.unicode_to_latex(s)

        # Braces present in the output but not in the input came from the
        # encoder; replace that outer pair with spaces.
        if (
            not s.startswith("{")
            and out_latex_str.startswith("{")
            and not s.endswith("}")
            and out_latex_str.endswith("}")
        ):
            out_latex_str = f" {out_latex_str[1:-1]} "

        if "ensuremath" in out_latex_str:
            out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
            out_latex_str = out_latex_str.replace("}", " ")

        if out_latex_str.strip().startswith("\\text"):
            out_latex_str = f" \\text{{{out_latex_str}}} "

        return out_latex_str

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \\text (latex pure text support)
        """
        # ``findtext`` yields None for a run without a ``t`` child; treat that
        # as empty text instead of failing while iterating over None.
        run_text = elm.findtext("./{0}t".format(OMML_NS)) or ""

        # The text is encoded one character at a time.
        _str = [self.process_unicode(s) for s in run_text]

        proc_str = escape_latex(BLANK.join(_str))
        base_proc_str = BLANK.join(run_text)

        # Braces absent from the raw text were introduced by the encoder, so
        # the escaping added to them above is undone.
        if "{" not in base_proc_str and "\\{" in proc_str:
            proc_str = proc_str.replace("\\{", "{")

        if "}" not in base_proc_str and "\\}" in proc_str:
            proc_str = proc_str.replace("\\}", "}")

        return proc_str

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }
@@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
134
134
  self.analyze_tag(cast(Tag, element), doc)
135
135
  except Exception as exc_child:
136
136
  _log.error(
137
- f"Error processing child from tag{tag.name}: {exc_child}"
137
+ f"Error processing child from tag {tag.name}: {repr(exc_child)}"
138
138
  )
139
139
  raise exc_child
140
140
  elif isinstance(element, NavigableString) and not isinstance(
@@ -347,11 +347,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
347
347
  content_layer=self.content_layer,
348
348
  )
349
349
  self.level += 1
350
-
351
- self.walk(element, doc)
352
-
353
- self.parents[self.level + 1] = None
354
- self.level -= 1
350
+ self.walk(element, doc)
351
+ self.parents[self.level + 1] = None
352
+ self.level -= 1
353
+ else:
354
+ self.walk(element, doc)
355
355
 
356
356
  elif element.text.strip():
357
357
  text = element.text.strip()
@@ -457,7 +457,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
457
457
  end_row_offset_idx=row_idx + row_span,
458
458
  start_col_offset_idx=col_idx,
459
459
  end_col_offset_idx=col_idx + col_span,
460
- col_header=col_header,
460
+ column_header=col_header,
461
461
  row_header=((not col_header) and html_cell.name == "th"),
462
462
  )
463
463
  data.table_cells.append(table_cell)
@@ -136,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
136
136
  end_row_offset_idx=trow_ind + row_span,
137
137
  start_col_offset_idx=tcol_ind,
138
138
  end_col_offset_idx=tcol_ind + col_span,
139
- col_header=False,
139
+ column_header=trow_ind == 0,
140
140
  row_header=False,
141
141
  )
142
142
  tcells.append(icell)
@@ -164,7 +164,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
164
164
  end_row_offset_idx=excel_cell.row + excel_cell.row_span,
165
165
  start_col_offset_idx=excel_cell.col,
166
166
  end_col_offset_idx=excel_cell.col + excel_cell.col_span,
167
- col_header=False,
167
+ column_header=excel_cell.row == 0,
168
168
  row_header=False,
169
169
  )
170
170
  table_data.table_cells.append(cell)
@@ -173,7 +173,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
173
173
 
174
174
  return doc
175
175
 
176
- def _find_data_tables(self, sheet: Worksheet):
176
+ def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
177
177
  """
178
178
  Find all compact rectangular data tables in a sheet.
179
179
  """
@@ -340,47 +340,4 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
340
340
  except:
341
341
  _log.error("could not extract the image from excel sheets")
342
342
 
343
- """
344
- for idx, chart in enumerate(sheet._charts): # type: ignore
345
- try:
346
- chart_path = f"chart_{idx + 1}.png"
347
- _log.info(
348
- f"Chart found, but dynamic rendering is required for: {chart_path}"
349
- )
350
-
351
- _log.info(f"Chart {idx + 1}:")
352
-
353
- # Chart type
354
- # _log.info(f"Type: {type(chart).__name__}")
355
- print(f"Type: {type(chart).__name__}")
356
-
357
- # Extract series data
358
- for series_idx, series in enumerate(chart.series):
359
- #_log.info(f"Series {series_idx + 1}:")
360
- print(f"Series {series_idx + 1} type: {type(series).__name__}")
361
- #print(f"x-values: {series.xVal}")
362
- #print(f"y-values: {series.yVal}")
363
-
364
- print(f"xval type: {type(series.xVal).__name__}")
365
-
366
- xvals = []
367
- for _ in series.xVal.numLit.pt:
368
- print(f"xval type: {type(_).__name__}")
369
- if hasattr(_, 'v'):
370
- xvals.append(_.v)
371
-
372
- print(f"x-values: {xvals}")
373
-
374
- yvals = []
375
- for _ in series.yVal:
376
- if hasattr(_, 'v'):
377
- yvals.append(_.v)
378
-
379
- print(f"y-values: {yvals}")
380
-
381
- except Exception as exc:
382
- print(exc)
383
- continue
384
- """
385
-
386
343
  return doc
@@ -346,7 +346,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
346
346
  end_row_offset_idx=row_idx + row_span,
347
347
  start_col_offset_idx=col_idx,
348
348
  end_col_offset_idx=col_idx + col_span,
349
- col_header=False,
349
+ column_header=row_idx == 0,
350
350
  row_header=False,
351
351
  )
352
352
  if len(cell.text.strip()) > 0:
@@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError
26
26
  from typing_extensions import override
27
27
 
28
28
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
29
+ from docling.backend.docx.latex.omml import oMath2Latex
29
30
  from docling.datamodel.base_models import InputFormat
30
31
  from docling.datamodel.document import InputDocument
31
32
 
@@ -260,6 +261,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
260
261
  else:
261
262
  return label, None
262
263
 
264
def handle_equations_in_text(self, element, text):
    """Rebuild a paragraph's text with its equations rendered as LaTeX.

    Every node under *element* is visited. A ``t`` node whose tag does not
    contain "math" contributes its text; a node whose tag contains "oMath"
    but not "oMathPara" is converted with ``oMath2Latex``.

    Args:
        element: The paragraph's XML element.
        text: The paragraph text used to validate the reconstruction.

    Returns:
        A ``(text, equations)`` pair. When the collected plain text matches
        *text*, the first item is the text with each equation's LaTeX
        inserted in place and the second is the list of LaTeX strings.
        Otherwise the original *text* is returned with an empty list.
    """
    only_texts = []
    only_equations = []
    texts_and_equations = []
    for subt in element.iter():
        tag_name = etree.QName(subt).localname
        if tag_name == "t" and "math" not in subt.tag:
            # A text node without content has ``text`` set to None, which
            # ``str.join`` below would reject.
            plain_text = subt.text or ""
            only_texts.append(plain_text)
            texts_and_equations.append(plain_text)
        elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
            latex_equation = str(oMath2Latex(subt))
            only_equations.append(latex_equation)
            texts_and_equations.append(latex_equation)

    if "".join(only_texts) != text:
        # The plain text could not be reproduced, so equations are not mixed
        # in. A pair is still returned because the result is unpacked as
        # ``text, equations``.
        return text, []

    return "".join(texts_and_equations), only_equations
282
+
263
283
  def handle_text_elements(
264
284
  self,
265
285
  element: BaseOxmlElement,
@@ -268,9 +288,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
268
288
  ) -> None:
269
289
  paragraph = Paragraph(element, docx_obj)
270
290
 
271
- if paragraph.text is None:
291
+ raw_text = paragraph.text
292
+ text, equations = self.handle_equations_in_text(element=element, text=raw_text)
293
+
294
+ if text is None:
272
295
  return
273
- text = paragraph.text.strip()
296
+ text = text.strip()
274
297
 
275
298
  # Common styles for bullet and numbered lists.
276
299
  # "List Bullet", "List Number", "List Paragraph"
@@ -323,6 +346,45 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
323
346
  elif "Heading" in p_style_id:
324
347
  self.add_header(doc, p_level, text)
325
348
 
349
+ elif len(equations) > 0:
350
+ if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
351
+ # Standalone equation
352
+ level = self.get_level()
353
+ doc.add_text(
354
+ label=DocItemLabel.FORMULA,
355
+ parent=self.parents[level - 1],
356
+ text=text,
357
+ )
358
+ else:
359
+ # Inline equation
360
+ level = self.get_level()
361
+ inline_equation = doc.add_group(
362
+ label=GroupLabel.INLINE, parent=self.parents[level - 1]
363
+ )
364
+ text_tmp = text
365
+ for eq in equations:
366
+ if len(text_tmp) == 0:
367
+ break
368
+ pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
369
+ text_tmp = text_tmp.split(eq, maxsplit=1)[1]
370
+ if len(pre_eq_text) > 0:
371
+ doc.add_text(
372
+ label=DocItemLabel.PARAGRAPH,
373
+ parent=inline_equation,
374
+ text=pre_eq_text,
375
+ )
376
+ doc.add_text(
377
+ label=DocItemLabel.FORMULA,
378
+ parent=inline_equation,
379
+ text=eq,
380
+ )
381
+ if len(text_tmp) > 0:
382
+ doc.add_text(
383
+ label=DocItemLabel.PARAGRAPH,
384
+ parent=inline_equation,
385
+ text=text_tmp,
386
+ )
387
+
326
388
  elif p_style_id in [
327
389
  "Paragraph",
328
390
  "Normal",
@@ -539,7 +601,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
539
601
  end_row_offset_idx=row.grid_cols_before + spanned_idx,
540
602
  start_col_offset_idx=col_idx,
541
603
  end_col_offset_idx=col_idx + cell.grid_span,
542
- col_header=False,
604
+ column_header=row.grid_cols_before + row_idx == 0,
543
605
  row_header=False,
544
606
  )
545
607
  data.table_cells.append(table_cell)
@@ -4,10 +4,11 @@ from pathlib import Path
4
4
  from typing import Iterable, Optional, Set, Union
5
5
 
6
6
  from docling_core.types.doc import BoundingBox, Size
7
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
7
8
  from PIL import Image
8
9
 
9
10
  from docling.backend.abstract_backend import PaginatedDocumentBackend
10
- from docling.datamodel.base_models import Cell, InputFormat
11
+ from docling.datamodel.base_models import InputFormat
11
12
  from docling.datamodel.document import InputDocument
12
13
 
13
14
 
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
17
18
  pass
18
19
 
19
20
  @abstractmethod
20
- def get_text_cells(self) -> Iterable[Cell]:
21
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
22
+ pass
23
+
24
+ @abstractmethod
25
+ def get_text_cells(self) -> Iterable[TextCell]:
21
26
  pass
22
27
 
23
28
  @abstractmethod