docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,459 @@
1
+ """
2
+ Office Math Markup Language (OMML)
3
+
4
+ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
5
+ On 23/01/2025
6
+ """
7
+
8
+ import logging
9
+
10
+ import lxml.etree as ET
11
+ from pylatexenc.latexencode import UnicodeToLatexEncoder
12
+
13
+ from docling.backend.docx.latex.latex_dict import (
14
+ ALN,
15
+ ARR,
16
+ BACKSLASH,
17
+ BLANK,
18
+ BRK,
19
+ CHARS,
20
+ CHR,
21
+ CHR_BO,
22
+ CHR_DEFAULT,
23
+ D_DEFAULT,
24
+ F_DEFAULT,
25
+ FUNC,
26
+ FUNC_PLACE,
27
+ LIM_FUNC,
28
+ LIM_TO,
29
+ LIM_UPP,
30
+ POS,
31
+ POS_DEFAULT,
32
+ RAD,
33
+ RAD_DEFAULT,
34
+ SUB,
35
+ SUP,
36
+ D,
37
+ F,
38
+ M,
39
+ T,
40
+ )
41
+
42
+ OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
43
+
44
+ _log = logging.getLogger(__name__)
45
+
46
+
47
def load(stream):
    """
    Parse the XML document read from ``stream`` and lazily yield one
    ``oMath2Latex`` per ``oMath`` element found by ``findall`` on the tree.

    Being a generator, nothing is parsed until the first item is requested.
    """
    document = ET.parse(stream)
    yield from (
        oMath2Latex(math_elm) for math_elm in document.findall(OMML_NS + "oMath")
    )
51
+
52
+
53
def load_string(string):
    """
    Parse the XML held in ``string`` and lazily yield one ``oMath2Latex`` per
    ``oMath`` element found by ``findall`` on the parsed root element.

    Being a generator, nothing is parsed until the first item is requested.
    """
    document_root = ET.fromstring(string)
    yield from (
        oMath2Latex(math_elm) for math_elm in document_root.findall(OMML_NS + "oMath")
    )
57
+
58
+
59
def escape_latex(strs):
    """
    Prefix every character of ``strs`` that is listed in ``CHARS`` with
    ``BACKSLASH``, unless the character right before it (in the input, after
    doubled backslashes have been reduced to single ones) already equals
    ``BACKSLASH``.

    :param strs: text to escape
    :return: the escaped text, joined with ``BLANK``
    """
    normalized = strs.replace(r"\\", "\\")
    # pair each character with its predecessor; the first one has none
    return BLANK.join(
        BACKSLASH + current if (current in CHARS and previous != BACKSLASH) else current
        for previous, current in zip((None, *normalized), normalized)
    )
70
+
71
+
72
def get_val(key, default=None, store=CHR):
    """
    Translate ``key`` through the lookup table ``store``.

    :param key: value to translate; ``None`` means "not given"
    :param default: returned when ``key`` is ``None``
    :param store: mapping used for the translation; when it is falsy the key
        is returned untouched
    :return: ``default`` for a ``None`` key, otherwise the mapped value or the
        key itself when the mapping has no entry for it
    """
    if key is None:
        return default
    if not store:
        return key
    return store.get(key, key)
77
+
78
+
79
class Tag2Method:
    """
    Base class dispatching an XML element to a handler chosen by its tag.

    Subclasses provide ``tag2meth``: a mapping from the tag name (OMML
    namespace stripped) to a plain function called as ``handler(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """
        Run the handler registered in ``tag2meth`` for ``elm``.

        :param elm: element handed to the handler
        :param stag: tag name without namespace; derived from ``elm.tag``
            when omitted
        :return: the handler's result, or ``None`` when no handler is
            registered for the tag
        """
        getmethod = self.tag2meth.get
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        method = getmethod(stag)
        if method:
            return method(self, elm)
        return None

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(stag, result, child)`` for every child in the OMML
        namespace that produced a result. Children whose handler returns
        ``None`` get a second chance through ``process_unknow``; if that
        returns ``None`` too the child is skipped.

        :param elm: parent element
        :param include: optional collection of tag names; when truthy, only
            children with one of these tags are processed
        """
        for child in list(elm):
            tag = child.tag
            # lxml reports comments and processing instructions as children
            # whose ``tag`` is not a string; the membership test below would
            # raise TypeError on them, so they are skipped up front
            if not isinstance(tag, str) or OMML_NS not in tag:
                continue
            stag = tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            t = self.call_method(child, stag=stag)
            if t is None:
                t = self.process_unknow(child, stag)
            if t is None:
                continue
            yield (stag, t, child)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are tag names; when several children share a tag only the last
        result is kept.
        """
        return {
            stag: t for stag, t, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        Results that are ``Tag2Method`` instances are converted with ``str``
        before everything is joined with ``BLANK``.
        """
        return BLANK.join(
            str(t) if isinstance(t, Tag2Method) else t
            for _stag, t, _child in self.process_children_list(elm, include)
        )

    def process_unknow(self, elm, stag):
        """
        Fallback for children without a usable handler result; the base
        implementation ignores them by returning ``None``.
        """
        return None
129
+
130
+
131
class Pr(Tag2Method):
    """
    common properties of element

    ``text`` holds the string produced from the element's children (only
    ``brk`` children contribute to it). The ``val`` attribute of each ``chr``,
    ``pos``, ``begChr``, ``endChr`` and ``type`` child is recorded and can be
    read as an attribute of the instance; any attribute that was not
    recorded reads as ``None``.
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        # the store must exist before the children are processed: the
        # handlers below write into it
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # ``self.__str__`` is already bound; passing ``self`` again raised
        # TypeError
        return self.__str__()

    def __getattr__(self, name):
        # only reached for names not found the regular way
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record the break and contribute ``BRK`` to ``text``."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """
        Record the element's ``val`` attribute under its tag name.

        Always returns ``None`` so the element adds nothing to ``text``.
        """
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get(f"{OMML_NS}val")
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
172
+
173
+
174
class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The conversion runs in the constructor; the result is available through
    the ``latex`` property and through ``str()``.
    """

    _t_dict = T

    # tags whose LaTeX is simply the concatenation of their children's LaTeX
    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
    u = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-all",
        unknown_char_policy="keep",
        unknown_char_warning=False,
    )

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        # collapse the double spaces left behind by the padding added in
        # process_unicode (replacing a single space with itself did nothing)
        return self.latex.replace("  ", " ")

    def __unicode__(self):
        # ``self.__str__`` is already bound; passing ``self`` again raised
        # TypeError
        return self.__str__()

    def process_unknow(self, elm, stag):
        """
        Handle tags without a usable handler result: direct tags are replaced
        by their children's LaTeX, ``*Pr`` tags become a ``Pr`` object, and
        everything else is ignored (``None``).
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        if stag[-2:] == "Pr":
            return Pr(elm)
        return None

    @property
    def latex(self):
        """The LaTeX string built by the constructor."""
        return self._latex

    def do_acc(self, elm):
        """
        the accent function

        Falls back to the default accent when ``accPr`` or its ``chr`` is
        absent.
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("accPr")
        accent_chr = pr.chr if pr is not None else None
        latex_s = get_val(accent_chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR)
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function

        Falls back to the default bar position when ``barPr`` or its ``pos``
        is absent.
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("barPr")
        if pr is None:
            _log.debug("Missing barPr element in bar, using default formatting")
            position, prefix = None, ""
        else:
            position, prefix = pr.pos, pr.text
        latex_s = get_val(position, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return prefix + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object

        Falls back to the default delimiters when ``dPr`` is absent. An empty
        delimiter value is rendered as the ``null`` delimiter.
        """
        # NOTE: process_children_dict keeps only the last ``e`` child, so a
        # delimiter with several ``e`` children renders only the final one
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("dPr")
        if pr is None:
            _log.debug("Missing dPr element in delimiter, using default formatting")
            beg_chr, end_chr, prefix = None, None, ""
        else:
            beg_chr, end_chr, prefix = pr.begChr, pr.endChr, pr.text
        null = D_DEFAULT.get("null")

        s_val = get_val(beg_chr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(end_chr, default=D_DEFAULT.get("right"), store=T)
        delim = prefix + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )
        return delim

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        return None

    def do_sub(self, elm):
        """Wrap the children's LaTeX in the ``SUB`` template."""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """Wrap the children's LaTeX in the ``SUP`` template."""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object

        Uses the default fraction template when ``fPr`` or its ``type`` is
        absent.
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("fPr")
        num = c_dict.get("num")
        den = c_dict.get("den")
        if pr is None:
            # Handle missing fPr element gracefully
            _log.debug("Missing fPr element in fraction, using default formatting")
            return F_DEFAULT.format(num=num, den=den)
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=num, den=den)

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)

        The argument replaces the ``FUNC_PLACE`` marker left in the function
        name by ``do_fname``. A missing name or argument is treated as empty
        instead of raising.
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        if func_name is None:
            # no name: keep only the placeholder so the argument survives
            func_name = FUNC_PLACE
        argument = c_dict.get("e")
        if argument is None:
            argument = ""
        return func_name.replace(FUNC_PLACE, argument)

    def do_fname(self, elm):
        """
        the func name

        Runs found in ``FUNC`` are replaced by their mapped value; other
        string results are kept as plain text. The returned string always
        contains ``FUNC_PLACE``.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    _log.warning("Function not supported, will default to text: %s", t)
                    if isinstance(t, str):
                        latex_chars.append(t)
            elif isinstance(t, str):
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object

        When ``groupChrPr`` or its ``chr`` is absent there is no template to
        apply, so the base expression is returned undecorated.
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict.get("groupChrPr")
        prefix = pr.text if pr is not None else ""
        latex_s = get_val(pr.chr) if pr is not None else None
        if latex_s is None:
            _log.debug("Missing group character, emitting the base expression only")
            return prefix + c_dict["e"]
        return prefix + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object

        Uses the ``RAD`` template when a non-empty degree is present and
        ``RAD_DEFAULT`` otherwise.
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object

        Joins the LaTeX of the ``e`` children with ``BRK``.
        """
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        :raises RuntimeError: when the base expression has no entry in
            ``LIM_FUNC``
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
        return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object

        Joins the rows (``mr`` children) with ``BRK``; ``mPr`` is ignored.
        """
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == "mr"]
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """
        the n-ary object

        The operator comes from ``naryPr``; the LaTeX of all other children
        is appended after it.
        """
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                # if <m:naryPr> contains no <m:chr>, the n-ary represents an integral
                bo = get_val(t.chr, default="\\int", store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def process_unicode(self, s):
        """
        Encode ``s`` as LaTeX with the class-level encoder and tidy up the
        wrappers the encoder adds.

        :param s: text to encode (``do_r`` passes one character at a time)
        :return: the LaTeX for ``s``
        """
        out_latex_str = self.u.unicode_to_latex(s)

        # braces present only in the encoded form were added by the encoder:
        # swap them for surrounding spaces
        braces_added = (
            not s.startswith("{")
            and out_latex_str.startswith("{")
            and not s.endswith("}")
            and out_latex_str.endswith("}")
        )
        if braces_added:
            out_latex_str = f" {out_latex_str[1:-1]} "

        if "ensuremath" in out_latex_str:
            out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
            out_latex_str = out_latex_str.replace("}", " ")

        if out_latex_str.strip().startswith("\\text"):
            out_latex_str = f" \\text{{{out_latex_str}}} "

        return out_latex_str

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \\text (latex pure text support)
        """
        _str = []
        _base_str = []
        found_text = elm.findtext(f"./{OMML_NS}t")
        if found_text:
            for s in found_text:
                out_latex_str = self.process_unicode(s)
                _str.append(out_latex_str)
                _base_str.append(s)

        proc_str = escape_latex(BLANK.join(_str))
        base_proc_str = BLANK.join(_base_str)

        # braces that were not in the source text come from the conversion
        # above and must stay unescaped
        if "{" not in base_proc_str and "\\{" in proc_str:
            proc_str = proc_str.replace("\\{", "{")

        if "}" not in base_proc_str and "\\}" in proc_str:
            proc_str = proc_str.replace("\\}", "}")

        return proc_str

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }