docling-core 2.17.0__py3-none-any.whl → 2.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -5,6 +5,7 @@ import copy
5
5
  import hashlib
6
6
  import html
7
7
  import json
8
+ import logging
8
9
  import mimetypes
9
10
  import os
10
11
  import re
@@ -20,6 +21,7 @@ from xml.etree.cElementTree import SubElement, tostring
20
21
  from xml.sax.saxutils import unescape
21
22
 
22
23
  import latex2mathml.converter
24
+ import latex2mathml.exceptions
23
25
  import pandas as pd
24
26
  import yaml
25
27
  from PIL import Image as PILImage
@@ -44,6 +46,8 @@ from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, Group
44
46
  from docling_core.types.doc.tokens import DocumentToken, TableToken
45
47
  from docling_core.types.doc.utils import relative_path
46
48
 
49
+ _logger = logging.getLogger(__name__)
50
+
47
51
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
48
52
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
49
53
  CURRENT_VERSION: Final = "1.0.0"
@@ -2487,34 +2491,47 @@ class DoclingDocument(BaseModel):
2487
2491
  )
2488
2492
  text = ""
2489
2493
 
2490
- # If the formula is not processed correctly, use its image
2491
- if (
2492
- item.text == ""
2493
- and item.orig != ""
2494
- and image_mode == ImageRefMode.EMBEDDED
2495
- and len(item.prov) > 0
2496
- ):
2494
+ def _image_fallback(item: TextItem):
2497
2495
  item_image = item.get_image(doc=self)
2498
2496
  if item_image is not None:
2499
2497
  img_ref = ImageRef.from_pil(item_image, dpi=72)
2500
- text = (
2498
+ return (
2501
2499
  "<figure>"
2502
2500
  f'<img src="{img_ref.uri}" alt="{item.orig}" />'
2503
2501
  "</figure>"
2504
2502
  )
2505
2503
 
2504
+ # If the formula is not processed correctly, use its image
2505
+ if (
2506
+ item.text == ""
2507
+ and item.orig != ""
2508
+ and image_mode == ImageRefMode.EMBEDDED
2509
+ and len(item.prov) > 0
2510
+ ):
2511
+ text = _image_fallback(item)
2512
+
2506
2513
  # Building a math equation in MathML format
2507
2514
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
2508
2515
  elif formula_to_mathml:
2509
- mathml_element = latex2mathml.converter.convert_to_element(
2510
- math_formula, display="block"
2511
- )
2512
- annotation = SubElement(
2513
- mathml_element, "annotation", dict(encoding="TeX")
2514
- )
2515
- annotation.text = math_formula
2516
- mathml = unescape(tostring(mathml_element, encoding="unicode"))
2517
- text = f"<div>{mathml}</div>"
2516
+ try:
2517
+ mathml_element = latex2mathml.converter.convert_to_element(
2518
+ math_formula, display="block"
2519
+ )
2520
+ annotation = SubElement(
2521
+ mathml_element, "annotation", dict(encoding="TeX")
2522
+ )
2523
+ annotation.text = math_formula
2524
+ mathml = unescape(tostring(mathml_element, encoding="unicode"))
2525
+ text = f"<div>{mathml}</div>"
2526
+ except Exception as err:
2527
+ _logger.warning(
2528
+ "Malformed formula cannot be rendered. "
2529
+ f"Error {err.__class__.__name__}, formula={math_formula}"
2530
+ )
2531
+ if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
2532
+ text = _image_fallback(item)
2533
+ else:
2534
+ text = f"<pre>{math_formula}</pre>"
2518
2535
 
2519
2536
  elif math_formula != "":
2520
2537
  text = f"<pre>{math_formula}</pre>"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.17.0
3
+ Version: 2.17.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -24,7 +24,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
24
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
25
25
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
26
26
  docling_core/types/doc/base.py,sha256=lMRNq1DUK7K26L2VNZRqFaItCSZ6m9BdYTVaJA98PZQ,11495
27
- docling_core/types/doc/document.py,sha256=X2VPF4RJZTy2txShs3BoQzMVfdyqKv1W9HJAEQnPKuM,98148
27
+ docling_core/types/doc/document.py,sha256=7Mwd3WclfXB7_T9ApXXwEehtazySLwH2FOUamiAmun0,98902
28
28
  docling_core/types/doc/labels.py,sha256=8Luymal9SKXTwyqq1ONKiUTxuMo_nRMYfBkRPFkdSSo,5306
29
29
  docling_core/types/doc/tokens.py,sha256=GMtm5TsNljBPaMYkgmD3WWZmC0FHqKF9imKEEySz4ps,6020
30
30
  docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
56
56
  docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
57
57
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
58
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
59
- docling_core-2.17.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-2.17.0.dist-info/METADATA,sha256=VfmZn2a_wVmLXXa7SsZkVnx1yhYCtTefXvSOGwXtrXQ,5790
61
- docling_core-2.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-2.17.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
- docling_core-2.17.0.dist-info/RECORD,,
59
+ docling_core-2.17.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.17.1.dist-info/METADATA,sha256=temWeG-Oeonzr9ycqOoEshXSTGaKfSWYT9mWycYRGNA,5790
61
+ docling_core-2.17.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.17.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.17.1.dist-info/RECORD,,