docling-core 2.16.0__tar.gz → 2.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.16.0 → docling_core-2.17.0}/PKG-INFO +2 -1
  2. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/document.py +160 -83
  3. {docling_core-2.16.0 → docling_core-2.17.0}/pyproject.toml +2 -1
  4. {docling_core-2.16.0 → docling_core-2.17.0}/LICENSE +0 -0
  5. {docling_core-2.16.0 → docling_core-2.17.0}/README.md +0 -0
  6. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/__init__.py +0 -0
  7. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/cli/__init__.py +0 -0
  8. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/cli/view.py +0 -0
  9. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/py.typed +0 -0
  10. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  11. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  12. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  13. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  14. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  15. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  16. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  17. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  18. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/__init__.py +0 -0
  19. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  20. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/mapping.py +0 -0
  21. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/meta.py +0 -0
  22. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/package.py +0 -0
  23. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/__init__.py +0 -0
  24. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/__init__.py +0 -0
  25. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/base.py +0 -0
  26. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  27. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  28. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/base.py +0 -0
  30. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/__init__.py +0 -0
  31. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/base.py +0 -0
  32. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/labels.py +0 -0
  33. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.16.0
3
+ Version: 2.17.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -28,6 +28,7 @@ Classifier: Typing :: Typed
28
28
  Provides-Extra: chunking
29
29
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
30
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
+ Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
31
32
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
32
33
  Requires-Dist: pillow (>=10.3.0,<11.0.0)
33
34
  Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
@@ -3,6 +3,7 @@
3
3
  import base64
4
4
  import copy
5
5
  import hashlib
6
+ import html
6
7
  import json
7
8
  import mimetypes
8
9
  import os
@@ -15,7 +16,10 @@ from io import BytesIO
15
16
  from pathlib import Path
16
17
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
17
18
  from urllib.parse import quote, unquote
19
+ from xml.etree.cElementTree import SubElement, tostring
20
+ from xml.sax.saxutils import unescape
18
21
 
22
+ import latex2mathml.converter
19
23
  import pandas as pd
20
24
  import yaml
21
25
  from PIL import Image as PILImage
@@ -1045,7 +1049,7 @@ class TableItem(FloatingItem):
1045
1049
 
1046
1050
  text = ""
1047
1051
  if doc is not None and add_caption and len(self.captions):
1048
- text = self.caption_text(doc)
1052
+ text = html.escape(self.caption_text(doc))
1049
1053
 
1050
1054
  if len(self.data.table_cells) == 0:
1051
1055
  return ""
@@ -1071,7 +1075,7 @@ class TableItem(FloatingItem):
1071
1075
  if colstart != j:
1072
1076
  continue
1073
1077
 
1074
- content = cell.text.strip()
1078
+ content = html.escape(cell.text.strip())
1075
1079
  celltag = "td"
1076
1080
  if cell.column_header:
1077
1081
  celltag = "th"
@@ -1386,6 +1390,20 @@ class DoclingDocument(BaseModel):
1386
1390
  table tr:nth-child(even) td{
1387
1391
  background-color: LightGray;
1388
1392
  }
1393
+ math annotation {
1394
+ display: none;
1395
+ }
1396
+ .formula-not-decoded {
1397
+ background: repeating-linear-gradient(
1398
+ 45deg, /* Angle of the stripes */
1399
+ LightGray, /* First color */
1400
+ LightGray 10px, /* Length of the first color */
1401
+ White 10px, /* Second color */
1402
+ White 20px /* Length of the second color */
1403
+ );
1404
+ margin: 0;
1405
+ text-align: center;
1406
+ }
1389
1407
  </style>
1390
1408
  </head>"""
1391
1409
 
@@ -2082,6 +2100,46 @@ class DoclingDocument(BaseModel):
2082
2100
  previous_level = 0 # Track the previous item's level
2083
2101
  in_list = False # Track if we're currently processing list items
2084
2102
 
2103
+ # Our export markdown doesn't contain any emphasis styling:
2104
+ # Bold, Italic, or Bold-Italic
2105
+ # Hence, any underscore that we print into Markdown is coming from document text
2106
+ # That means we need to escape it, to properly reflect content in the markdown
2107
+ # However, we need to preserve underscores in image URLs
2108
+ # to maintain their validity
2109
+ # For example: ![image](path/to_image.png) should remain unchanged
2110
+ def _escape_underscores(text):
2111
+ """Escape underscores but leave them intact in the URL.."""
2112
+ # Firstly, identify all the URL patterns.
2113
+ url_pattern = r"!\[.*?\]\((.*?)\)"
2114
+ # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2115
+ latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2116
+ combined_pattern = f"({url_pattern})|({latex_pattern})"
2117
+
2118
+ parts = []
2119
+ last_end = 0
2120
+
2121
+ for match in re.finditer(combined_pattern, text):
2122
+ # Text to add before the URL (needs to be escaped)
2123
+ before_url = text[last_end : match.start()]
2124
+ parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2125
+
2126
+ # Add the full URL part (do not escape)
2127
+ parts.append(match.group(0))
2128
+ last_end = match.end()
2129
+
2130
+ # Add the final part of the text (which needs to be escaped)
2131
+ if last_end < len(text):
2132
+ parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2133
+
2134
+ return "".join(parts)
2135
+
2136
+ def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
2137
+ if do_escape_underscores and escaping_underscores:
2138
+ text = _escape_underscores(text)
2139
+ if do_escape_html:
2140
+ text = html.escape(text, quote=False)
2141
+ mdtexts.append(text)
2142
+
2085
2143
  for ix, (item, level) in enumerate(
2086
2144
  self.iterate_items(self.body, with_groups=True, page_no=page_no)
2087
2145
  ):
@@ -2130,7 +2188,7 @@ class DoclingDocument(BaseModel):
2130
2188
  in_list = False
2131
2189
  marker = "" if strict_text else "#"
2132
2190
  text = f"{marker} {item.text}"
2133
- mdtexts.append(text.strip() + "\n")
2191
+ _append_text(text.strip() + "\n")
2134
2192
 
2135
2193
  elif (
2136
2194
  isinstance(item, TextItem)
@@ -2143,12 +2201,12 @@ class DoclingDocument(BaseModel):
2143
2201
  if len(marker) < 2:
2144
2202
  marker = "##"
2145
2203
  text = f"{marker} {item.text}\n"
2146
- mdtexts.append(text.strip() + "\n")
2204
+ _append_text(text.strip() + "\n")
2147
2205
 
2148
2206
  elif isinstance(item, CodeItem) and item.label in labels:
2149
2207
  in_list = False
2150
2208
  text = f"```\n{item.text}\n```\n"
2151
- mdtexts.append(text)
2209
+ _append_text(text, do_escape_underscores=False, do_escape_html=False)
2152
2210
 
2153
2211
  elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2154
2212
  in_list = True
@@ -2165,30 +2223,42 @@ class DoclingDocument(BaseModel):
2165
2223
  marker = "-" # Markdown needs only dash as item marker.
2166
2224
 
2167
2225
  text = f"{list_indent}{marker} {item.text}"
2168
- mdtexts.append(text)
2226
+ _append_text(text)
2169
2227
 
2170
2228
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2171
2229
  in_list = False
2172
- mdtexts.append(f"$${item.text}$$")
2230
+ if item.text != "":
2231
+ _append_text(
2232
+ f"$${item.text}$$\n",
2233
+ do_escape_underscores=False,
2234
+ do_escape_html=False,
2235
+ )
2236
+ elif item.orig != "":
2237
+ _append_text(
2238
+ "<!-- formula-not-decoded -->\n",
2239
+ do_escape_underscores=False,
2240
+ do_escape_html=False,
2241
+ )
2173
2242
 
2174
2243
  elif isinstance(item, TextItem) and item.label in labels:
2175
2244
  in_list = False
2176
2245
  if len(item.text) and text_width > 0:
2246
+ text = item.text
2177
2247
  wrapped_text = textwrap.fill(text, width=text_width)
2178
- mdtexts.append(wrapped_text + "\n")
2248
+ _append_text(wrapped_text + "\n")
2179
2249
  elif len(item.text):
2180
2250
  text = f"{item.text}\n"
2181
- mdtexts.append(text)
2251
+ _append_text(text)
2182
2252
 
2183
2253
  elif isinstance(item, TableItem) and not strict_text:
2184
2254
  in_list = False
2185
- mdtexts.append(item.caption_text(self))
2255
+ _append_text(item.caption_text(self))
2186
2256
  md_table = item.export_to_markdown()
2187
- mdtexts.append("\n" + md_table + "\n")
2257
+ _append_text("\n" + md_table + "\n")
2188
2258
 
2189
2259
  elif isinstance(item, PictureItem) and not strict_text:
2190
2260
  in_list = False
2191
- mdtexts.append(item.caption_text(self))
2261
+ _append_text(item.caption_text(self))
2192
2262
 
2193
2263
  line = item.export_to_markdown(
2194
2264
  doc=self,
@@ -2196,54 +2266,18 @@ class DoclingDocument(BaseModel):
2196
2266
  image_mode=image_mode,
2197
2267
  )
2198
2268
 
2199
- mdtexts.append(line)
2269
+ _append_text(line, do_escape_html=False, do_escape_underscores=False)
2200
2270
 
2201
2271
  elif isinstance(item, DocItem) and item.label in labels:
2202
2272
  in_list = False
2203
- text = "<missing-text>"
2204
- mdtexts.append(text)
2273
+ text = "<!-- missing-text -->"
2274
+ _append_text(text, do_escape_html=False, do_escape_underscores=False)
2205
2275
 
2206
2276
  mdtext = (delim.join(mdtexts)).strip()
2207
2277
  mdtext = re.sub(
2208
2278
  r"\n\n\n+", "\n\n", mdtext
2209
2279
  ) # remove cases of double or more empty lines.
2210
2280
 
2211
- # Our export markdown doesn't contain any emphasis styling:
2212
- # Bold, Italic, or Bold-Italic
2213
- # Hence, any underscore that we print into Markdown is coming from document text
2214
- # That means we need to escape it, to properly reflect content in the markdown
2215
- # However, we need to preserve underscores in image URLs
2216
- # to maintain their validity
2217
- # For example: ![image](path/to_image.png) should remain unchanged
2218
- def escape_underscores(text):
2219
- """Escape underscores but leave them intact in the URL.."""
2220
- # Firstly, identify all the URL patterns.
2221
- url_pattern = r"!\[.*?\]\((.*?)\)"
2222
- # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2223
- latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2224
- combined_pattern = f"({url_pattern})|({latex_pattern})"
2225
-
2226
- parts = []
2227
- last_end = 0
2228
-
2229
- for match in re.finditer(combined_pattern, text):
2230
- # Text to add before the URL (needs to be escaped)
2231
- before_url = text[last_end : match.start()]
2232
- parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2233
-
2234
- # Add the full URL part (do not escape)
2235
- parts.append(match.group(0))
2236
- last_end = match.end()
2237
-
2238
- # Add the final part of the text (which needs to be escaped)
2239
- if last_end < len(text):
2240
- parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2241
-
2242
- return "".join(parts)
2243
-
2244
- if escaping_underscores:
2245
- mdtext = escape_underscores(mdtext)
2246
-
2247
2281
  return mdtext
2248
2282
 
2249
2283
  def export_to_text( # noqa: C901
@@ -2272,6 +2306,7 @@ class DoclingDocument(BaseModel):
2272
2306
  to_element: int = sys.maxsize,
2273
2307
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2274
2308
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2309
+ formula_to_mathml: bool = True,
2275
2310
  page_no: Optional[int] = None,
2276
2311
  html_lang: str = "en",
2277
2312
  html_head: str = _HTML_DEFAULT_HEAD,
@@ -2291,6 +2326,7 @@ class DoclingDocument(BaseModel):
2291
2326
  to_element=to_element,
2292
2327
  labels=labels,
2293
2328
  image_mode=image_mode,
2329
+ formula_to_mathml=formula_to_mathml,
2294
2330
  page_no=page_no,
2295
2331
  html_lang=html_lang,
2296
2332
  html_head=html_head,
@@ -2337,6 +2373,7 @@ class DoclingDocument(BaseModel):
2337
2373
  to_element: int = sys.maxsize,
2338
2374
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2339
2375
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2376
+ formula_to_mathml: bool = True,
2340
2377
  page_no: Optional[int] = None,
2341
2378
  html_lang: str = "en",
2342
2379
  html_head: str = _HTML_DEFAULT_HEAD,
@@ -2371,6 +2408,15 @@ class DoclingDocument(BaseModel):
2371
2408
 
2372
2409
  in_ordered_list: List[bool] = [] # False
2373
2410
 
2411
+ def _prepare_tag_content(
2412
+ text: str, do_escape_html=True, do_replace_newline=True
2413
+ ) -> str:
2414
+ if do_escape_html:
2415
+ text = html.escape(text, quote=False)
2416
+ if do_replace_newline:
2417
+ text = text.replace("\n", "<br>")
2418
+ return text
2419
+
2374
2420
  for ix, (item, curr_level) in enumerate(
2375
2421
  self.iterate_items(self.body, with_groups=True, page_no=page_no)
2376
2422
  ):
@@ -2401,7 +2447,7 @@ class DoclingDocument(BaseModel):
2401
2447
  ]:
2402
2448
 
2403
2449
  text = "<ol>"
2404
- html_texts.append(text.strip())
2450
+ html_texts.append(text)
2405
2451
 
2406
2452
  # Increment list nesting level when entering a new list
2407
2453
  in_ordered_list.append(True)
@@ -2411,7 +2457,7 @@ class DoclingDocument(BaseModel):
2411
2457
  ]:
2412
2458
 
2413
2459
  text = "<ul>"
2414
- html_texts.append(text.strip())
2460
+ html_texts.append(text)
2415
2461
 
2416
2462
  # Increment list nesting level when entering a new list
2417
2463
  in_ordered_list.append(False)
@@ -2421,54 +2467,86 @@ class DoclingDocument(BaseModel):
2421
2467
 
2422
2468
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2423
2469
 
2424
- text = f"<h1>{item.text}</h1>"
2425
- html_texts.append(text.strip())
2470
+ text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
2471
+ html_texts.append(text)
2426
2472
 
2427
2473
  elif isinstance(item, SectionHeaderItem):
2428
2474
 
2429
- section_level: int = item.level + 1
2430
-
2431
- text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
2432
- html_texts.append(text.strip())
2433
-
2434
- elif isinstance(item, TextItem) and item.label in [
2435
- DocItemLabel.SECTION_HEADER
2436
- ]:
2475
+ section_level: int = min(item.level + 1, 6)
2437
2476
 
2438
- section_level = curr_level
2477
+ text = (
2478
+ f"<h{(section_level)}>"
2479
+ f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
2480
+ )
2481
+ html_texts.append(text)
2439
2482
 
2440
- if section_level <= 1:
2441
- section_level = 2
2483
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2442
2484
 
2443
- if section_level >= 6:
2444
- section_level = 6
2485
+ math_formula = _prepare_tag_content(
2486
+ item.text, do_escape_html=False, do_replace_newline=False
2487
+ )
2488
+ text = ""
2489
+
2490
+ # If the formula is not processed correctly, use its image
2491
+ if (
2492
+ item.text == ""
2493
+ and item.orig != ""
2494
+ and image_mode == ImageRefMode.EMBEDDED
2495
+ and len(item.prov) > 0
2496
+ ):
2497
+ item_image = item.get_image(doc=self)
2498
+ if item_image is not None:
2499
+ img_ref = ImageRef.from_pil(item_image, dpi=72)
2500
+ text = (
2501
+ "<figure>"
2502
+ f'<img src="{img_ref.uri}" alt="{item.orig}" />'
2503
+ "</figure>"
2504
+ )
2445
2505
 
2446
- text = f"<h{section_level}>{item.text}</h{section_level}>"
2447
- html_texts.append(text.strip())
2506
+ # Building a math equation in MathML format
2507
+ # ref https://www.w3.org/TR/wai-aria-1.1/#math
2508
+ elif formula_to_mathml:
2509
+ mathml_element = latex2mathml.converter.convert_to_element(
2510
+ math_formula, display="block"
2511
+ )
2512
+ annotation = SubElement(
2513
+ mathml_element, "annotation", dict(encoding="TeX")
2514
+ )
2515
+ annotation.text = math_formula
2516
+ mathml = unescape(tostring(mathml_element, encoding="unicode"))
2517
+ text = f"<div>{mathml}</div>"
2448
2518
 
2449
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
2519
+ elif math_formula != "":
2520
+ text = f"<pre>{math_formula}</pre>"
2450
2521
 
2451
- text = f"<pre>{item.text}</pre>"
2452
- html_texts.append(text)
2522
+ if text != "":
2523
+ html_texts.append(text)
2524
+ else:
2525
+ html_texts.append(
2526
+ '<div class="formula-not-decoded">Formula not decoded</div>'
2527
+ )
2453
2528
 
2454
2529
  elif isinstance(item, ListItem):
2455
2530
 
2456
- text = f"<li>{item.text}</li>"
2531
+ text = f"<li>{_prepare_tag_content(item.text)}</li>"
2457
2532
  html_texts.append(text)
2458
2533
 
2459
2534
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
2460
2535
 
2461
- text = f"<li>{item.text}</li>"
2536
+ text = f"<li>{_prepare_tag_content(item.text)}</li>"
2462
2537
  html_texts.append(text)
2463
2538
 
2464
- elif isinstance(item, CodeItem) and item.label in labels:
2465
- text = f"<pre><code>{item.text}</code></pre>"
2466
- html_texts.append(text.strip())
2539
+ elif isinstance(item, CodeItem):
2540
+ code_text = _prepare_tag_content(
2541
+ item.text, do_escape_html=False, do_replace_newline=False
2542
+ )
2543
+ text = f"<pre><code>{code_text}</code></pre>"
2544
+ html_texts.append(text)
2467
2545
 
2468
- elif isinstance(item, TextItem) and item.label in labels:
2546
+ elif isinstance(item, TextItem):
2469
2547
 
2470
- text = f"<p>{item.text}</p>"
2471
- html_texts.append(text.strip())
2548
+ text = f"<p>{_prepare_tag_content(item.text)}</p>"
2549
+ html_texts.append(text)
2472
2550
  elif isinstance(item, TableItem):
2473
2551
 
2474
2552
  text = item.export_to_html(doc=self, add_caption=True)
@@ -2489,8 +2567,7 @@ class DoclingDocument(BaseModel):
2489
2567
 
2490
2568
  lines = []
2491
2569
  lines.extend(head_lines)
2492
- for i, line in enumerate(html_texts):
2493
- lines.append(line.replace("\n", "<br>"))
2570
+ lines.extend(html_texts)
2494
2571
 
2495
2572
  delim = "\n"
2496
2573
  html_text = (delim.join(lines)).strip()
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.16.0"
3
+ version = "2.17.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
59
59
  transformers = { version = "^4.34.0", optional = true }
60
60
  semchunk = { version = "^2.2.0", optional = true }
61
61
  typer = "^0.12.5"
62
+ latex2mathml = "^3.77.0"
62
63
 
63
64
  [tool.poetry.extras]
64
65
  chunking = ["transformers", "semchunk"]
File without changes
File without changes