docling-core 2.16.1__tar.gz → 2.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.16.1 → docling_core-2.17.1}/PKG-INFO +2 -1
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/doc/document.py +177 -83
- {docling_core-2.16.1 → docling_core-2.17.1}/pyproject.toml +2 -1
- {docling_core-2.16.1 → docling_core-2.17.1}/LICENSE +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/README.md +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/py.typed +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/search/package.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/base.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.16.1 → docling_core-2.17.1}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.16.1
|
|
3
|
+
Version: 2.17.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -28,6 +28,7 @@ Classifier: Typing :: Typed
|
|
|
28
28
|
Provides-Extra: chunking
|
|
29
29
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
30
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
|
+
Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
|
|
31
32
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
32
33
|
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
33
34
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
import base64
|
|
4
4
|
import copy
|
|
5
5
|
import hashlib
|
|
6
|
+
import html
|
|
6
7
|
import json
|
|
8
|
+
import logging
|
|
7
9
|
import mimetypes
|
|
8
10
|
import os
|
|
9
11
|
import re
|
|
@@ -15,7 +17,11 @@ from io import BytesIO
|
|
|
15
17
|
from pathlib import Path
|
|
16
18
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
17
19
|
from urllib.parse import quote, unquote
|
|
20
|
+
from xml.etree.cElementTree import SubElement, tostring
|
|
21
|
+
from xml.sax.saxutils import unescape
|
|
18
22
|
|
|
23
|
+
import latex2mathml.converter
|
|
24
|
+
import latex2mathml.exceptions
|
|
19
25
|
import pandas as pd
|
|
20
26
|
import yaml
|
|
21
27
|
from PIL import Image as PILImage
|
|
@@ -40,6 +46,8 @@ from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, Group
|
|
|
40
46
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
41
47
|
from docling_core.types.doc.utils import relative_path
|
|
42
48
|
|
|
49
|
+
_logger = logging.getLogger(__name__)
|
|
50
|
+
|
|
43
51
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
44
52
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
45
53
|
CURRENT_VERSION: Final = "1.0.0"
|
|
@@ -1045,7 +1053,7 @@ class TableItem(FloatingItem):
|
|
|
1045
1053
|
|
|
1046
1054
|
text = ""
|
|
1047
1055
|
if doc is not None and add_caption and len(self.captions):
|
|
1048
|
-
text = self.caption_text(doc)
|
|
1056
|
+
text = html.escape(self.caption_text(doc))
|
|
1049
1057
|
|
|
1050
1058
|
if len(self.data.table_cells) == 0:
|
|
1051
1059
|
return ""
|
|
@@ -1071,7 +1079,7 @@ class TableItem(FloatingItem):
|
|
|
1071
1079
|
if colstart != j:
|
|
1072
1080
|
continue
|
|
1073
1081
|
|
|
1074
|
-
content = cell.text.strip()
|
|
1082
|
+
content = html.escape(cell.text.strip())
|
|
1075
1083
|
celltag = "td"
|
|
1076
1084
|
if cell.column_header:
|
|
1077
1085
|
celltag = "th"
|
|
@@ -1386,6 +1394,20 @@ class DoclingDocument(BaseModel):
|
|
|
1386
1394
|
table tr:nth-child(even) td{
|
|
1387
1395
|
background-color: LightGray;
|
|
1388
1396
|
}
|
|
1397
|
+
math annotation {
|
|
1398
|
+
display: none;
|
|
1399
|
+
}
|
|
1400
|
+
.formula-not-decoded {
|
|
1401
|
+
background: repeating-linear-gradient(
|
|
1402
|
+
45deg, /* Angle of the stripes */
|
|
1403
|
+
LightGray, /* First color */
|
|
1404
|
+
LightGray 10px, /* Length of the first color */
|
|
1405
|
+
White 10px, /* Second color */
|
|
1406
|
+
White 20px /* Length of the second color */
|
|
1407
|
+
);
|
|
1408
|
+
margin: 0;
|
|
1409
|
+
text-align: center;
|
|
1410
|
+
}
|
|
1389
1411
|
</style>
|
|
1390
1412
|
</head>"""
|
|
1391
1413
|
|
|
@@ -2082,6 +2104,46 @@ class DoclingDocument(BaseModel):
|
|
|
2082
2104
|
previous_level = 0 # Track the previous item's level
|
|
2083
2105
|
in_list = False # Track if we're currently processing list items
|
|
2084
2106
|
|
|
2107
|
+
# Our export markdown doesn't contain any emphasis styling:
|
|
2108
|
+
# Bold, Italic, or Bold-Italic
|
|
2109
|
+
# Hence, any underscore that we print into Markdown is coming from document text
|
|
2110
|
+
# That means we need to escape it, to properly reflect content in the markdown
|
|
2111
|
+
# However, we need to preserve underscores in image URLs
|
|
2112
|
+
# to maintain their validity
|
|
2113
|
+
# For example: ![image](path/to_image.png) should remain unchanged
|
|
2114
|
+
def _escape_underscores(text):
|
|
2115
|
+
"""Escape underscores but leave them intact in the URL.."""
|
|
2116
|
+
# Firstly, identify all the URL patterns.
|
|
2117
|
+
url_pattern = r"!\[.*?\]\((.*?)\)"
|
|
2118
|
+
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
|
|
2119
|
+
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
|
|
2120
|
+
combined_pattern = f"({url_pattern})|({latex_pattern})"
|
|
2121
|
+
|
|
2122
|
+
parts = []
|
|
2123
|
+
last_end = 0
|
|
2124
|
+
|
|
2125
|
+
for match in re.finditer(combined_pattern, text):
|
|
2126
|
+
# Text to add before the URL (needs to be escaped)
|
|
2127
|
+
before_url = text[last_end : match.start()]
|
|
2128
|
+
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
|
|
2129
|
+
|
|
2130
|
+
# Add the full URL part (do not escape)
|
|
2131
|
+
parts.append(match.group(0))
|
|
2132
|
+
last_end = match.end()
|
|
2133
|
+
|
|
2134
|
+
# Add the final part of the text (which needs to be escaped)
|
|
2135
|
+
if last_end < len(text):
|
|
2136
|
+
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
|
|
2137
|
+
|
|
2138
|
+
return "".join(parts)
|
|
2139
|
+
|
|
2140
|
+
def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
|
|
2141
|
+
if do_escape_underscores and escaping_underscores:
|
|
2142
|
+
text = _escape_underscores(text)
|
|
2143
|
+
if do_escape_html:
|
|
2144
|
+
text = html.escape(text, quote=False)
|
|
2145
|
+
mdtexts.append(text)
|
|
2146
|
+
|
|
2085
2147
|
for ix, (item, level) in enumerate(
|
|
2086
2148
|
self.iterate_items(self.body, with_groups=True, page_no=page_no)
|
|
2087
2149
|
):
|
|
@@ -2130,7 +2192,7 @@ class DoclingDocument(BaseModel):
|
|
|
2130
2192
|
in_list = False
|
|
2131
2193
|
marker = "" if strict_text else "#"
|
|
2132
2194
|
text = f"{marker} {item.text}"
|
|
2133
|
-
|
|
2195
|
+
_append_text(text.strip() + "\n")
|
|
2134
2196
|
|
|
2135
2197
|
elif (
|
|
2136
2198
|
isinstance(item, TextItem)
|
|
@@ -2143,12 +2205,12 @@ class DoclingDocument(BaseModel):
|
|
|
2143
2205
|
if len(marker) < 2:
|
|
2144
2206
|
marker = "##"
|
|
2145
2207
|
text = f"{marker} {item.text}\n"
|
|
2146
|
-
|
|
2208
|
+
_append_text(text.strip() + "\n")
|
|
2147
2209
|
|
|
2148
2210
|
elif isinstance(item, CodeItem) and item.label in labels:
|
|
2149
2211
|
in_list = False
|
|
2150
2212
|
text = f"```\n{item.text}\n```\n"
|
|
2151
|
-
|
|
2213
|
+
_append_text(text, do_escape_underscores=False, do_escape_html=False)
|
|
2152
2214
|
|
|
2153
2215
|
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2154
2216
|
in_list = True
|
|
@@ -2165,30 +2227,42 @@ class DoclingDocument(BaseModel):
|
|
|
2165
2227
|
marker = "-" # Markdown needs only dash as item marker.
|
|
2166
2228
|
|
|
2167
2229
|
text = f"{list_indent}{marker} {item.text}"
|
|
2168
|
-
|
|
2230
|
+
_append_text(text)
|
|
2169
2231
|
|
|
2170
2232
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2171
2233
|
in_list = False
|
|
2172
|
-
|
|
2234
|
+
if item.text != "":
|
|
2235
|
+
_append_text(
|
|
2236
|
+
f"$${item.text}$$\n",
|
|
2237
|
+
do_escape_underscores=False,
|
|
2238
|
+
do_escape_html=False,
|
|
2239
|
+
)
|
|
2240
|
+
elif item.orig != "":
|
|
2241
|
+
_append_text(
|
|
2242
|
+
"<!-- formula-not-decoded -->\n",
|
|
2243
|
+
do_escape_underscores=False,
|
|
2244
|
+
do_escape_html=False,
|
|
2245
|
+
)
|
|
2173
2246
|
|
|
2174
2247
|
elif isinstance(item, TextItem) and item.label in labels:
|
|
2175
2248
|
in_list = False
|
|
2176
2249
|
if len(item.text) and text_width > 0:
|
|
2250
|
+
text = item.text
|
|
2177
2251
|
wrapped_text = textwrap.fill(text, width=text_width)
|
|
2178
|
-
|
|
2252
|
+
_append_text(wrapped_text + "\n")
|
|
2179
2253
|
elif len(item.text):
|
|
2180
2254
|
text = f"{item.text}\n"
|
|
2181
|
-
|
|
2255
|
+
_append_text(text)
|
|
2182
2256
|
|
|
2183
2257
|
elif isinstance(item, TableItem) and not strict_text:
|
|
2184
2258
|
in_list = False
|
|
2185
|
-
|
|
2259
|
+
_append_text(item.caption_text(self))
|
|
2186
2260
|
md_table = item.export_to_markdown()
|
|
2187
|
-
|
|
2261
|
+
_append_text("\n" + md_table + "\n")
|
|
2188
2262
|
|
|
2189
2263
|
elif isinstance(item, PictureItem) and not strict_text:
|
|
2190
2264
|
in_list = False
|
|
2191
|
-
|
|
2265
|
+
_append_text(item.caption_text(self))
|
|
2192
2266
|
|
|
2193
2267
|
line = item.export_to_markdown(
|
|
2194
2268
|
doc=self,
|
|
@@ -2196,54 +2270,18 @@ class DoclingDocument(BaseModel):
|
|
|
2196
2270
|
image_mode=image_mode,
|
|
2197
2271
|
)
|
|
2198
2272
|
|
|
2199
|
-
|
|
2273
|
+
_append_text(line, do_escape_html=False, do_escape_underscores=False)
|
|
2200
2274
|
|
|
2201
2275
|
elif isinstance(item, DocItem) and item.label in labels:
|
|
2202
2276
|
in_list = False
|
|
2203
|
-
text = "
|
|
2204
|
-
|
|
2277
|
+
text = "<!-- missing-text -->"
|
|
2278
|
+
_append_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2205
2279
|
|
|
2206
2280
|
mdtext = (delim.join(mdtexts)).strip()
|
|
2207
2281
|
mdtext = re.sub(
|
|
2208
2282
|
r"\n\n\n+", "\n\n", mdtext
|
|
2209
2283
|
) # remove cases of double or more empty lines.
|
|
2210
2284
|
|
|
2211
|
-
# Our export markdown doesn't contain any emphasis styling:
|
|
2212
|
-
# Bold, Italic, or Bold-Italic
|
|
2213
|
-
# Hence, any underscore that we print into Markdown is coming from document text
|
|
2214
|
-
# That means we need to escape it, to properly reflect content in the markdown
|
|
2215
|
-
# However, we need to preserve underscores in image URLs
|
|
2216
|
-
# to maintain their validity
|
|
2217
|
-
# For example: ![image](path/to_image.png) should remain unchanged
|
|
2218
|
-
def escape_underscores(text):
|
|
2219
|
-
"""Escape underscores but leave them intact in the URL.."""
|
|
2220
|
-
# Firstly, identify all the URL patterns.
|
|
2221
|
-
url_pattern = r"!\[.*?\]\((.*?)\)"
|
|
2222
|
-
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
|
|
2223
|
-
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
|
|
2224
|
-
combined_pattern = f"({url_pattern})|({latex_pattern})"
|
|
2225
|
-
|
|
2226
|
-
parts = []
|
|
2227
|
-
last_end = 0
|
|
2228
|
-
|
|
2229
|
-
for match in re.finditer(combined_pattern, text):
|
|
2230
|
-
# Text to add before the URL (needs to be escaped)
|
|
2231
|
-
before_url = text[last_end : match.start()]
|
|
2232
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
|
|
2233
|
-
|
|
2234
|
-
# Add the full URL part (do not escape)
|
|
2235
|
-
parts.append(match.group(0))
|
|
2236
|
-
last_end = match.end()
|
|
2237
|
-
|
|
2238
|
-
# Add the final part of the text (which needs to be escaped)
|
|
2239
|
-
if last_end < len(text):
|
|
2240
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
|
|
2241
|
-
|
|
2242
|
-
return "".join(parts)
|
|
2243
|
-
|
|
2244
|
-
if escaping_underscores:
|
|
2245
|
-
mdtext = escape_underscores(mdtext)
|
|
2246
|
-
|
|
2247
2285
|
return mdtext
|
|
2248
2286
|
|
|
2249
2287
|
def export_to_text( # noqa: C901
|
|
@@ -2272,6 +2310,7 @@ class DoclingDocument(BaseModel):
|
|
|
2272
2310
|
to_element: int = sys.maxsize,
|
|
2273
2311
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
2274
2312
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2313
|
+
formula_to_mathml: bool = True,
|
|
2275
2314
|
page_no: Optional[int] = None,
|
|
2276
2315
|
html_lang: str = "en",
|
|
2277
2316
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
@@ -2291,6 +2330,7 @@ class DoclingDocument(BaseModel):
|
|
|
2291
2330
|
to_element=to_element,
|
|
2292
2331
|
labels=labels,
|
|
2293
2332
|
image_mode=image_mode,
|
|
2333
|
+
formula_to_mathml=formula_to_mathml,
|
|
2294
2334
|
page_no=page_no,
|
|
2295
2335
|
html_lang=html_lang,
|
|
2296
2336
|
html_head=html_head,
|
|
@@ -2337,6 +2377,7 @@ class DoclingDocument(BaseModel):
|
|
|
2337
2377
|
to_element: int = sys.maxsize,
|
|
2338
2378
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
2339
2379
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2380
|
+
formula_to_mathml: bool = True,
|
|
2340
2381
|
page_no: Optional[int] = None,
|
|
2341
2382
|
html_lang: str = "en",
|
|
2342
2383
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
@@ -2371,6 +2412,15 @@ class DoclingDocument(BaseModel):
|
|
|
2371
2412
|
|
|
2372
2413
|
in_ordered_list: List[bool] = [] # False
|
|
2373
2414
|
|
|
2415
|
+
def _prepare_tag_content(
|
|
2416
|
+
text: str, do_escape_html=True, do_replace_newline=True
|
|
2417
|
+
) -> str:
|
|
2418
|
+
if do_escape_html:
|
|
2419
|
+
text = html.escape(text, quote=False)
|
|
2420
|
+
if do_replace_newline:
|
|
2421
|
+
text = text.replace("\n", "<br>")
|
|
2422
|
+
return text
|
|
2423
|
+
|
|
2374
2424
|
for ix, (item, curr_level) in enumerate(
|
|
2375
2425
|
self.iterate_items(self.body, with_groups=True, page_no=page_no)
|
|
2376
2426
|
):
|
|
@@ -2401,7 +2451,7 @@ class DoclingDocument(BaseModel):
|
|
|
2401
2451
|
]:
|
|
2402
2452
|
|
|
2403
2453
|
text = "<ol>"
|
|
2404
|
-
html_texts.append(text
|
|
2454
|
+
html_texts.append(text)
|
|
2405
2455
|
|
|
2406
2456
|
# Increment list nesting level when entering a new list
|
|
2407
2457
|
in_ordered_list.append(True)
|
|
@@ -2411,7 +2461,7 @@ class DoclingDocument(BaseModel):
|
|
|
2411
2461
|
]:
|
|
2412
2462
|
|
|
2413
2463
|
text = "<ul>"
|
|
2414
|
-
html_texts.append(text
|
|
2464
|
+
html_texts.append(text)
|
|
2415
2465
|
|
|
2416
2466
|
# Increment list nesting level when entering a new list
|
|
2417
2467
|
in_ordered_list.append(False)
|
|
@@ -2421,54 +2471,99 @@ class DoclingDocument(BaseModel):
|
|
|
2421
2471
|
|
|
2422
2472
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2423
2473
|
|
|
2424
|
-
text = f"<h1>{item.text}</h1>"
|
|
2425
|
-
html_texts.append(text
|
|
2474
|
+
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
|
|
2475
|
+
html_texts.append(text)
|
|
2426
2476
|
|
|
2427
2477
|
elif isinstance(item, SectionHeaderItem):
|
|
2428
2478
|
|
|
2429
|
-
section_level: int = item.level + 1
|
|
2479
|
+
section_level: int = min(item.level + 1, 6)
|
|
2430
2480
|
|
|
2431
|
-
text =
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
]:
|
|
2437
|
-
|
|
2438
|
-
section_level = curr_level
|
|
2481
|
+
text = (
|
|
2482
|
+
f"<h{(section_level)}>"
|
|
2483
|
+
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
|
|
2484
|
+
)
|
|
2485
|
+
html_texts.append(text)
|
|
2439
2486
|
|
|
2440
|
-
|
|
2441
|
-
section_level = 2
|
|
2487
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2442
2488
|
|
|
2443
|
-
|
|
2444
|
-
|
|
2489
|
+
math_formula = _prepare_tag_content(
|
|
2490
|
+
item.text, do_escape_html=False, do_replace_newline=False
|
|
2491
|
+
)
|
|
2492
|
+
text = ""
|
|
2493
|
+
|
|
2494
|
+
def _image_fallback(item: TextItem):
|
|
2495
|
+
item_image = item.get_image(doc=self)
|
|
2496
|
+
if item_image is not None:
|
|
2497
|
+
img_ref = ImageRef.from_pil(item_image, dpi=72)
|
|
2498
|
+
return (
|
|
2499
|
+
"<figure>"
|
|
2500
|
+
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
|
|
2501
|
+
"</figure>"
|
|
2502
|
+
)
|
|
2445
2503
|
|
|
2446
|
-
|
|
2447
|
-
|
|
2504
|
+
# If the formula is not processed correctly, use its image
|
|
2505
|
+
if (
|
|
2506
|
+
item.text == ""
|
|
2507
|
+
and item.orig != ""
|
|
2508
|
+
and image_mode == ImageRefMode.EMBEDDED
|
|
2509
|
+
and len(item.prov) > 0
|
|
2510
|
+
):
|
|
2511
|
+
text = _image_fallback(item)
|
|
2512
|
+
|
|
2513
|
+
# Building a math equation in MathML format
|
|
2514
|
+
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2515
|
+
elif formula_to_mathml:
|
|
2516
|
+
try:
|
|
2517
|
+
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2518
|
+
math_formula, display="block"
|
|
2519
|
+
)
|
|
2520
|
+
annotation = SubElement(
|
|
2521
|
+
mathml_element, "annotation", dict(encoding="TeX")
|
|
2522
|
+
)
|
|
2523
|
+
annotation.text = math_formula
|
|
2524
|
+
mathml = unescape(tostring(mathml_element, encoding="unicode"))
|
|
2525
|
+
text = f"<div>{mathml}</div>"
|
|
2526
|
+
except Exception as err:
|
|
2527
|
+
_logger.warning(
|
|
2528
|
+
"Malformed formula cannot be rendered. "
|
|
2529
|
+
f"Error {err.__class__.__name__}, formula={math_formula}"
|
|
2530
|
+
)
|
|
2531
|
+
if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
|
|
2532
|
+
text = _image_fallback(item)
|
|
2533
|
+
else:
|
|
2534
|
+
text = f"<pre>{math_formula}</pre>"
|
|
2448
2535
|
|
|
2449
|
-
|
|
2536
|
+
elif math_formula != "":
|
|
2537
|
+
text = f"<pre>{math_formula}</pre>"
|
|
2450
2538
|
|
|
2451
|
-
text
|
|
2452
|
-
|
|
2539
|
+
if text != "":
|
|
2540
|
+
html_texts.append(text)
|
|
2541
|
+
else:
|
|
2542
|
+
html_texts.append(
|
|
2543
|
+
'<div class="formula-not-decoded">Formula not decoded</div>'
|
|
2544
|
+
)
|
|
2453
2545
|
|
|
2454
2546
|
elif isinstance(item, ListItem):
|
|
2455
2547
|
|
|
2456
|
-
text = f"<li>{item.text}</li>"
|
|
2548
|
+
text = f"<li>{_prepare_tag_content(item.text)}</li>"
|
|
2457
2549
|
html_texts.append(text)
|
|
2458
2550
|
|
|
2459
2551
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2460
2552
|
|
|
2461
|
-
text = f"<li>{item.text}</li>"
|
|
2553
|
+
text = f"<li>{_prepare_tag_content(item.text)}</li>"
|
|
2462
2554
|
html_texts.append(text)
|
|
2463
2555
|
|
|
2464
|
-
elif isinstance(item, CodeItem)
|
|
2465
|
-
|
|
2466
|
-
|
|
2556
|
+
elif isinstance(item, CodeItem):
|
|
2557
|
+
code_text = _prepare_tag_content(
|
|
2558
|
+
item.text, do_escape_html=False, do_replace_newline=False
|
|
2559
|
+
)
|
|
2560
|
+
text = f"<pre><code>{code_text}</code></pre>"
|
|
2561
|
+
html_texts.append(text)
|
|
2467
2562
|
|
|
2468
|
-
elif isinstance(item, TextItem)
|
|
2563
|
+
elif isinstance(item, TextItem):
|
|
2469
2564
|
|
|
2470
|
-
text = f"<p>{item.text}</p>"
|
|
2471
|
-
html_texts.append(text
|
|
2565
|
+
text = f"<p>{_prepare_tag_content(item.text)}</p>"
|
|
2566
|
+
html_texts.append(text)
|
|
2472
2567
|
elif isinstance(item, TableItem):
|
|
2473
2568
|
|
|
2474
2569
|
text = item.export_to_html(doc=self, add_caption=True)
|
|
@@ -2489,8 +2584,7 @@ class DoclingDocument(BaseModel):
|
|
|
2489
2584
|
|
|
2490
2585
|
lines = []
|
|
2491
2586
|
lines.extend(head_lines)
|
|
2492
|
-
|
|
2493
|
-
lines.append(line.replace("\n", "<br>"))
|
|
2587
|
+
lines.extend(html_texts)
|
|
2494
2588
|
|
|
2495
2589
|
delim = "\n"
|
|
2496
2590
|
html_text = (delim.join(lines)).strip()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.16.1"
|
|
3
|
+
version = "2.17.1"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
|
|
|
59
59
|
transformers = { version = "^4.34.0", optional = true }
|
|
60
60
|
semchunk = { version = "^2.2.0", optional = true }
|
|
61
61
|
typer = "^0.12.5"
|
|
62
|
+
latex2mathml = "^3.77.0"
|
|
62
63
|
|
|
63
64
|
[tool.poetry.extras]
|
|
64
65
|
chunking = ["transformers", "semchunk"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.16.1 → docling_core-2.17.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.16.1 → docling_core-2.17.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.16.1 → docling_core-2.17.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.16.1 → docling_core-2.17.1}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|