docling-core 2.16.0__tar.gz → 2.17.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.16.0 → docling_core-2.17.0}/PKG-INFO +2 -1
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/document.py +160 -83
- {docling_core-2.16.0 → docling_core-2.17.0}/pyproject.toml +2 -1
- {docling_core-2.16.0 → docling_core-2.17.0}/LICENSE +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/README.md +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/py.typed +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/package.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/base.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.16.0 → docling_core-2.17.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.16.0
|
|
3
|
+
Version: 2.17.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -28,6 +28,7 @@ Classifier: Typing :: Typed
|
|
|
28
28
|
Provides-Extra: chunking
|
|
29
29
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
30
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
|
+
Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
|
|
31
32
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
32
33
|
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
33
34
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import base64
|
|
4
4
|
import copy
|
|
5
5
|
import hashlib
|
|
6
|
+
import html
|
|
6
7
|
import json
|
|
7
8
|
import mimetypes
|
|
8
9
|
import os
|
|
@@ -15,7 +16,10 @@ from io import BytesIO
|
|
|
15
16
|
from pathlib import Path
|
|
16
17
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
17
18
|
from urllib.parse import quote, unquote
|
|
19
|
+
from xml.etree.cElementTree import SubElement, tostring
|
|
20
|
+
from xml.sax.saxutils import unescape
|
|
18
21
|
|
|
22
|
+
import latex2mathml.converter
|
|
19
23
|
import pandas as pd
|
|
20
24
|
import yaml
|
|
21
25
|
from PIL import Image as PILImage
|
|
@@ -1045,7 +1049,7 @@ class TableItem(FloatingItem):
|
|
|
1045
1049
|
|
|
1046
1050
|
text = ""
|
|
1047
1051
|
if doc is not None and add_caption and len(self.captions):
|
|
1048
|
-
text = self.caption_text(doc)
|
|
1052
|
+
text = html.escape(self.caption_text(doc))
|
|
1049
1053
|
|
|
1050
1054
|
if len(self.data.table_cells) == 0:
|
|
1051
1055
|
return ""
|
|
@@ -1071,7 +1075,7 @@ class TableItem(FloatingItem):
|
|
|
1071
1075
|
if colstart != j:
|
|
1072
1076
|
continue
|
|
1073
1077
|
|
|
1074
|
-
content = cell.text.strip()
|
|
1078
|
+
content = html.escape(cell.text.strip())
|
|
1075
1079
|
celltag = "td"
|
|
1076
1080
|
if cell.column_header:
|
|
1077
1081
|
celltag = "th"
|
|
@@ -1386,6 +1390,20 @@ class DoclingDocument(BaseModel):
|
|
|
1386
1390
|
table tr:nth-child(even) td{
|
|
1387
1391
|
background-color: LightGray;
|
|
1388
1392
|
}
|
|
1393
|
+
math annotation {
|
|
1394
|
+
display: none;
|
|
1395
|
+
}
|
|
1396
|
+
.formula-not-decoded {
|
|
1397
|
+
background: repeating-linear-gradient(
|
|
1398
|
+
45deg, /* Angle of the stripes */
|
|
1399
|
+
LightGray, /* First color */
|
|
1400
|
+
LightGray 10px, /* Length of the first color */
|
|
1401
|
+
White 10px, /* Second color */
|
|
1402
|
+
White 20px /* Length of the second color */
|
|
1403
|
+
);
|
|
1404
|
+
margin: 0;
|
|
1405
|
+
text-align: center;
|
|
1406
|
+
}
|
|
1389
1407
|
</style>
|
|
1390
1408
|
</head>"""
|
|
1391
1409
|
|
|
@@ -2082,6 +2100,46 @@ class DoclingDocument(BaseModel):
|
|
|
2082
2100
|
previous_level = 0 # Track the previous item's level
|
|
2083
2101
|
in_list = False # Track if we're currently processing list items
|
|
2084
2102
|
|
|
2103
|
+
# Our export markdown doesn't contain any emphasis styling:
|
|
2104
|
+
# Bold, Italic, or Bold-Italic
|
|
2105
|
+
# Hence, any underscore that we print into Markdown is coming from document text
|
|
2106
|
+
# That means we need to escape it, to properly reflect content in the markdown
|
|
2107
|
+
# However, we need to preserve underscores in image URLs
|
|
2108
|
+
# to maintain their validity
|
|
2109
|
+
# For example: ![image](path/to_image.png) should remain unchanged
|
|
2110
|
+
def _escape_underscores(text):
|
|
2111
|
+
"""Escape underscores but leave them intact in the URL.."""
|
|
2112
|
+
# Firstly, identify all the URL patterns.
|
|
2113
|
+
url_pattern = r"!\[.*?\]\((.*?)\)"
|
|
2114
|
+
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
|
|
2115
|
+
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
|
|
2116
|
+
combined_pattern = f"({url_pattern})|({latex_pattern})"
|
|
2117
|
+
|
|
2118
|
+
parts = []
|
|
2119
|
+
last_end = 0
|
|
2120
|
+
|
|
2121
|
+
for match in re.finditer(combined_pattern, text):
|
|
2122
|
+
# Text to add before the URL (needs to be escaped)
|
|
2123
|
+
before_url = text[last_end : match.start()]
|
|
2124
|
+
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
|
|
2125
|
+
|
|
2126
|
+
# Add the full URL part (do not escape)
|
|
2127
|
+
parts.append(match.group(0))
|
|
2128
|
+
last_end = match.end()
|
|
2129
|
+
|
|
2130
|
+
# Add the final part of the text (which needs to be escaped)
|
|
2131
|
+
if last_end < len(text):
|
|
2132
|
+
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
|
|
2133
|
+
|
|
2134
|
+
return "".join(parts)
|
|
2135
|
+
|
|
2136
|
+
def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
|
|
2137
|
+
if do_escape_underscores and escaping_underscores:
|
|
2138
|
+
text = _escape_underscores(text)
|
|
2139
|
+
if do_escape_html:
|
|
2140
|
+
text = html.escape(text, quote=False)
|
|
2141
|
+
mdtexts.append(text)
|
|
2142
|
+
|
|
2085
2143
|
for ix, (item, level) in enumerate(
|
|
2086
2144
|
self.iterate_items(self.body, with_groups=True, page_no=page_no)
|
|
2087
2145
|
):
|
|
@@ -2130,7 +2188,7 @@ class DoclingDocument(BaseModel):
|
|
|
2130
2188
|
in_list = False
|
|
2131
2189
|
marker = "" if strict_text else "#"
|
|
2132
2190
|
text = f"{marker} {item.text}"
|
|
2133
|
-
|
|
2191
|
+
_append_text(text.strip() + "\n")
|
|
2134
2192
|
|
|
2135
2193
|
elif (
|
|
2136
2194
|
isinstance(item, TextItem)
|
|
@@ -2143,12 +2201,12 @@ class DoclingDocument(BaseModel):
|
|
|
2143
2201
|
if len(marker) < 2:
|
|
2144
2202
|
marker = "##"
|
|
2145
2203
|
text = f"{marker} {item.text}\n"
|
|
2146
|
-
|
|
2204
|
+
_append_text(text.strip() + "\n")
|
|
2147
2205
|
|
|
2148
2206
|
elif isinstance(item, CodeItem) and item.label in labels:
|
|
2149
2207
|
in_list = False
|
|
2150
2208
|
text = f"```\n{item.text}\n```\n"
|
|
2151
|
-
|
|
2209
|
+
_append_text(text, do_escape_underscores=False, do_escape_html=False)
|
|
2152
2210
|
|
|
2153
2211
|
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2154
2212
|
in_list = True
|
|
@@ -2165,30 +2223,42 @@ class DoclingDocument(BaseModel):
|
|
|
2165
2223
|
marker = "-" # Markdown needs only dash as item marker.
|
|
2166
2224
|
|
|
2167
2225
|
text = f"{list_indent}{marker} {item.text}"
|
|
2168
|
-
|
|
2226
|
+
_append_text(text)
|
|
2169
2227
|
|
|
2170
2228
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2171
2229
|
in_list = False
|
|
2172
|
-
|
|
2230
|
+
if item.text != "":
|
|
2231
|
+
_append_text(
|
|
2232
|
+
f"$${item.text}$$\n",
|
|
2233
|
+
do_escape_underscores=False,
|
|
2234
|
+
do_escape_html=False,
|
|
2235
|
+
)
|
|
2236
|
+
elif item.orig != "":
|
|
2237
|
+
_append_text(
|
|
2238
|
+
"<!-- formula-not-decoded -->\n",
|
|
2239
|
+
do_escape_underscores=False,
|
|
2240
|
+
do_escape_html=False,
|
|
2241
|
+
)
|
|
2173
2242
|
|
|
2174
2243
|
elif isinstance(item, TextItem) and item.label in labels:
|
|
2175
2244
|
in_list = False
|
|
2176
2245
|
if len(item.text) and text_width > 0:
|
|
2246
|
+
text = item.text
|
|
2177
2247
|
wrapped_text = textwrap.fill(text, width=text_width)
|
|
2178
|
-
|
|
2248
|
+
_append_text(wrapped_text + "\n")
|
|
2179
2249
|
elif len(item.text):
|
|
2180
2250
|
text = f"{item.text}\n"
|
|
2181
|
-
|
|
2251
|
+
_append_text(text)
|
|
2182
2252
|
|
|
2183
2253
|
elif isinstance(item, TableItem) and not strict_text:
|
|
2184
2254
|
in_list = False
|
|
2185
|
-
|
|
2255
|
+
_append_text(item.caption_text(self))
|
|
2186
2256
|
md_table = item.export_to_markdown()
|
|
2187
|
-
|
|
2257
|
+
_append_text("\n" + md_table + "\n")
|
|
2188
2258
|
|
|
2189
2259
|
elif isinstance(item, PictureItem) and not strict_text:
|
|
2190
2260
|
in_list = False
|
|
2191
|
-
|
|
2261
|
+
_append_text(item.caption_text(self))
|
|
2192
2262
|
|
|
2193
2263
|
line = item.export_to_markdown(
|
|
2194
2264
|
doc=self,
|
|
@@ -2196,54 +2266,18 @@ class DoclingDocument(BaseModel):
|
|
|
2196
2266
|
image_mode=image_mode,
|
|
2197
2267
|
)
|
|
2198
2268
|
|
|
2199
|
-
|
|
2269
|
+
_append_text(line, do_escape_html=False, do_escape_underscores=False)
|
|
2200
2270
|
|
|
2201
2271
|
elif isinstance(item, DocItem) and item.label in labels:
|
|
2202
2272
|
in_list = False
|
|
2203
|
-
text = "
|
|
2204
|
-
|
|
2273
|
+
text = "<!-- missing-text -->"
|
|
2274
|
+
_append_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2205
2275
|
|
|
2206
2276
|
mdtext = (delim.join(mdtexts)).strip()
|
|
2207
2277
|
mdtext = re.sub(
|
|
2208
2278
|
r"\n\n\n+", "\n\n", mdtext
|
|
2209
2279
|
) # remove cases of double or more empty lines.
|
|
2210
2280
|
|
|
2211
|
-
# Our export markdown doesn't contain any emphasis styling:
|
|
2212
|
-
# Bold, Italic, or Bold-Italic
|
|
2213
|
-
# Hence, any underscore that we print into Markdown is coming from document text
|
|
2214
|
-
# That means we need to escape it, to properly reflect content in the markdown
|
|
2215
|
-
# However, we need to preserve underscores in image URLs
|
|
2216
|
-
# to maintain their validity
|
|
2217
|
-
# For example: ![image](path/to_image.png) should remain unchanged
|
|
2218
|
-
def escape_underscores(text):
|
|
2219
|
-
"""Escape underscores but leave them intact in the URL.."""
|
|
2220
|
-
# Firstly, identify all the URL patterns.
|
|
2221
|
-
url_pattern = r"!\[.*?\]\((.*?)\)"
|
|
2222
|
-
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
|
|
2223
|
-
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
|
|
2224
|
-
combined_pattern = f"({url_pattern})|({latex_pattern})"
|
|
2225
|
-
|
|
2226
|
-
parts = []
|
|
2227
|
-
last_end = 0
|
|
2228
|
-
|
|
2229
|
-
for match in re.finditer(combined_pattern, text):
|
|
2230
|
-
# Text to add before the URL (needs to be escaped)
|
|
2231
|
-
before_url = text[last_end : match.start()]
|
|
2232
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
|
|
2233
|
-
|
|
2234
|
-
# Add the full URL part (do not escape)
|
|
2235
|
-
parts.append(match.group(0))
|
|
2236
|
-
last_end = match.end()
|
|
2237
|
-
|
|
2238
|
-
# Add the final part of the text (which needs to be escaped)
|
|
2239
|
-
if last_end < len(text):
|
|
2240
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
|
|
2241
|
-
|
|
2242
|
-
return "".join(parts)
|
|
2243
|
-
|
|
2244
|
-
if escaping_underscores:
|
|
2245
|
-
mdtext = escape_underscores(mdtext)
|
|
2246
|
-
|
|
2247
2281
|
return mdtext
|
|
2248
2282
|
|
|
2249
2283
|
def export_to_text( # noqa: C901
|
|
@@ -2272,6 +2306,7 @@ class DoclingDocument(BaseModel):
|
|
|
2272
2306
|
to_element: int = sys.maxsize,
|
|
2273
2307
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
2274
2308
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2309
|
+
formula_to_mathml: bool = True,
|
|
2275
2310
|
page_no: Optional[int] = None,
|
|
2276
2311
|
html_lang: str = "en",
|
|
2277
2312
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
@@ -2291,6 +2326,7 @@ class DoclingDocument(BaseModel):
|
|
|
2291
2326
|
to_element=to_element,
|
|
2292
2327
|
labels=labels,
|
|
2293
2328
|
image_mode=image_mode,
|
|
2329
|
+
formula_to_mathml=formula_to_mathml,
|
|
2294
2330
|
page_no=page_no,
|
|
2295
2331
|
html_lang=html_lang,
|
|
2296
2332
|
html_head=html_head,
|
|
@@ -2337,6 +2373,7 @@ class DoclingDocument(BaseModel):
|
|
|
2337
2373
|
to_element: int = sys.maxsize,
|
|
2338
2374
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
2339
2375
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2376
|
+
formula_to_mathml: bool = True,
|
|
2340
2377
|
page_no: Optional[int] = None,
|
|
2341
2378
|
html_lang: str = "en",
|
|
2342
2379
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
@@ -2371,6 +2408,15 @@ class DoclingDocument(BaseModel):
|
|
|
2371
2408
|
|
|
2372
2409
|
in_ordered_list: List[bool] = [] # False
|
|
2373
2410
|
|
|
2411
|
+
def _prepare_tag_content(
|
|
2412
|
+
text: str, do_escape_html=True, do_replace_newline=True
|
|
2413
|
+
) -> str:
|
|
2414
|
+
if do_escape_html:
|
|
2415
|
+
text = html.escape(text, quote=False)
|
|
2416
|
+
if do_replace_newline:
|
|
2417
|
+
text = text.replace("\n", "<br>")
|
|
2418
|
+
return text
|
|
2419
|
+
|
|
2374
2420
|
for ix, (item, curr_level) in enumerate(
|
|
2375
2421
|
self.iterate_items(self.body, with_groups=True, page_no=page_no)
|
|
2376
2422
|
):
|
|
@@ -2401,7 +2447,7 @@ class DoclingDocument(BaseModel):
|
|
|
2401
2447
|
]:
|
|
2402
2448
|
|
|
2403
2449
|
text = "<ol>"
|
|
2404
|
-
html_texts.append(text
|
|
2450
|
+
html_texts.append(text)
|
|
2405
2451
|
|
|
2406
2452
|
# Increment list nesting level when entering a new list
|
|
2407
2453
|
in_ordered_list.append(True)
|
|
@@ -2411,7 +2457,7 @@ class DoclingDocument(BaseModel):
|
|
|
2411
2457
|
]:
|
|
2412
2458
|
|
|
2413
2459
|
text = "<ul>"
|
|
2414
|
-
html_texts.append(text
|
|
2460
|
+
html_texts.append(text)
|
|
2415
2461
|
|
|
2416
2462
|
# Increment list nesting level when entering a new list
|
|
2417
2463
|
in_ordered_list.append(False)
|
|
@@ -2421,54 +2467,86 @@ class DoclingDocument(BaseModel):
|
|
|
2421
2467
|
|
|
2422
2468
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2423
2469
|
|
|
2424
|
-
text = f"<h1>{item.text}</h1>"
|
|
2425
|
-
html_texts.append(text
|
|
2470
|
+
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
|
|
2471
|
+
html_texts.append(text)
|
|
2426
2472
|
|
|
2427
2473
|
elif isinstance(item, SectionHeaderItem):
|
|
2428
2474
|
|
|
2429
|
-
section_level: int = item.level + 1
|
|
2430
|
-
|
|
2431
|
-
text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
|
|
2432
|
-
html_texts.append(text.strip())
|
|
2433
|
-
|
|
2434
|
-
elif isinstance(item, TextItem) and item.label in [
|
|
2435
|
-
DocItemLabel.SECTION_HEADER
|
|
2436
|
-
]:
|
|
2475
|
+
section_level: int = min(item.level + 1, 6)
|
|
2437
2476
|
|
|
2438
|
-
|
|
2477
|
+
text = (
|
|
2478
|
+
f"<h{(section_level)}>"
|
|
2479
|
+
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
|
|
2480
|
+
)
|
|
2481
|
+
html_texts.append(text)
|
|
2439
2482
|
|
|
2440
|
-
|
|
2441
|
-
section_level = 2
|
|
2483
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2442
2484
|
|
|
2443
|
-
|
|
2444
|
-
|
|
2485
|
+
math_formula = _prepare_tag_content(
|
|
2486
|
+
item.text, do_escape_html=False, do_replace_newline=False
|
|
2487
|
+
)
|
|
2488
|
+
text = ""
|
|
2489
|
+
|
|
2490
|
+
# If the formula is not processed correctly, use its image
|
|
2491
|
+
if (
|
|
2492
|
+
item.text == ""
|
|
2493
|
+
and item.orig != ""
|
|
2494
|
+
and image_mode == ImageRefMode.EMBEDDED
|
|
2495
|
+
and len(item.prov) > 0
|
|
2496
|
+
):
|
|
2497
|
+
item_image = item.get_image(doc=self)
|
|
2498
|
+
if item_image is not None:
|
|
2499
|
+
img_ref = ImageRef.from_pil(item_image, dpi=72)
|
|
2500
|
+
text = (
|
|
2501
|
+
"<figure>"
|
|
2502
|
+
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
|
|
2503
|
+
"</figure>"
|
|
2504
|
+
)
|
|
2445
2505
|
|
|
2446
|
-
|
|
2447
|
-
|
|
2506
|
+
# Building a math equation in MathML format
|
|
2507
|
+
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2508
|
+
elif formula_to_mathml:
|
|
2509
|
+
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2510
|
+
math_formula, display="block"
|
|
2511
|
+
)
|
|
2512
|
+
annotation = SubElement(
|
|
2513
|
+
mathml_element, "annotation", dict(encoding="TeX")
|
|
2514
|
+
)
|
|
2515
|
+
annotation.text = math_formula
|
|
2516
|
+
mathml = unescape(tostring(mathml_element, encoding="unicode"))
|
|
2517
|
+
text = f"<div>{mathml}</div>"
|
|
2448
2518
|
|
|
2449
|
-
|
|
2519
|
+
elif math_formula != "":
|
|
2520
|
+
text = f"<pre>{math_formula}</pre>"
|
|
2450
2521
|
|
|
2451
|
-
text
|
|
2452
|
-
|
|
2522
|
+
if text != "":
|
|
2523
|
+
html_texts.append(text)
|
|
2524
|
+
else:
|
|
2525
|
+
html_texts.append(
|
|
2526
|
+
'<div class="formula-not-decoded">Formula not decoded</div>'
|
|
2527
|
+
)
|
|
2453
2528
|
|
|
2454
2529
|
elif isinstance(item, ListItem):
|
|
2455
2530
|
|
|
2456
|
-
text = f"<li>{item.text}</li>"
|
|
2531
|
+
text = f"<li>{_prepare_tag_content(item.text)}</li>"
|
|
2457
2532
|
html_texts.append(text)
|
|
2458
2533
|
|
|
2459
2534
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2460
2535
|
|
|
2461
|
-
text = f"<li>{item.text}</li>"
|
|
2536
|
+
text = f"<li>{_prepare_tag_content(item.text)}</li>"
|
|
2462
2537
|
html_texts.append(text)
|
|
2463
2538
|
|
|
2464
|
-
elif isinstance(item, CodeItem)
|
|
2465
|
-
|
|
2466
|
-
|
|
2539
|
+
elif isinstance(item, CodeItem):
|
|
2540
|
+
code_text = _prepare_tag_content(
|
|
2541
|
+
item.text, do_escape_html=False, do_replace_newline=False
|
|
2542
|
+
)
|
|
2543
|
+
text = f"<pre><code>{code_text}</code></pre>"
|
|
2544
|
+
html_texts.append(text)
|
|
2467
2545
|
|
|
2468
|
-
elif isinstance(item, TextItem)
|
|
2546
|
+
elif isinstance(item, TextItem):
|
|
2469
2547
|
|
|
2470
|
-
text = f"<p>{item.text}</p>"
|
|
2471
|
-
html_texts.append(text
|
|
2548
|
+
text = f"<p>{_prepare_tag_content(item.text)}</p>"
|
|
2549
|
+
html_texts.append(text)
|
|
2472
2550
|
elif isinstance(item, TableItem):
|
|
2473
2551
|
|
|
2474
2552
|
text = item.export_to_html(doc=self, add_caption=True)
|
|
@@ -2489,8 +2567,7 @@ class DoclingDocument(BaseModel):
|
|
|
2489
2567
|
|
|
2490
2568
|
lines = []
|
|
2491
2569
|
lines.extend(head_lines)
|
|
2492
|
-
|
|
2493
|
-
lines.append(line.replace("\n", "<br>"))
|
|
2570
|
+
lines.extend(html_texts)
|
|
2494
2571
|
|
|
2495
2572
|
delim = "\n"
|
|
2496
2573
|
html_text = (delim.join(lines)).strip()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.16.0"
|
|
3
|
+
version = "2.17.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
|
|
|
59
59
|
transformers = { version = "^4.34.0", optional = true }
|
|
60
60
|
semchunk = { version = "^2.2.0", optional = true }
|
|
61
61
|
typer = "^0.12.5"
|
|
62
|
+
latex2mathml = "^3.77.0"
|
|
62
63
|
|
|
63
64
|
[tool.poetry.extras]
|
|
64
65
|
chunking = ["transformers", "semchunk"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.16.0 → docling_core-2.17.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.16.0 → docling_core-2.17.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.16.0 → docling_core-2.17.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|