docling-core 2.17.0__tar.gz → 2.17.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.17.0 → docling_core-2.17.2}/PKG-INFO +1 -1
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/document.py +71 -30
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/utils.py +27 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/pyproject.toml +1 -1
- {docling_core-2.17.0 → docling_core-2.17.2}/LICENSE +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/README.md +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/cli/view.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/py.typed +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/mapping.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/meta.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/package.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/alias.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/file.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/validate.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/validators.py +0 -0
|
@@ -5,6 +5,7 @@ import copy
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import html
|
|
7
7
|
import json
|
|
8
|
+
import logging
|
|
8
9
|
import mimetypes
|
|
9
10
|
import os
|
|
10
11
|
import re
|
|
@@ -20,6 +21,7 @@ from xml.etree.cElementTree import SubElement, tostring
|
|
|
20
21
|
from xml.sax.saxutils import unescape
|
|
21
22
|
|
|
22
23
|
import latex2mathml.converter
|
|
24
|
+
import latex2mathml.exceptions
|
|
23
25
|
import pandas as pd
|
|
24
26
|
import yaml
|
|
25
27
|
from PIL import Image as PILImage
|
|
@@ -42,7 +44,13 @@ from docling_core.types.doc import BoundingBox, Size
|
|
|
42
44
|
from docling_core.types.doc.base import ImageRefMode
|
|
43
45
|
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
|
|
44
46
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
45
|
-
from docling_core.types.doc.utils import relative_path
|
|
47
|
+
from docling_core.types.doc.utils import (
|
|
48
|
+
get_html_tag_with_text_direction,
|
|
49
|
+
get_text_direction,
|
|
50
|
+
relative_path,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
_logger = logging.getLogger(__name__)
|
|
46
54
|
|
|
47
55
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
48
56
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
@@ -862,7 +870,9 @@ class PictureItem(FloatingItem):
|
|
|
862
870
|
|
|
863
871
|
caption_text = ""
|
|
864
872
|
if len(text) > 0:
|
|
865
|
-
caption_text = f"<figcaption>{text}</figcaption>"
|
|
873
|
+
caption_text = get_html_tag_with_text_direction(
|
|
874
|
+
html_tag="figcaption", text=text
|
|
875
|
+
)
|
|
866
876
|
|
|
867
877
|
default_response = f"<figure>{caption_text}</figure>"
|
|
868
878
|
|
|
@@ -1086,15 +1096,28 @@ class TableItem(FloatingItem):
|
|
|
1086
1096
|
if colspan > 1:
|
|
1087
1097
|
opening_tag += f' colspan="{colspan}"'
|
|
1088
1098
|
|
|
1099
|
+
text_dir = get_text_direction(content)
|
|
1100
|
+
if text_dir == "rtl":
|
|
1101
|
+
opening_tag += f' dir="{dir}"'
|
|
1102
|
+
|
|
1089
1103
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
1090
1104
|
body += "</tr>"
|
|
1091
1105
|
|
|
1106
|
+
# dir = get_text_direction(text)
|
|
1107
|
+
|
|
1092
1108
|
if len(text) > 0 and len(body) > 0:
|
|
1093
|
-
|
|
1109
|
+
caption_text = get_html_tag_with_text_direction(
|
|
1110
|
+
html_tag="caption", text=text
|
|
1111
|
+
)
|
|
1112
|
+
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
|
|
1113
|
+
|
|
1094
1114
|
elif len(text) == 0 and len(body) > 0:
|
|
1095
1115
|
body = f"<table><tbody>{body}</tbody></table>"
|
|
1096
1116
|
elif len(text) > 0 and len(body) == 0:
|
|
1097
|
-
|
|
1117
|
+
caption_text = get_html_tag_with_text_direction(
|
|
1118
|
+
html_tag="caption", text=text
|
|
1119
|
+
)
|
|
1120
|
+
body = f"<table>{caption_text}</table>"
|
|
1098
1121
|
else:
|
|
1099
1122
|
body = "<table></table>"
|
|
1100
1123
|
|
|
@@ -2466,17 +2489,17 @@ class DoclingDocument(BaseModel):
|
|
|
2466
2489
|
continue
|
|
2467
2490
|
|
|
2468
2491
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2492
|
+
text_inner = _prepare_tag_content(item.text)
|
|
2493
|
+
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
|
|
2469
2494
|
|
|
2470
|
-
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
|
|
2471
2495
|
html_texts.append(text)
|
|
2472
2496
|
|
|
2473
2497
|
elif isinstance(item, SectionHeaderItem):
|
|
2474
2498
|
|
|
2475
2499
|
section_level: int = min(item.level + 1, 6)
|
|
2476
2500
|
|
|
2477
|
-
text = (
|
|
2478
|
-
f"<h{(section_level)}>"
|
|
2479
|
-
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
|
|
2501
|
+
text = get_html_tag_with_text_direction(
|
|
2502
|
+
html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
|
|
2480
2503
|
)
|
|
2481
2504
|
html_texts.append(text)
|
|
2482
2505
|
|
|
@@ -2487,34 +2510,47 @@ class DoclingDocument(BaseModel):
|
|
|
2487
2510
|
)
|
|
2488
2511
|
text = ""
|
|
2489
2512
|
|
|
2490
|
-
|
|
2491
|
-
if (
|
|
2492
|
-
item.text == ""
|
|
2493
|
-
and item.orig != ""
|
|
2494
|
-
and image_mode == ImageRefMode.EMBEDDED
|
|
2495
|
-
and len(item.prov) > 0
|
|
2496
|
-
):
|
|
2513
|
+
def _image_fallback(item: TextItem):
|
|
2497
2514
|
item_image = item.get_image(doc=self)
|
|
2498
2515
|
if item_image is not None:
|
|
2499
2516
|
img_ref = ImageRef.from_pil(item_image, dpi=72)
|
|
2500
|
-
|
|
2517
|
+
return (
|
|
2501
2518
|
"<figure>"
|
|
2502
2519
|
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
|
|
2503
2520
|
"</figure>"
|
|
2504
2521
|
)
|
|
2505
2522
|
|
|
2523
|
+
# If the formula is not processed correctly, use its image
|
|
2524
|
+
if (
|
|
2525
|
+
item.text == ""
|
|
2526
|
+
and item.orig != ""
|
|
2527
|
+
and image_mode == ImageRefMode.EMBEDDED
|
|
2528
|
+
and len(item.prov) > 0
|
|
2529
|
+
):
|
|
2530
|
+
text = _image_fallback(item)
|
|
2531
|
+
|
|
2506
2532
|
# Building a math equation in MathML format
|
|
2507
2533
|
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2508
2534
|
elif formula_to_mathml:
|
|
2509
|
-
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
|
|
2517
|
-
|
|
2535
|
+
try:
|
|
2536
|
+
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2537
|
+
math_formula, display="block"
|
|
2538
|
+
)
|
|
2539
|
+
annotation = SubElement(
|
|
2540
|
+
mathml_element, "annotation", dict(encoding="TeX")
|
|
2541
|
+
)
|
|
2542
|
+
annotation.text = math_formula
|
|
2543
|
+
mathml = unescape(tostring(mathml_element, encoding="unicode"))
|
|
2544
|
+
text = f"<div>{mathml}</div>"
|
|
2545
|
+
except Exception as err:
|
|
2546
|
+
_logger.warning(
|
|
2547
|
+
"Malformed formula cannot be rendered. "
|
|
2548
|
+
f"Error {err.__class__.__name__}, formula={math_formula}"
|
|
2549
|
+
)
|
|
2550
|
+
if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
|
|
2551
|
+
text = _image_fallback(item)
|
|
2552
|
+
else:
|
|
2553
|
+
text = f"<pre>{math_formula}</pre>"
|
|
2518
2554
|
|
|
2519
2555
|
elif math_formula != "":
|
|
2520
2556
|
text = f"<pre>{math_formula}</pre>"
|
|
@@ -2527,13 +2563,15 @@ class DoclingDocument(BaseModel):
|
|
|
2527
2563
|
)
|
|
2528
2564
|
|
|
2529
2565
|
elif isinstance(item, ListItem):
|
|
2530
|
-
|
|
2531
|
-
|
|
2566
|
+
text = get_html_tag_with_text_direction(
|
|
2567
|
+
html_tag="li", text=_prepare_tag_content(item.text)
|
|
2568
|
+
)
|
|
2532
2569
|
html_texts.append(text)
|
|
2533
2570
|
|
|
2534
2571
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2535
|
-
|
|
2536
|
-
|
|
2572
|
+
text = get_html_tag_with_text_direction(
|
|
2573
|
+
html_tag="li", text=_prepare_tag_content(item.text)
|
|
2574
|
+
)
|
|
2537
2575
|
html_texts.append(text)
|
|
2538
2576
|
|
|
2539
2577
|
elif isinstance(item, CodeItem):
|
|
@@ -2545,8 +2583,11 @@ class DoclingDocument(BaseModel):
|
|
|
2545
2583
|
|
|
2546
2584
|
elif isinstance(item, TextItem):
|
|
2547
2585
|
|
|
2548
|
-
text = f"<p>{_prepare_tag_content(item.text)}</p>"
|
|
2586
|
+
text = get_html_tag_with_text_direction(
|
|
2587
|
+
html_tag="p", text=_prepare_tag_content(item.text)
|
|
2588
|
+
)
|
|
2549
2589
|
html_texts.append(text)
|
|
2590
|
+
|
|
2550
2591
|
elif isinstance(item, TableItem):
|
|
2551
2592
|
|
|
2552
2593
|
text = item.export_to_html(doc=self, add_caption=True)
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
"""Utils for document types."""
|
|
7
7
|
|
|
8
|
+
import unicodedata
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
|
|
10
11
|
|
|
@@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
|
|
|
46
47
|
|
|
47
48
|
# Combine and return the result
|
|
48
49
|
return Path(*up_segments, *down_segments)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
|
|
53
|
+
"""Form the HTML element with tag, text, and optional dir attribute."""
|
|
54
|
+
text_dir = get_text_direction(text)
|
|
55
|
+
|
|
56
|
+
if text_dir == "ltr":
|
|
57
|
+
return f"<{html_tag}>{text}</{html_tag}>"
|
|
58
|
+
else:
|
|
59
|
+
return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_text_direction(text: str) -> str:
|
|
63
|
+
"""Determine the text direction of a given string as LTR or RTL script."""
|
|
64
|
+
if not text:
|
|
65
|
+
return "ltr" # Default for empty input
|
|
66
|
+
|
|
67
|
+
rtl_scripts = {"R", "AL"}
|
|
68
|
+
rtl_chars = sum(unicodedata.bidirectional(c) in rtl_scripts for c in text)
|
|
69
|
+
|
|
70
|
+
return (
|
|
71
|
+
"rtl"
|
|
72
|
+
if unicodedata.bidirectional(text[0]) in rtl_scripts
|
|
73
|
+
or rtl_chars > len(text) / 2
|
|
74
|
+
else "ltr"
|
|
75
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|