docling-core 2.17.1__tar.gz → 2.17.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.17.1 → docling_core-2.17.2}/PKG-INFO +1 -1
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/document.py +37 -13
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/utils.py +27 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/pyproject.toml +1 -1
- {docling_core-2.17.1 → docling_core-2.17.2}/LICENSE +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/README.md +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/cli/view.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/py.typed +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/mapping.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/meta.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/package.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/alias.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/file.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/validate.py +0 -0
- {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/validators.py +0 -0
|
@@ -44,7 +44,11 @@ from docling_core.types.doc import BoundingBox, Size
|
|
|
44
44
|
from docling_core.types.doc.base import ImageRefMode
|
|
45
45
|
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
|
|
46
46
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
47
|
-
from docling_core.types.doc.utils import
|
|
47
|
+
from docling_core.types.doc.utils import (
|
|
48
|
+
get_html_tag_with_text_direction,
|
|
49
|
+
get_text_direction,
|
|
50
|
+
relative_path,
|
|
51
|
+
)
|
|
48
52
|
|
|
49
53
|
_logger = logging.getLogger(__name__)
|
|
50
54
|
|
|
@@ -866,7 +870,9 @@ class PictureItem(FloatingItem):
|
|
|
866
870
|
|
|
867
871
|
caption_text = ""
|
|
868
872
|
if len(text) > 0:
|
|
869
|
-
caption_text =
|
|
873
|
+
caption_text = get_html_tag_with_text_direction(
|
|
874
|
+
html_tag="figcaption", text=text
|
|
875
|
+
)
|
|
870
876
|
|
|
871
877
|
default_response = f"<figure>{caption_text}</figure>"
|
|
872
878
|
|
|
@@ -1090,15 +1096,28 @@ class TableItem(FloatingItem):
|
|
|
1090
1096
|
if colspan > 1:
|
|
1091
1097
|
opening_tag += f' colspan="{colspan}"'
|
|
1092
1098
|
|
|
1099
|
+
text_dir = get_text_direction(content)
|
|
1100
|
+
if text_dir == "rtl":
|
|
1101
|
+
opening_tag += f' dir="{text_dir}"'
|
|
1102
|
+
|
|
1093
1103
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
1094
1104
|
body += "</tr>"
|
|
1095
1105
|
|
|
1106
|
+
# dir = get_text_direction(text)
|
|
1107
|
+
|
|
1096
1108
|
if len(text) > 0 and len(body) > 0:
|
|
1097
|
-
|
|
1109
|
+
caption_text = get_html_tag_with_text_direction(
|
|
1110
|
+
html_tag="caption", text=text
|
|
1111
|
+
)
|
|
1112
|
+
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
|
|
1113
|
+
|
|
1098
1114
|
elif len(text) == 0 and len(body) > 0:
|
|
1099
1115
|
body = f"<table><tbody>{body}</tbody></table>"
|
|
1100
1116
|
elif len(text) > 0 and len(body) == 0:
|
|
1101
|
-
|
|
1117
|
+
caption_text = get_html_tag_with_text_direction(
|
|
1118
|
+
html_tag="caption", text=text
|
|
1119
|
+
)
|
|
1120
|
+
body = f"<table>{caption_text}</table>"
|
|
1102
1121
|
else:
|
|
1103
1122
|
body = "<table></table>"
|
|
1104
1123
|
|
|
@@ -2470,17 +2489,17 @@ class DoclingDocument(BaseModel):
|
|
|
2470
2489
|
continue
|
|
2471
2490
|
|
|
2472
2491
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2492
|
+
text_inner = _prepare_tag_content(item.text)
|
|
2493
|
+
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
|
|
2473
2494
|
|
|
2474
|
-
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
|
|
2475
2495
|
html_texts.append(text)
|
|
2476
2496
|
|
|
2477
2497
|
elif isinstance(item, SectionHeaderItem):
|
|
2478
2498
|
|
|
2479
2499
|
section_level: int = min(item.level + 1, 6)
|
|
2480
2500
|
|
|
2481
|
-
text = (
|
|
2482
|
-
f"
|
|
2483
|
-
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
|
|
2501
|
+
text = get_html_tag_with_text_direction(
|
|
2502
|
+
html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
|
|
2484
2503
|
)
|
|
2485
2504
|
html_texts.append(text)
|
|
2486
2505
|
|
|
@@ -2544,13 +2563,15 @@ class DoclingDocument(BaseModel):
|
|
|
2544
2563
|
)
|
|
2545
2564
|
|
|
2546
2565
|
elif isinstance(item, ListItem):
|
|
2547
|
-
|
|
2548
|
-
|
|
2566
|
+
text = get_html_tag_with_text_direction(
|
|
2567
|
+
html_tag="li", text=_prepare_tag_content(item.text)
|
|
2568
|
+
)
|
|
2549
2569
|
html_texts.append(text)
|
|
2550
2570
|
|
|
2551
2571
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2552
|
-
|
|
2553
|
-
|
|
2572
|
+
text = get_html_tag_with_text_direction(
|
|
2573
|
+
html_tag="li", text=_prepare_tag_content(item.text)
|
|
2574
|
+
)
|
|
2554
2575
|
html_texts.append(text)
|
|
2555
2576
|
|
|
2556
2577
|
elif isinstance(item, CodeItem):
|
|
@@ -2562,8 +2583,11 @@ class DoclingDocument(BaseModel):
|
|
|
2562
2583
|
|
|
2563
2584
|
elif isinstance(item, TextItem):
|
|
2564
2585
|
|
|
2565
|
-
text =
|
|
2586
|
+
text = get_html_tag_with_text_direction(
|
|
2587
|
+
html_tag="p", text=_prepare_tag_content(item.text)
|
|
2588
|
+
)
|
|
2566
2589
|
html_texts.append(text)
|
|
2590
|
+
|
|
2567
2591
|
elif isinstance(item, TableItem):
|
|
2568
2592
|
|
|
2569
2593
|
text = item.export_to_html(doc=self, add_caption=True)
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
"""Utils for document types."""
|
|
7
7
|
|
|
8
|
+
import unicodedata
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
|
|
10
11
|
|
|
@@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
|
|
|
46
47
|
|
|
47
48
|
# Combine and return the result
|
|
48
49
|
return Path(*up_segments, *down_segments)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
    """Wrap ``text`` in an ``html_tag`` element, marking non-LTR content.

    The direction is obtained from ``get_text_direction(text)``. When it is
    ``"ltr"`` the element is emitted without attributes; for any other value
    a ``dir`` attribute carrying that value is added to the opening tag.

    ``text`` is inserted verbatim: this function performs no escaping.
    """
    direction = get_text_direction(text)

    # Left-to-right needs no explicit marker, so the attribute is empty then.
    dir_attribute = "" if direction == "ltr" else f' dir="{direction}"'
    return f"<{html_tag}{dir_attribute}>{text}</{html_tag}>"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_text_direction(text: str) -> str:
    """Determine the text direction of a given string as LTR or RTL script.

    The string is classified as ``"rtl"`` when either of these holds:

    * its first *strong* directional character (Unicode bidi class ``L``,
      ``R`` or ``AL``) is right-to-left, i.e. ``R`` or ``AL``; or
    * more than half of all its characters are right-to-left.

    Otherwise, including for empty input, ``"ltr"`` is returned.

    Looking for the first strong character, rather than only at ``text[0]``,
    keeps leading whitespace, quotes, bullets or digits (all directionally
    neutral or weak) from hiding the script that actually starts the text.
    """
    if not text:
        return "ltr"  # Default for empty input

    rtl_classes = {"R", "AL"}
    strong_classes = rtl_classes | {"L"}

    rtl_chars = 0
    first_strong = None
    for char in text:
        bidi_class = unicodedata.bidirectional(char)
        # Remember only the first strong character; later ones do not
        # affect the leading-direction rule.
        if first_strong is None and bidi_class in strong_classes:
            first_strong = bidi_class
        if bidi_class in rtl_classes:
            rtl_chars += 1

    if first_strong in rtl_classes or rtl_chars > len(text) / 2:
        return "rtl"
    return "ltr"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|