docling-core 2.17.0__tar.gz → 2.17.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.17.0 → docling_core-2.17.2}/PKG-INFO +1 -1
  2. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/document.py +71 -30
  3. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/utils.py +27 -0
  4. {docling_core-2.17.0 → docling_core-2.17.2}/pyproject.toml +1 -1
  5. {docling_core-2.17.0 → docling_core-2.17.2}/LICENSE +0 -0
  6. {docling_core-2.17.0 → docling_core-2.17.2}/README.md +0 -0
  7. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/__init__.py +0 -0
  8. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/py.typed +0 -0
  11. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/search/package.py +0 -0
  24. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  29. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/__init__.py +0 -0
  30. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/base.py +0 -0
  31. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/__init__.py +0 -0
  32. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/base.py +0 -0
  33. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/labels.py +0 -0
  34. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/doc/tokens.py +0 -0
  35. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.17.0 → docling_core-2.17.2}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.17.0
3
+ Version: 2.17.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -5,6 +5,7 @@ import copy
5
5
  import hashlib
6
6
  import html
7
7
  import json
8
+ import logging
8
9
  import mimetypes
9
10
  import os
10
11
  import re
@@ -20,6 +21,7 @@ from xml.etree.cElementTree import SubElement, tostring
20
21
  from xml.sax.saxutils import unescape
21
22
 
22
23
  import latex2mathml.converter
24
+ import latex2mathml.exceptions
23
25
  import pandas as pd
24
26
  import yaml
25
27
  from PIL import Image as PILImage
@@ -42,7 +44,13 @@ from docling_core.types.doc import BoundingBox, Size
42
44
  from docling_core.types.doc.base import ImageRefMode
43
45
  from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
44
46
  from docling_core.types.doc.tokens import DocumentToken, TableToken
45
- from docling_core.types.doc.utils import relative_path
47
+ from docling_core.types.doc.utils import (
48
+ get_html_tag_with_text_direction,
49
+ get_text_direction,
50
+ relative_path,
51
+ )
52
+
53
+ _logger = logging.getLogger(__name__)
46
54
 
47
55
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
48
56
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -862,7 +870,9 @@ class PictureItem(FloatingItem):
862
870
 
863
871
  caption_text = ""
864
872
  if len(text) > 0:
865
- caption_text = f"<figcaption>{text}</figcaption>"
873
+ caption_text = get_html_tag_with_text_direction(
874
+ html_tag="figcaption", text=text
875
+ )
866
876
 
867
877
  default_response = f"<figure>{caption_text}</figure>"
868
878
 
@@ -1086,15 +1096,28 @@ class TableItem(FloatingItem):
1086
1096
  if colspan > 1:
1087
1097
  opening_tag += f' colspan="{colspan}"'
1088
1098
 
1099
+ text_dir = get_text_direction(content)
1100
+ if text_dir == "rtl":
1101
+ opening_tag += f' dir="{dir}"'
1102
+
1089
1103
  body += f"<{opening_tag}>{content}</{celltag}>"
1090
1104
  body += "</tr>"
1091
1105
 
1106
+ # dir = get_text_direction(text)
1107
+
1092
1108
  if len(text) > 0 and len(body) > 0:
1093
- body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
1109
+ caption_text = get_html_tag_with_text_direction(
1110
+ html_tag="caption", text=text
1111
+ )
1112
+ body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
1113
+
1094
1114
  elif len(text) == 0 and len(body) > 0:
1095
1115
  body = f"<table><tbody>{body}</tbody></table>"
1096
1116
  elif len(text) > 0 and len(body) == 0:
1097
- body = f"<table><caption>{text}</caption></table>"
1117
+ caption_text = get_html_tag_with_text_direction(
1118
+ html_tag="caption", text=text
1119
+ )
1120
+ body = f"<table>{caption_text}</table>"
1098
1121
  else:
1099
1122
  body = "<table></table>"
1100
1123
 
@@ -2466,17 +2489,17 @@ class DoclingDocument(BaseModel):
2466
2489
  continue
2467
2490
 
2468
2491
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2492
+ text_inner = _prepare_tag_content(item.text)
2493
+ text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
2469
2494
 
2470
- text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
2471
2495
  html_texts.append(text)
2472
2496
 
2473
2497
  elif isinstance(item, SectionHeaderItem):
2474
2498
 
2475
2499
  section_level: int = min(item.level + 1, 6)
2476
2500
 
2477
- text = (
2478
- f"<h{(section_level)}>"
2479
- f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
2501
+ text = get_html_tag_with_text_direction(
2502
+ html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
2480
2503
  )
2481
2504
  html_texts.append(text)
2482
2505
 
@@ -2487,34 +2510,47 @@ class DoclingDocument(BaseModel):
2487
2510
  )
2488
2511
  text = ""
2489
2512
 
2490
- # If the formula is not processed correctly, use its image
2491
- if (
2492
- item.text == ""
2493
- and item.orig != ""
2494
- and image_mode == ImageRefMode.EMBEDDED
2495
- and len(item.prov) > 0
2496
- ):
2513
+ def _image_fallback(item: TextItem):
2497
2514
  item_image = item.get_image(doc=self)
2498
2515
  if item_image is not None:
2499
2516
  img_ref = ImageRef.from_pil(item_image, dpi=72)
2500
- text = (
2517
+ return (
2501
2518
  "<figure>"
2502
2519
  f'<img src="{img_ref.uri}" alt="{item.orig}" />'
2503
2520
  "</figure>"
2504
2521
  )
2505
2522
 
2523
+ # If the formula is not processed correctly, use its image
2524
+ if (
2525
+ item.text == ""
2526
+ and item.orig != ""
2527
+ and image_mode == ImageRefMode.EMBEDDED
2528
+ and len(item.prov) > 0
2529
+ ):
2530
+ text = _image_fallback(item)
2531
+
2506
2532
  # Building a math equation in MathML format
2507
2533
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
2508
2534
  elif formula_to_mathml:
2509
- mathml_element = latex2mathml.converter.convert_to_element(
2510
- math_formula, display="block"
2511
- )
2512
- annotation = SubElement(
2513
- mathml_element, "annotation", dict(encoding="TeX")
2514
- )
2515
- annotation.text = math_formula
2516
- mathml = unescape(tostring(mathml_element, encoding="unicode"))
2517
- text = f"<div>{mathml}</div>"
2535
+ try:
2536
+ mathml_element = latex2mathml.converter.convert_to_element(
2537
+ math_formula, display="block"
2538
+ )
2539
+ annotation = SubElement(
2540
+ mathml_element, "annotation", dict(encoding="TeX")
2541
+ )
2542
+ annotation.text = math_formula
2543
+ mathml = unescape(tostring(mathml_element, encoding="unicode"))
2544
+ text = f"<div>{mathml}</div>"
2545
+ except Exception as err:
2546
+ _logger.warning(
2547
+ "Malformed formula cannot be rendered. "
2548
+ f"Error {err.__class__.__name__}, formula={math_formula}"
2549
+ )
2550
+ if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
2551
+ text = _image_fallback(item)
2552
+ else:
2553
+ text = f"<pre>{math_formula}</pre>"
2518
2554
 
2519
2555
  elif math_formula != "":
2520
2556
  text = f"<pre>{math_formula}</pre>"
@@ -2527,13 +2563,15 @@ class DoclingDocument(BaseModel):
2527
2563
  )
2528
2564
 
2529
2565
  elif isinstance(item, ListItem):
2530
-
2531
- text = f"<li>{_prepare_tag_content(item.text)}</li>"
2566
+ text = get_html_tag_with_text_direction(
2567
+ html_tag="li", text=_prepare_tag_content(item.text)
2568
+ )
2532
2569
  html_texts.append(text)
2533
2570
 
2534
2571
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
2535
-
2536
- text = f"<li>{_prepare_tag_content(item.text)}</li>"
2572
+ text = get_html_tag_with_text_direction(
2573
+ html_tag="li", text=_prepare_tag_content(item.text)
2574
+ )
2537
2575
  html_texts.append(text)
2538
2576
 
2539
2577
  elif isinstance(item, CodeItem):
@@ -2545,8 +2583,11 @@ class DoclingDocument(BaseModel):
2545
2583
 
2546
2584
  elif isinstance(item, TextItem):
2547
2585
 
2548
- text = f"<p>{_prepare_tag_content(item.text)}</p>"
2586
+ text = get_html_tag_with_text_direction(
2587
+ html_tag="p", text=_prepare_tag_content(item.text)
2588
+ )
2549
2589
  html_texts.append(text)
2590
+
2550
2591
  elif isinstance(item, TableItem):
2551
2592
 
2552
2593
  text = item.export_to_html(doc=self, add_caption=True)
@@ -5,6 +5,7 @@
5
5
 
6
6
  """Utils for document types."""
7
7
 
8
+ import unicodedata
8
9
  from pathlib import Path
9
10
 
10
11
 
@@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
46
47
 
47
48
  # Combine and return the result
48
49
  return Path(*up_segments, *down_segments)
50
+
51
+
52
+ def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
53
+ """Form the HTML element with tag, text, and optional dir attribute."""
54
+ text_dir = get_text_direction(text)
55
+
56
+ if text_dir == "ltr":
57
+ return f"<{html_tag}>{text}</{html_tag}>"
58
+ else:
59
+ return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'
60
+
61
+
62
+ def get_text_direction(text: str) -> str:
63
+ """Determine the text direction of a given string as LTR or RTL script."""
64
+ if not text:
65
+ return "ltr" # Default for empty input
66
+
67
+ rtl_scripts = {"R", "AL"}
68
+ rtl_chars = sum(unicodedata.bidirectional(c) in rtl_scripts for c in text)
69
+
70
+ return (
71
+ "rtl"
72
+ if unicodedata.bidirectional(text[0]) in rtl_scripts
73
+ or rtl_chars > len(text) / 2
74
+ else "ltr"
75
+ )
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.17.0"
3
+ version = "2.17.2"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes