docling-core 2.17.1__tar.gz → 2.17.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.17.1 → docling_core-2.17.2}/PKG-INFO +1 -1
  2. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/document.py +37 -13
  3. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/utils.py +27 -0
  4. {docling_core-2.17.1 → docling_core-2.17.2}/pyproject.toml +1 -1
  5. {docling_core-2.17.1 → docling_core-2.17.2}/LICENSE +0 -0
  6. {docling_core-2.17.1 → docling_core-2.17.2}/README.md +0 -0
  7. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/__init__.py +0 -0
  8. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/py.typed +0 -0
  11. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/search/package.py +0 -0
  24. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  29. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/__init__.py +0 -0
  30. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/base.py +0 -0
  31. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/__init__.py +0 -0
  32. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/base.py +0 -0
  33. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/labels.py +0 -0
  34. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/doc/tokens.py +0 -0
  35. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.17.1 → docling_core-2.17.2}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.17.1
3
+ Version: 2.17.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -44,7 +44,11 @@ from docling_core.types.doc import BoundingBox, Size
44
44
  from docling_core.types.doc.base import ImageRefMode
45
45
  from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
46
46
  from docling_core.types.doc.tokens import DocumentToken, TableToken
47
- from docling_core.types.doc.utils import relative_path
47
+ from docling_core.types.doc.utils import (
48
+ get_html_tag_with_text_direction,
49
+ get_text_direction,
50
+ relative_path,
51
+ )
48
52
 
49
53
  _logger = logging.getLogger(__name__)
50
54
 
@@ -866,7 +870,9 @@ class PictureItem(FloatingItem):
866
870
 
867
871
  caption_text = ""
868
872
  if len(text) > 0:
869
- caption_text = f"<figcaption>{text}</figcaption>"
873
+ caption_text = get_html_tag_with_text_direction(
874
+ html_tag="figcaption", text=text
875
+ )
870
876
 
871
877
  default_response = f"<figure>{caption_text}</figure>"
872
878
 
@@ -1090,15 +1096,28 @@ class TableItem(FloatingItem):
1090
1096
  if colspan > 1:
1091
1097
  opening_tag += f' colspan="{colspan}"'
1092
1098
 
1099
+ text_dir = get_text_direction(content)
1100
+ if text_dir == "rtl":
1101
+ opening_tag += f' dir="{dir}"'
1102
+
1093
1103
  body += f"<{opening_tag}>{content}</{celltag}>"
1094
1104
  body += "</tr>"
1095
1105
 
1106
+ # dir = get_text_direction(text)
1107
+
1096
1108
  if len(text) > 0 and len(body) > 0:
1097
- body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
1109
+ caption_text = get_html_tag_with_text_direction(
1110
+ html_tag="caption", text=text
1111
+ )
1112
+ body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
1113
+
1098
1114
  elif len(text) == 0 and len(body) > 0:
1099
1115
  body = f"<table><tbody>{body}</tbody></table>"
1100
1116
  elif len(text) > 0 and len(body) == 0:
1101
- body = f"<table><caption>{text}</caption></table>"
1117
+ caption_text = get_html_tag_with_text_direction(
1118
+ html_tag="caption", text=text
1119
+ )
1120
+ body = f"<table>{caption_text}</table>"
1102
1121
  else:
1103
1122
  body = "<table></table>"
1104
1123
 
@@ -2470,17 +2489,17 @@ class DoclingDocument(BaseModel):
2470
2489
  continue
2471
2490
 
2472
2491
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2492
+ text_inner = _prepare_tag_content(item.text)
2493
+ text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
2473
2494
 
2474
- text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
2475
2495
  html_texts.append(text)
2476
2496
 
2477
2497
  elif isinstance(item, SectionHeaderItem):
2478
2498
 
2479
2499
  section_level: int = min(item.level + 1, 6)
2480
2500
 
2481
- text = (
2482
- f"<h{(section_level)}>"
2483
- f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
2501
+ text = get_html_tag_with_text_direction(
2502
+ html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
2484
2503
  )
2485
2504
  html_texts.append(text)
2486
2505
 
@@ -2544,13 +2563,15 @@ class DoclingDocument(BaseModel):
2544
2563
  )
2545
2564
 
2546
2565
  elif isinstance(item, ListItem):
2547
-
2548
- text = f"<li>{_prepare_tag_content(item.text)}</li>"
2566
+ text = get_html_tag_with_text_direction(
2567
+ html_tag="li", text=_prepare_tag_content(item.text)
2568
+ )
2549
2569
  html_texts.append(text)
2550
2570
 
2551
2571
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
2552
-
2553
- text = f"<li>{_prepare_tag_content(item.text)}</li>"
2572
+ text = get_html_tag_with_text_direction(
2573
+ html_tag="li", text=_prepare_tag_content(item.text)
2574
+ )
2554
2575
  html_texts.append(text)
2555
2576
 
2556
2577
  elif isinstance(item, CodeItem):
@@ -2562,8 +2583,11 @@ class DoclingDocument(BaseModel):
2562
2583
 
2563
2584
  elif isinstance(item, TextItem):
2564
2585
 
2565
- text = f"<p>{_prepare_tag_content(item.text)}</p>"
2586
+ text = get_html_tag_with_text_direction(
2587
+ html_tag="p", text=_prepare_tag_content(item.text)
2588
+ )
2566
2589
  html_texts.append(text)
2590
+
2567
2591
  elif isinstance(item, TableItem):
2568
2592
 
2569
2593
  text = item.export_to_html(doc=self, add_caption=True)
@@ -5,6 +5,7 @@
5
5
 
6
6
  """Utils for document types."""
7
7
 
8
+ import unicodedata
8
9
  from pathlib import Path
9
10
 
10
11
 
@@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
46
47
 
47
48
  # Combine and return the result
48
49
  return Path(*up_segments, *down_segments)
50
+
51
+
52
def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
    """Wrap ``text`` in an ``html_tag`` element, adding ``dir`` when needed.

    The direction is obtained from ``get_text_direction(text)``. When it is
    ``"ltr"`` the element is emitted without attributes; for any other value
    a ``dir="<direction>"`` attribute is added to the opening tag.

    ``text`` is inserted verbatim -- no escaping is performed here.

    Args:
        html_tag: Tag name without angle brackets, e.g. ``"p"`` or ``"h1"``.
        text: Content placed between the opening and closing tags.

    Returns:
        The serialized element as a string.
    """
    direction = get_text_direction(text)
    # Left-to-right is the HTML default, so only other directions are spelled out.
    dir_attribute = "" if direction == "ltr" else f' dir="{direction}"'
    return f"<{html_tag}{dir_attribute}>{text}</{html_tag}>"
60
+
61
+
62
def get_text_direction(text: str) -> str:
    """Determine the text direction of a given string as LTR or RTL script.

    Only characters with a *strong* bidirectional class are considered:
    ``L`` (left-to-right) and ``R`` / ``AL`` (right-to-left). Digits,
    whitespace, punctuation and other weak/neutral characters are ignored so
    that they can neither hide a leading RTL character nor dilute the
    majority vote.

    The text is reported as ``"rtl"`` when either

    * its first strong character is right-to-left, or
    * it contains more right-to-left than left-to-right strong characters.

    Args:
        text: The string to inspect.

    Returns:
        ``"rtl"`` or ``"ltr"``. Empty input and input without any strong
        character yield ``"ltr"``.
    """
    if not text:
        return "ltr"  # Default for empty input

    rtl_classes = {"R", "AL"}
    rtl_count = 0
    ltr_count = 0
    first_strong_is_rtl = None  # unknown until a strong character is seen

    for char in text:
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class in rtl_classes:
            is_rtl = True
            rtl_count += 1
        elif bidi_class == "L":
            is_rtl = False
            ltr_count += 1
        else:
            # Weak or neutral character: carries no direction of its own.
            continue

        if first_strong_is_rtl is None:
            first_strong_is_rtl = is_rtl

    if first_strong_is_rtl or rtl_count > ltr_count:
        return "rtl"
    return "ltr"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.17.1"
3
+ version = "2.17.2"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes