docling-core 2.36.0__tar.gz → 2.38.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (108)
  1. {docling_core-2.36.0 → docling_core-2.38.0}/PKG-INFO +1 -1
  2. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/hybrid_chunker.py +6 -3
  3. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/serializer/html.py +1 -1
  4. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/visualizer/layout_visualizer.py +2 -2
  5. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +66 -5
  6. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/visualizer/table_visualizer.py +109 -4
  7. docling_core-2.38.0/docling_core/types/doc/__init__.py +84 -0
  8. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/doc/document.py +187 -2
  9. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core.egg-info/PKG-INFO +1 -1
  10. {docling_core-2.36.0 → docling_core-2.38.0}/pyproject.toml +1 -1
  11. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_docling_doc.py +17 -0
  12. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_visualization.py +30 -1
  13. docling_core-2.36.0/docling_core/types/doc/__init__.py +0 -32
  14. {docling_core-2.36.0 → docling_core-2.38.0}/LICENSE +0 -0
  15. {docling_core-2.36.0 → docling_core-2.38.0}/README.md +0 -0
  16. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/__init__.py +0 -0
  17. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/cli/__init__.py +0 -0
  18. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/cli/view.py +0 -0
  19. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/experimental/__init__.py +0 -0
  20. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/py.typed +0 -0
  21. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  22. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  23. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  24. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  25. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  26. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  27. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  28. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  29. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/search/__init__.py +0 -0
  30. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  31. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/search/mapping.py +0 -0
  32. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/search/meta.py +0 -0
  33. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/search/package.py +0 -0
  34. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/__init__.py +0 -0
  35. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/__init__.py +0 -0
  36. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/base.py +0 -0
  37. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  38. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  39. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  40. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  41. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  42. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/serializer/__init__.py +0 -0
  43. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/serializer/base.py +0 -0
  44. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/serializer/common.py +0 -0
  45. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/serializer/doctags.py +0 -0
  46. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  47. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/serializer/markdown.py +0 -0
  48. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  49. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/transforms/visualizer/base.py +0 -0
  50. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/__init__.py +0 -0
  51. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/base.py +0 -0
  52. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/doc/base.py +0 -0
  53. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/doc/labels.py +0 -0
  54. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/doc/page.py +0 -0
  55. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/doc/tokens.py +0 -0
  56. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/doc/utils.py +0 -0
  57. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/gen/__init__.py +0 -0
  58. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/gen/generic.py +0 -0
  59. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/io/__init__.py +0 -0
  60. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  61. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/legacy_doc/base.py +0 -0
  62. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  63. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  64. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  65. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/legacy_doc/document.py +0 -0
  66. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  67. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/nlp/__init__.py +0 -0
  68. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/nlp/qa.py +0 -0
  69. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/nlp/qa_labels.py +0 -0
  70. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/rec/__init__.py +0 -0
  71. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/rec/attribute.py +0 -0
  72. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/rec/base.py +0 -0
  73. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/rec/predicate.py +0 -0
  74. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/rec/record.py +0 -0
  75. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/rec/statement.py +0 -0
  76. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/types/rec/subject.py +0 -0
  77. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/__init__.py +0 -0
  78. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/alias.py +0 -0
  79. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/file.py +0 -0
  80. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/generate_docs.py +0 -0
  81. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/generate_jsonschema.py +0 -0
  82. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/legacy.py +0 -0
  83. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/validate.py +0 -0
  84. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core/utils/validators.py +0 -0
  85. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core.egg-info/SOURCES.txt +0 -0
  86. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core.egg-info/dependency_links.txt +0 -0
  87. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core.egg-info/entry_points.txt +0 -0
  88. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core.egg-info/requires.txt +0 -0
  89. {docling_core-2.36.0 → docling_core-2.38.0}/docling_core.egg-info/top_level.txt +0 -0
  90. {docling_core-2.36.0 → docling_core-2.38.0}/setup.cfg +0 -0
  91. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_base.py +0 -0
  92. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_collection.py +0 -0
  93. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_data_gen_flag.py +0 -0
  94. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_doc_base.py +0 -0
  95. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_doc_legacy_convert.py +0 -0
  96. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_doc_schema.py +0 -0
  97. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_doc_schema_extractor.py +0 -0
  98. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_doctags_load.py +0 -0
  99. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_hierarchical_chunker.py +0 -0
  100. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_hybrid_chunker.py +0 -0
  101. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_json_schema_to_search_mapper.py +0 -0
  102. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_nlp_qa.py +0 -0
  103. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_otsl_table_export.py +0 -0
  104. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_page.py +0 -0
  105. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_rec_schema.py +0 -0
  106. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_search_meta.py +0 -0
  107. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_serialization.py +0 -0
  108. {docling_core-2.36.0 → docling_core-2.38.0}/test/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.36.0
3
+ Version: 2.38.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -234,10 +234,13 @@ class HybridChunker(BaseChunker):
234
234
  if available_length <= 0:
235
235
  warnings.warn(
236
236
  "Headers and captions for this chunk are longer than the total "
237
- "amount of size for the chunk, chunk will be ignored: "
238
- f"{doc_chunk.text=}"
237
+ "available size for the chunk, so they will be ignored: "
238
+ f"{doc_chunk.text=}, {doc_chunk.meta=}"
239
239
  )
240
- return []
240
+ new_chunk = DocChunk(**doc_chunk.export_json_dict())
241
+ new_chunk.meta.captions = None
242
+ new_chunk.meta.headings = None
243
+ return self._split_using_plain_text(doc_chunk=new_chunk)
241
244
  text = doc_chunk.text
242
245
  segments = sem_chunker.chunk(text)
243
246
  chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
@@ -340,7 +340,7 @@ class HTMLTableSerializer(BaseTableSerializer):
340
340
 
341
341
  content = html.escape(cell.text.strip())
342
342
  celltag = "td"
343
- if cell.column_header:
343
+ if cell.column_header or cell.row_header or cell.row_section:
344
344
  celltag = "th"
345
345
 
346
346
  opening_tag = f"{celltag}"
@@ -163,8 +163,8 @@ class LayoutVisualizer(BaseVisualizer):
163
163
  else:
164
164
  raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
165
165
 
166
- if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
167
- # complete previous drawing
166
+ if prev_page_nr is None or page_nr != prev_page_nr: # changing page
167
+ # dump previous drawing
168
168
  if prev_page_nr is not None and prev_image and clusters:
169
169
  self._draw_clusters(
170
170
  image=prev_image,
@@ -1,10 +1,11 @@
1
1
  """Define classes for reading order visualization."""
2
2
 
3
3
  from copy import deepcopy
4
- from typing import Optional
4
+ from typing import Optional, Union
5
5
 
6
- from PIL import ImageDraw
6
+ from PIL import ImageDraw, ImageFont
7
7
  from PIL.Image import Image
8
+ from PIL.ImageFont import FreeTypeFont
8
9
  from pydantic import BaseModel
9
10
  from typing_extensions import override
10
11
 
@@ -12,6 +13,11 @@ from docling_core.transforms.visualizer.base import BaseVisualizer
12
13
  from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
13
14
 
14
15
 
16
+ class _NumberDrawingData(BaseModel):
17
+ xy: tuple[float, float]
18
+ text: str
19
+
20
+
15
21
  class ReadingOrderVisualizer(BaseVisualizer):
16
22
  """Reading order visualizer."""
17
23
 
@@ -19,6 +25,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
19
25
  """Layout visualization parameters."""
20
26
 
21
27
  show_label: bool = True
28
+ show_branch_numbering: bool = False
22
29
  content_layers: set[ContentLayer] = {
23
30
  cl for cl in ContentLayer if cl != ContentLayer.BACKGROUND
24
31
  }
@@ -76,10 +83,17 @@ class ReadingOrderVisualizer(BaseVisualizer):
76
83
  images: Optional[dict[Optional[int], Image]] = None,
77
84
  ):
78
85
  """Draw the reading order."""
79
- # draw = ImageDraw.Draw(image)
86
+ font: Union[ImageFont.ImageFont, FreeTypeFont]
87
+ try:
88
+ font = ImageFont.truetype("arial.ttf", 12)
89
+ except OSError:
90
+ # Fallback to default font if arial is not available
91
+ font = ImageFont.load_default()
80
92
  x0, y0 = None, None
93
+ number_data_to_draw: dict[Optional[int], list[_NumberDrawingData]] = {}
81
94
  my_images: dict[Optional[int], Image] = images or {}
82
95
  prev_page = None
96
+ i = 0
83
97
  for elem, _ in doc.iterate_items(
84
98
  included_content_layers=self.params.content_layers,
85
99
  ):
@@ -92,7 +106,10 @@ class ReadingOrderVisualizer(BaseVisualizer):
92
106
  page_no = prov.page_no
93
107
  image = my_images.get(page_no)
94
108
 
95
- if image is None or prev_page is None or page_no > prev_page:
109
+ if page_no not in number_data_to_draw:
110
+ number_data_to_draw[page_no] = []
111
+
112
+ if image is None or prev_page is None or page_no != prev_page:
96
113
  # new page begins
97
114
  prev_page = page_no
98
115
  x0 = y0 = None
@@ -109,7 +126,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
109
126
  else:
110
127
  image = deepcopy(pil_img)
111
128
  my_images[page_no] = image
112
- draw = ImageDraw.Draw(image)
129
+ draw = ImageDraw.Draw(image, "RGBA")
113
130
 
114
131
  tlo_bbox = prov.bbox.to_top_left_origin(
115
132
  page_height=doc.pages[prov.page_no].size.height
@@ -124,9 +141,20 @@ class ReadingOrderVisualizer(BaseVisualizer):
124
141
  ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
125
142
 
126
143
  if x0 is None and y0 is None:
144
+ # is_root= True
127
145
  x0 = (ro_bbox.l + ro_bbox.r) / 2.0
128
146
  y0 = (ro_bbox.b + ro_bbox.t) / 2.0
147
+
148
+ number_data_to_draw[page_no].append(
149
+ _NumberDrawingData(
150
+ xy=(x0, y0),
151
+ text=f"{i}",
152
+ )
153
+ )
154
+ i += 1
155
+
129
156
  else:
157
+ # is_root = False
130
158
  assert x0 is not None
131
159
  assert y0 is not None
132
160
 
@@ -139,7 +167,40 @@ class ReadingOrderVisualizer(BaseVisualizer):
139
167
  line_width=2,
140
168
  color="red",
141
169
  )
170
+
142
171
  x0, y0 = x1, y1
172
+
173
+ if self.params.show_branch_numbering:
174
+ # post-drawing the numbers to ensure they are rendered on top-layer
175
+ for page in number_data_to_draw:
176
+ if (image := my_images.get(page)) is None:
177
+ continue
178
+ draw = ImageDraw.Draw(image, "RGBA")
179
+
180
+ for num_item in number_data_to_draw[page]:
181
+
182
+ text_bbox = draw.textbbox(num_item.xy, num_item.text, font)
183
+ text_bg_padding = 5
184
+ draw.ellipse(
185
+ [
186
+ (
187
+ text_bbox[0] - text_bg_padding,
188
+ text_bbox[1] - text_bg_padding,
189
+ ),
190
+ (
191
+ text_bbox[2] + text_bg_padding,
192
+ text_bbox[3] + text_bg_padding,
193
+ ),
194
+ ],
195
+ fill="orange",
196
+ )
197
+ draw.text(
198
+ num_item.xy,
199
+ text=num_item.text,
200
+ fill="black",
201
+ font=font,
202
+ )
203
+
143
204
  return my_images
144
205
 
145
206
  @override
@@ -23,8 +23,23 @@ class TableVisualizer(BaseVisualizer):
23
23
 
24
24
  # show_Label: bool = False
25
25
  show_cells: bool = True
26
- # show_rows: bool = False
27
- # show_cols: bool = False
26
+ show_rows: bool = False
27
+ show_cols: bool = False
28
+
29
+ cell_color: tuple[int, int, int, int] = (256, 0, 0, 32)
30
+ cell_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
31
+
32
+ row_color: tuple[int, int, int, int] = (256, 0, 0, 32)
33
+ row_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
34
+
35
+ row_header_color: tuple[int, int, int, int] = (0, 256, 0, 32)
36
+ row_header_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
37
+
38
+ col_color: tuple[int, int, int, int] = (0, 256, 0, 32)
39
+ col_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
40
+
41
+ col_header_color: tuple[int, int, int, int] = (0, 0, 256, 32)
42
+ col_header_outline: tuple[int, int, int, int] = (0, 0, 256, 128)
28
43
 
29
44
  base_visualizer: Optional[BaseVisualizer] = None
30
45
  params: Params = Params()
@@ -45,7 +60,21 @@ class TableVisualizer(BaseVisualizer):
45
60
 
46
61
  tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
47
62
 
48
- cell_color = (256, 0, 0, 32) # Transparent black for cells
63
+ cell_color = self.params.cell_color # Transparent black for cells
64
+ cell_outline = self.params.cell_outline
65
+ if cell.column_header:
66
+ cell_color = (
67
+ self.params.col_header_color
68
+ ) # Transparent black for cells
69
+ cell_outline = self.params.col_header_outline
70
+ if cell.row_header:
71
+ cell_color = (
72
+ self.params.row_header_color
73
+ ) # Transparent black for cells
74
+ cell_outline = self.params.row_header_outline
75
+ if cell.row_section:
76
+ cell_color = self.params.row_header_color
77
+ cell_outline = self.params.row_header_outline
49
78
 
50
79
  cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
51
80
  cx0 *= scale_x
@@ -55,10 +84,68 @@ class TableVisualizer(BaseVisualizer):
55
84
 
56
85
  draw.rectangle(
57
86
  [(cx0, cy0), (cx1, cy1)],
58
- outline=(256, 0, 0, 128),
87
+ outline=cell_outline,
59
88
  fill=cell_color,
60
89
  )
61
90
 
91
+ def _draw_table_rows(
92
+ self,
93
+ table: TableItem,
94
+ page_image: Image,
95
+ page_height: float,
96
+ scale_x: float,
97
+ scale_y: float,
98
+ ):
99
+ """Draw individual table cells."""
100
+ draw = ImageDraw.Draw(page_image, "RGBA")
101
+
102
+ rows = table.data.get_row_bounding_boxes()
103
+
104
+ for rid, bbox in rows.items():
105
+
106
+ tl_bbox = bbox.to_top_left_origin(page_height=page_height)
107
+
108
+ cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
109
+ cx0 *= scale_x
110
+ cx1 *= scale_x
111
+ cy0 *= scale_y
112
+ cy1 *= scale_y
113
+
114
+ draw.rectangle(
115
+ [(cx0, cy0), (cx1, cy1)],
116
+ outline=self.params.row_outline,
117
+ fill=self.params.row_color,
118
+ )
119
+
120
+ def _draw_table_cols(
121
+ self,
122
+ table: TableItem,
123
+ page_image: Image,
124
+ page_height: float,
125
+ scale_x: float,
126
+ scale_y: float,
127
+ ):
128
+ """Draw individual table cells."""
129
+ draw = ImageDraw.Draw(page_image, "RGBA")
130
+
131
+ cols = table.data.get_column_bounding_boxes()
132
+
133
+ for cid, bbox in cols.items():
134
+
135
+ tl_bbox = bbox.to_top_left_origin(page_height=page_height)
136
+
137
+ cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
138
+ cx0 *= scale_x
139
+ cx1 *= scale_x
140
+ cy0 *= scale_y
141
+ cy1 *= scale_y
142
+
143
+ draw.rectangle(
144
+ [(cx0, cy0), (cx1, cy1)],
145
+ outline=self.params.col_outline,
146
+ fill=self.params.col_color,
147
+ )
148
+
62
149
  def _draw_doc_tables(
63
150
  self,
64
151
  doc: DoclingDocument,
@@ -108,6 +195,24 @@ class TableVisualizer(BaseVisualizer):
108
195
  scale_y=image.height / doc.pages[page_nr].size.height,
109
196
  )
110
197
 
198
+ if self.params.show_rows:
199
+ self._draw_table_rows(
200
+ table=elem,
201
+ page_height=doc.pages[page_nr].size.height,
202
+ page_image=image,
203
+ scale_x=image.width / doc.pages[page_nr].size.width,
204
+ scale_y=image.height / doc.pages[page_nr].size.height,
205
+ )
206
+
207
+ if self.params.show_cols:
208
+ self._draw_table_cols(
209
+ table=elem,
210
+ page_height=doc.pages[page_nr].size.height,
211
+ page_image=image,
212
+ scale_x=image.width / doc.pages[page_nr].size.width,
213
+ scale_y=image.height / doc.pages[page_nr].size.height,
214
+ )
215
+
111
216
  else:
112
217
  raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
113
218
 
@@ -0,0 +1,84 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Package for models defined by the Document type."""
7
+
8
+ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
+ from .document import (
10
+ BaseAnnotation,
11
+ ChartBar,
12
+ ChartLine,
13
+ ChartPoint,
14
+ ChartSlice,
15
+ ChartStackedBar,
16
+ CodeItem,
17
+ ContentLayer,
18
+ DescriptionAnnotation,
19
+ DocItem,
20
+ DoclingDocument,
21
+ DocTagsDocument,
22
+ DocTagsPage,
23
+ DocumentOrigin,
24
+ FloatingItem,
25
+ Formatting,
26
+ FormItem,
27
+ FormulaItem,
28
+ GraphCell,
29
+ GraphData,
30
+ GraphLink,
31
+ GroupItem,
32
+ ImageRef,
33
+ InlineGroup,
34
+ KeyValueItem,
35
+ ListItem,
36
+ MiscAnnotation,
37
+ NodeItem,
38
+ OrderedList,
39
+ PageItem,
40
+ PictureBarChartData,
41
+ PictureChartData,
42
+ PictureClassificationClass,
43
+ PictureClassificationData,
44
+ PictureDataType,
45
+ PictureItem,
46
+ PictureLineChartData,
47
+ PictureMoleculeData,
48
+ PicturePieChartData,
49
+ PictureScatterChartData,
50
+ PictureStackedBarChartData,
51
+ PictureTabularChartData,
52
+ ProvenanceItem,
53
+ RefItem,
54
+ Script,
55
+ SectionHeaderItem,
56
+ TableCell,
57
+ TableData,
58
+ TableItem,
59
+ TextItem,
60
+ TitleItem,
61
+ UnorderedList,
62
+ )
63
+ from .labels import (
64
+ CodeLanguageLabel,
65
+ DocItemLabel,
66
+ GraphCellLabel,
67
+ GraphLinkLabel,
68
+ GroupLabel,
69
+ PictureClassificationLabel,
70
+ TableCellLabel,
71
+ )
72
+ from .page import (
73
+ BoundingRectangle,
74
+ ColorMixin,
75
+ ColorRGBA,
76
+ Coord2D,
77
+ OrderedElement,
78
+ PdfCellRenderingMode,
79
+ PdfPageBoundaryType,
80
+ TextCell,
81
+ TextCellUnit,
82
+ TextDirection,
83
+ )
84
+ from .tokens import DocumentToken, TableToken
@@ -38,7 +38,7 @@ from typing_extensions import Annotated, Self, deprecated
38
38
  from docling_core.search.package import VERSION_PATTERN
39
39
  from docling_core.types.base import _JSON_POINTER_REGEX
40
40
  from docling_core.types.doc import BoundingBox, Size
41
- from docling_core.types.doc.base import ImageRefMode
41
+ from docling_core.types.doc.base import CoordOrigin, ImageRefMode
42
42
  from docling_core.types.doc.labels import (
43
43
  CodeLanguageLabel,
44
44
  DocItemLabel,
@@ -372,6 +372,119 @@ class TableData(BaseModel): # TBD
372
372
 
373
373
  return table_data
374
374
 
375
+ def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
376
+ """Get the minimal bounding box for each row in the table.
377
+
378
+ Returns:
379
+ dict[int, BoundingBox]: A mapping from row index to the bounding box
380
+ enclosing that row's cells; rows in which no cell has a bounding box
381
+ are omitted from the mapping.
382
+ """
383
+ coords = []
384
+ for cell in self.table_cells:
385
+ if cell.bbox is not None:
386
+ coords.append(cell.bbox.coord_origin)
387
+
388
+ if len(set(coords)) > 1:
389
+ raise ValueError(
390
+ "All bounding boxes must have the same \
391
+ CoordOrigin to compute their union."
392
+ )
393
+
394
+ row_bboxes: dict[int, BoundingBox] = {}
395
+
396
+ for row_idx in range(self.num_rows):
397
+ row_cells_with_bbox: dict[int, list[BoundingBox]] = {}
398
+
399
+ # Collect all cells in this row that have bounding boxes
400
+ for cell in self.table_cells:
401
+
402
+ if (
403
+ cell.bbox is not None
404
+ and cell.start_row_offset_idx <= row_idx < cell.end_row_offset_idx
405
+ ):
406
+
407
+ row_span = cell.end_row_offset_idx - cell.start_row_offset_idx
408
+ if row_span in row_cells_with_bbox:
409
+ row_cells_with_bbox[row_span].append(cell.bbox)
410
+ else:
411
+ row_cells_with_bbox[row_span] = [cell.bbox]
412
+
413
+ # Calculate the enclosing bounding box for this row
414
+ if len(row_cells_with_bbox) > 0:
415
+ min_row_span = min(row_cells_with_bbox.keys())
416
+ row_bbox: BoundingBox = BoundingBox.enclosing_bbox(
417
+ row_cells_with_bbox[min_row_span]
418
+ )
419
+
420
+ for rspan, bboxs in row_cells_with_bbox.items():
421
+ for bbox in bboxs:
422
+ row_bbox.l = min(row_bbox.l, bbox.l)
423
+ row_bbox.r = max(row_bbox.r, bbox.r)
424
+
425
+ row_bboxes[row_idx] = row_bbox
426
+
427
+ return row_bboxes
428
+
429
+ def get_column_bounding_boxes(self) -> dict[int, BoundingBox]:
430
+ """Get the minimal bounding box for each column in the table.
431
+
432
+ Returns:
433
+ dict[int, BoundingBox]: A mapping from column index to the bounding box
434
+ enclosing that column's cells; columns in which no cell has a bounding
435
+ box are omitted from the mapping.
436
+ """
437
+ coords = []
438
+ for cell in self.table_cells:
439
+ if cell.bbox is not None:
440
+ coords.append(cell.bbox.coord_origin)
441
+
442
+ if len(set(coords)) > 1:
443
+ raise ValueError(
444
+ "All bounding boxes must have the same \
445
+ CoordOrigin to compute their union."
446
+ )
447
+
448
+ col_bboxes: dict[int, BoundingBox] = {}
449
+
450
+ for col_idx in range(self.num_cols):
451
+ col_cells_with_bbox: dict[int, list[BoundingBox]] = {}
452
+
453
+ # Collect all cells in this column that have bounding boxes
454
+ for cell in self.table_cells:
455
+
456
+ if (
457
+ cell.bbox is not None
458
+ and cell.start_col_offset_idx <= col_idx < cell.end_col_offset_idx
459
+ ):
460
+
461
+ col_span = cell.end_col_offset_idx - cell.start_col_offset_idx
462
+ if col_span in col_cells_with_bbox:
463
+ col_cells_with_bbox[col_span].append(cell.bbox)
464
+ else:
465
+ col_cells_with_bbox[col_span] = [cell.bbox]
466
+
467
+ # Calculate the enclosing bounding box for this column
468
+ if len(col_cells_with_bbox) > 0:
469
+ min_col_span = min(col_cells_with_bbox.keys())
470
+ col_bbox: BoundingBox = BoundingBox.enclosing_bbox(
471
+ col_cells_with_bbox[min_col_span]
472
+ )
473
+
474
+ for rspan, bboxs in col_cells_with_bbox.items():
475
+ for bbox in bboxs:
476
+ if bbox.coord_origin == CoordOrigin.TOPLEFT:
477
+ col_bbox.b = max(col_bbox.b, bbox.b)
478
+ col_bbox.t = min(col_bbox.t, bbox.t)
479
+
480
+ elif bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
481
+ col_bbox.b = min(col_bbox.b, bbox.b)
482
+ col_bbox.t = max(col_bbox.t, bbox.t)
483
+
484
+ col_bboxes[col_idx] = col_bbox
485
+
486
+ return col_bboxes
487
+
375
488
 
376
489
  class PictureTabularChartData(PictureChartData):
377
490
  """Base class for picture chart data.
@@ -4056,6 +4169,7 @@ class DoclingDocument(BaseModel):
4056
4169
  add_table_cell_location: bool = False,
4057
4170
  add_table_cell_text: bool = True,
4058
4171
  minified: bool = False,
4172
+ pages: Optional[set[int]] = None,
4059
4173
  ) -> str:
4060
4174
  r"""Exports the document content to a DocumentToken format.
4061
4175
 
@@ -4074,6 +4188,7 @@ class DoclingDocument(BaseModel):
4074
4188
  :param add_table_cell_location: bool: (Default value = False)
4075
4189
  :param add_table_cell_text: bool: (Default value = True)
4076
4190
  :param minified: bool: (Default value = False)
4191
+ :param pages: set[int]: (Default value = None)
4077
4192
  :returns: The content of the document formatted as a DocTags string.
4078
4193
  :rtype: str
4079
4194
  """
@@ -4098,6 +4213,7 @@ class DoclingDocument(BaseModel):
4098
4213
  add_page_break=add_page_index,
4099
4214
  add_table_cell_location=add_table_cell_location,
4100
4215
  add_table_cell_text=add_table_cell_text,
4216
+ pages=pages,
4101
4217
  mode=(
4102
4218
  DocTagsParams.Mode.MINIFIED
4103
4219
  if minified
@@ -4237,7 +4353,9 @@ class DoclingDocument(BaseModel):
4237
4353
  return pitem
4238
4354
 
4239
4355
  def get_visualization(
4240
- self, show_label: bool = True
4356
+ self,
4357
+ show_label: bool = True,
4358
+ show_branch_numbering: bool = False,
4241
4359
  ) -> dict[Optional[int], PILImage.Image]:
4242
4360
  """Get visualization of the document as images by page."""
4243
4361
  from docling_core.transforms.visualizer.layout_visualizer import (
@@ -4253,6 +4371,9 @@ class DoclingDocument(BaseModel):
4253
4371
  show_label=show_label,
4254
4372
  ),
4255
4373
  ),
4374
+ params=ReadingOrderVisualizer.Params(
4375
+ show_branch_numbering=show_branch_numbering,
4376
+ ),
4256
4377
  )
4257
4378
  images = visualizer.get_visualization(doc=self)
4258
4379
 
@@ -4343,3 +4464,67 @@ class DoclingDocument(BaseModel):
4343
4464
  hyperlink=li.hyperlink,
4344
4465
  )
4345
4466
  return self
4467
+
4468
+ def _normalize_references(self) -> None:
4469
+ """Normalize ref numbering by ordering node items as per iterate_items()."""
4470
+ new_body = GroupItem(**self.body.model_dump(exclude={"children"}))
4471
+
4472
+ item_lists: dict[str, list[NodeItem]] = {
4473
+ "groups": [],
4474
+ "texts": [],
4475
+ "pictures": [],
4476
+ "tables": [],
4477
+ "key_value_items": [],
4478
+ "form_items": [],
4479
+ }
4480
+ orig_ref_to_new_ref: dict[str, str] = {}
4481
+
4482
+ # collect items in traversal order
4483
+ for item, _ in self.iterate_items(
4484
+ with_groups=True,
4485
+ traverse_pictures=True,
4486
+ included_content_layers={c for c in ContentLayer},
4487
+ ):
4488
+ key = item.self_ref.split("/")[1]
4489
+ is_body = key == "body"
4490
+ new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}"
4491
+ # register cref mapping:
4492
+ orig_ref_to_new_ref[item.self_ref] = new_cref
4493
+
4494
+ if not is_body:
4495
+ new_item = copy.deepcopy(item)
4496
+ new_item.children = []
4497
+
4498
+ # put item in the right list
4499
+ item_lists[key].append(new_item)
4500
+
4501
+ # update item's self reference
4502
+ new_item.self_ref = new_cref
4503
+
4504
+ if item.parent:
4505
+ # set item's parent
4506
+ new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
4507
+ new_item.parent = RefItem(cref=new_parent_cref)
4508
+
4509
+ # add item to parent's children
4510
+ path_components = new_parent_cref.split("/")
4511
+ num_components = len(path_components)
4512
+ parent_node: NodeItem
4513
+ if num_components == 3:
4514
+ _, parent_key, parent_index_str = path_components
4515
+ parent_index = int(parent_index_str)
4516
+ parent_node = item_lists[parent_key][parent_index]
4517
+ elif num_components == 2 and path_components[1] == "body":
4518
+ parent_node = new_body
4519
+ else:
4520
+ raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
4521
+ parent_node.children.append(RefItem(cref=new_cref))
4522
+
4523
+ # update document
4524
+ self.groups = item_lists["groups"] # type: ignore
4525
+ self.texts = item_lists["texts"] # type: ignore
4526
+ self.pictures = item_lists["pictures"] # type: ignore
4527
+ self.tables = item_lists["tables"] # type: ignore
4528
+ self.key_value_items = item_lists["key_value_items"] # type: ignore
4529
+ self.form_items = item_lists["form_items"] # type: ignore
4530
+ self.body = new_body
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.36.0
3
+ Version: 2.38.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling-core"
3
- version = "2.36.0" # DO NOT EDIT, updated automatically
3
+ version = "2.38.0" # DO NOT EDIT, updated automatically
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
@@ -720,6 +720,15 @@ def _test_export_methods(
720
720
  dt_min_pred = doc.export_to_doctags(minified=True)
721
721
  _verify_regression_test(dt_min_pred, filename=filename, ext="min.dt")
722
722
 
723
+ # Test pages parameter in DocTags export
724
+ if doc.pages: # Only test if document has pages
725
+ first_page = min(doc.pages.keys())
726
+ second_page = first_page + 1
727
+ if second_page in doc.pages: # Only test if document has at least 2 pages
728
+ dt_pages_pred = doc.export_to_doctags(pages={first_page, second_page})
729
+ print(dt_pages_pred)
730
+ _verify_regression_test(dt_pages_pred, filename=filename, ext="pages.dt")
731
+
723
732
  # Test Tables export ...
724
733
  for table in doc.tables:
725
734
  table.export_to_markdown()
@@ -1636,3 +1645,11 @@ def test_misplaced_list_items():
1636
1645
  else:
1637
1646
  exp_doc = DoclingDocument.load_from_yaml(exp_file)
1638
1647
  assert doc == exp_doc
1648
+
1649
+ doc._normalize_references()
1650
+ exp_file = filename.parent / f"{filename.stem}.norm.out.yaml"
1651
+ if GEN_TEST_DATA:
1652
+ doc.save_as_yaml(exp_file)
1653
+ else:
1654
+ exp_doc = DoclingDocument.load_from_yaml(exp_file)
1655
+ assert doc == exp_doc
@@ -55,7 +55,7 @@ def test_doc_visualization_no_label():
55
55
  )
56
56
 
57
57
 
58
- def test_table_visualization_no_label():
58
+ def test_table_visualization_for_cells():
59
59
  src = Path("./test/data/doc/2408.09869v3_enriched.json")
60
60
  doc = DoclingDocument.load_from_json(src)
61
61
 
@@ -66,3 +66,32 @@ def test_table_visualization_no_label():
66
66
  exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_table_viz_wout_lbl_p5.png",
67
67
  actual=viz_pages[5],
68
68
  )
69
+
70
+
71
+ def test_table_visualization_for_rows_and_cols():
72
+ src = Path("./test/data/doc/2408.09869v3_enriched.json")
73
+ doc = DoclingDocument.load_from_json(src)
74
+
75
+ visualizer = TableVisualizer(
76
+ params=TableVisualizer.Params(show_cells=False, show_rows=True, show_cols=True)
77
+ )
78
+ viz_pages = visualizer.get_visualization(doc=doc)
79
+
80
+ verify(
81
+ exp_file=VIZ_TEST_DATA_PATH
82
+ / f"{src.stem}_table_viz_wout_lbl_p5_rows_and_cols.png",
83
+ actual=viz_pages[5],
84
+ )
85
+
86
+
87
+ def test_cross_page_lists_with_branch_nums():
88
+ src = Path("./test/data/doc/cross_page_lists.json")
89
+ doc = DoclingDocument.load_from_json(src)
90
+
91
+ viz_pages = doc.get_visualization(show_branch_numbering=True)
92
+
93
+ for i in range(2):
94
+ verify(
95
+ exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_p{i+1}.png",
96
+ actual=viz_pages[i + 1],
97
+ )
@@ -1,32 +0,0 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
- """Package for models defined by the Document type."""
7
-
8
- from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
- from .document import (
10
- CodeItem,
11
- DocItem,
12
- DoclingDocument,
13
- DocumentOrigin,
14
- FloatingItem,
15
- GroupItem,
16
- ImageRef,
17
- KeyValueItem,
18
- NodeItem,
19
- PageItem,
20
- PictureClassificationClass,
21
- PictureClassificationData,
22
- PictureDataType,
23
- PictureItem,
24
- ProvenanceItem,
25
- RefItem,
26
- SectionHeaderItem,
27
- TableCell,
28
- TableData,
29
- TableItem,
30
- TextItem,
31
- )
32
- from .labels import DocItemLabel, GroupLabel, TableCellLabel
File without changes
File without changes
File without changes