docling-core 2.37.0__py3-none-any.whl → 2.38.0__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -234,10 +234,13 @@ class HybridChunker(BaseChunker):
234
234
  if available_length <= 0:
235
235
  warnings.warn(
236
236
  "Headers and captions for this chunk are longer than the total "
237
- "amount of size for the chunk, chunk will be ignored: "
238
- f"{doc_chunk.text=}"
237
+ "available size for the chunk, so they will be ignored: "
238
+ f"{doc_chunk.text=}, {doc_chunk.meta=}"
239
239
  )
240
- return []
240
+ new_chunk = DocChunk(**doc_chunk.export_json_dict())
241
+ new_chunk.meta.captions = None
242
+ new_chunk.meta.headings = None
243
+ return self._split_using_plain_text(doc_chunk=new_chunk)
241
244
  text = doc_chunk.text
242
245
  segments = sem_chunker.chunk(text)
243
246
  chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
@@ -163,8 +163,8 @@ class LayoutVisualizer(BaseVisualizer):
163
163
  else:
164
164
  raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
165
165
 
166
- if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
167
- # complete previous drawing
166
+ if prev_page_nr is None or page_nr != prev_page_nr: # changing page
167
+ # dump previous drawing
168
168
  if prev_page_nr is not None and prev_image and clusters:
169
169
  self._draw_clusters(
170
170
  image=prev_image,
@@ -1,10 +1,11 @@
1
1
  """Define classes for reading order visualization."""
2
2
 
3
3
  from copy import deepcopy
4
- from typing import Optional
4
+ from typing import Optional, Union
5
5
 
6
- from PIL import ImageDraw
6
+ from PIL import ImageDraw, ImageFont
7
7
  from PIL.Image import Image
8
+ from PIL.ImageFont import FreeTypeFont
8
9
  from pydantic import BaseModel
9
10
  from typing_extensions import override
10
11
 
@@ -12,6 +13,11 @@ from docling_core.transforms.visualizer.base import BaseVisualizer
12
13
  from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
13
14
 
14
15
 
16
+ class _NumberDrawingData(BaseModel):
17
+ xy: tuple[float, float]
18
+ text: str
19
+
20
+
15
21
  class ReadingOrderVisualizer(BaseVisualizer):
16
22
  """Reading order visualizer."""
17
23
 
@@ -19,6 +25,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
19
25
  """Layout visualization parameters."""
20
26
 
21
27
  show_label: bool = True
28
+ show_branch_numbering: bool = False
22
29
  content_layers: set[ContentLayer] = {
23
30
  cl for cl in ContentLayer if cl != ContentLayer.BACKGROUND
24
31
  }
@@ -76,10 +83,17 @@ class ReadingOrderVisualizer(BaseVisualizer):
76
83
  images: Optional[dict[Optional[int], Image]] = None,
77
84
  ):
78
85
  """Draw the reading order."""
79
- # draw = ImageDraw.Draw(image)
86
+ font: Union[ImageFont.ImageFont, FreeTypeFont]
87
+ try:
88
+ font = ImageFont.truetype("arial.ttf", 12)
89
+ except OSError:
90
+ # Fallback to default font if arial is not available
91
+ font = ImageFont.load_default()
80
92
  x0, y0 = None, None
93
+ number_data_to_draw: dict[Optional[int], list[_NumberDrawingData]] = {}
81
94
  my_images: dict[Optional[int], Image] = images or {}
82
95
  prev_page = None
96
+ i = 0
83
97
  for elem, _ in doc.iterate_items(
84
98
  included_content_layers=self.params.content_layers,
85
99
  ):
@@ -92,7 +106,10 @@ class ReadingOrderVisualizer(BaseVisualizer):
92
106
  page_no = prov.page_no
93
107
  image = my_images.get(page_no)
94
108
 
95
- if image is None or prev_page is None or page_no > prev_page:
109
+ if page_no not in number_data_to_draw:
110
+ number_data_to_draw[page_no] = []
111
+
112
+ if image is None or prev_page is None or page_no != prev_page:
96
113
  # new page begins
97
114
  prev_page = page_no
98
115
  x0 = y0 = None
@@ -109,7 +126,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
109
126
  else:
110
127
  image = deepcopy(pil_img)
111
128
  my_images[page_no] = image
112
- draw = ImageDraw.Draw(image)
129
+ draw = ImageDraw.Draw(image, "RGBA")
113
130
 
114
131
  tlo_bbox = prov.bbox.to_top_left_origin(
115
132
  page_height=doc.pages[prov.page_no].size.height
@@ -124,9 +141,20 @@ class ReadingOrderVisualizer(BaseVisualizer):
124
141
  ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
125
142
 
126
143
  if x0 is None and y0 is None:
144
+ # is_root= True
127
145
  x0 = (ro_bbox.l + ro_bbox.r) / 2.0
128
146
  y0 = (ro_bbox.b + ro_bbox.t) / 2.0
147
+
148
+ number_data_to_draw[page_no].append(
149
+ _NumberDrawingData(
150
+ xy=(x0, y0),
151
+ text=f"{i}",
152
+ )
153
+ )
154
+ i += 1
155
+
129
156
  else:
157
+ # is_root = False
130
158
  assert x0 is not None
131
159
  assert y0 is not None
132
160
 
@@ -139,7 +167,40 @@ class ReadingOrderVisualizer(BaseVisualizer):
139
167
  line_width=2,
140
168
  color="red",
141
169
  )
170
+
142
171
  x0, y0 = x1, y1
172
+
173
+ if self.params.show_branch_numbering:
174
+ # post-drawing the numbers to ensure they are rendered on top-layer
175
+ for page in number_data_to_draw:
176
+ if (image := my_images.get(page)) is None:
177
+ continue
178
+ draw = ImageDraw.Draw(image, "RGBA")
179
+
180
+ for num_item in number_data_to_draw[page]:
181
+
182
+ text_bbox = draw.textbbox(num_item.xy, num_item.text, font)
183
+ text_bg_padding = 5
184
+ draw.ellipse(
185
+ [
186
+ (
187
+ text_bbox[0] - text_bg_padding,
188
+ text_bbox[1] - text_bg_padding,
189
+ ),
190
+ (
191
+ text_bbox[2] + text_bg_padding,
192
+ text_bbox[3] + text_bg_padding,
193
+ ),
194
+ ],
195
+ fill="orange",
196
+ )
197
+ draw.text(
198
+ num_item.xy,
199
+ text=num_item.text,
200
+ fill="black",
201
+ font=font,
202
+ )
203
+
143
204
  return my_images
144
205
 
145
206
  @override
@@ -7,26 +7,78 @@
7
7
 
8
8
  from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
+ BaseAnnotation,
11
+ ChartBar,
12
+ ChartLine,
13
+ ChartPoint,
14
+ ChartSlice,
15
+ ChartStackedBar,
10
16
  CodeItem,
17
+ ContentLayer,
18
+ DescriptionAnnotation,
11
19
  DocItem,
12
20
  DoclingDocument,
21
+ DocTagsDocument,
22
+ DocTagsPage,
13
23
  DocumentOrigin,
14
24
  FloatingItem,
25
+ Formatting,
26
+ FormItem,
27
+ FormulaItem,
28
+ GraphCell,
29
+ GraphData,
30
+ GraphLink,
15
31
  GroupItem,
16
32
  ImageRef,
33
+ InlineGroup,
17
34
  KeyValueItem,
35
+ ListItem,
36
+ MiscAnnotation,
18
37
  NodeItem,
38
+ OrderedList,
19
39
  PageItem,
40
+ PictureBarChartData,
41
+ PictureChartData,
20
42
  PictureClassificationClass,
21
43
  PictureClassificationData,
22
44
  PictureDataType,
23
45
  PictureItem,
46
+ PictureLineChartData,
47
+ PictureMoleculeData,
48
+ PicturePieChartData,
49
+ PictureScatterChartData,
50
+ PictureStackedBarChartData,
51
+ PictureTabularChartData,
24
52
  ProvenanceItem,
25
53
  RefItem,
54
+ Script,
26
55
  SectionHeaderItem,
27
56
  TableCell,
28
57
  TableData,
29
58
  TableItem,
30
59
  TextItem,
60
+ TitleItem,
61
+ UnorderedList,
31
62
  )
32
- from .labels import DocItemLabel, GroupLabel, TableCellLabel
63
+ from .labels import (
64
+ CodeLanguageLabel,
65
+ DocItemLabel,
66
+ GraphCellLabel,
67
+ GraphLinkLabel,
68
+ GroupLabel,
69
+ PictureClassificationLabel,
70
+ TableCellLabel,
71
+ )
72
+ from .page import (
73
+ BoundingRectangle,
74
+ ColorMixin,
75
+ ColorRGBA,
76
+ Coord2D,
77
+ OrderedElement,
78
+ PdfCellRenderingMode,
79
+ PdfPageBoundaryType,
80
+ TextCell,
81
+ TextCellUnit,
82
+ TextDirection,
83
+ )
84
+ from .tokens import DocumentToken, TableToken
@@ -4169,6 +4169,7 @@ class DoclingDocument(BaseModel):
4169
4169
  add_table_cell_location: bool = False,
4170
4170
  add_table_cell_text: bool = True,
4171
4171
  minified: bool = False,
4172
+ pages: Optional[set[int]] = None,
4172
4173
  ) -> str:
4173
4174
  r"""Exports the document content to a DocumentToken format.
4174
4175
 
@@ -4187,6 +4188,7 @@ class DoclingDocument(BaseModel):
4187
4188
  :param # table specific flagsadd_table_cell_location: bool
4188
4189
  :param add_table_cell_text: bool: (Default value = True)
4189
4190
  :param minified: bool: (Default value = False)
4191
+ :param pages: set[int]: (Default value = None)
4190
4192
  :returns: The content of the document formatted as a DocTags string.
4191
4193
  :rtype: str
4192
4194
  """
@@ -4211,6 +4213,7 @@ class DoclingDocument(BaseModel):
4211
4213
  add_page_break=add_page_index,
4212
4214
  add_table_cell_location=add_table_cell_location,
4213
4215
  add_table_cell_text=add_table_cell_text,
4216
+ pages=pages,
4214
4217
  mode=(
4215
4218
  DocTagsParams.Mode.MINIFIED
4216
4219
  if minified
@@ -4350,7 +4353,9 @@ class DoclingDocument(BaseModel):
4350
4353
  return pitem
4351
4354
 
4352
4355
  def get_visualization(
4353
- self, show_label: bool = True
4356
+ self,
4357
+ show_label: bool = True,
4358
+ show_branch_numbering: bool = False,
4354
4359
  ) -> dict[Optional[int], PILImage.Image]:
4355
4360
  """Get visualization of the document as images by page."""
4356
4361
  from docling_core.transforms.visualizer.layout_visualizer import (
@@ -4366,6 +4371,9 @@ class DoclingDocument(BaseModel):
4366
4371
  show_label=show_label,
4367
4372
  ),
4368
4373
  ),
4374
+ params=ReadingOrderVisualizer.Params(
4375
+ show_branch_numbering=show_branch_numbering,
4376
+ ),
4369
4377
  )
4370
4378
  images = visualizer.get_visualization(doc=self)
4371
4379
 
@@ -4456,3 +4464,67 @@ class DoclingDocument(BaseModel):
4456
4464
  hyperlink=li.hyperlink,
4457
4465
  )
4458
4466
  return self
4467
+
4468
+ def _normalize_references(self) -> None:
4469
+ """Normalize ref numbering by ordering node items as per iterate_items()."""
4470
+ new_body = GroupItem(**self.body.model_dump(exclude={"children"}))
4471
+
4472
+ item_lists: dict[str, list[NodeItem]] = {
4473
+ "groups": [],
4474
+ "texts": [],
4475
+ "pictures": [],
4476
+ "tables": [],
4477
+ "key_value_items": [],
4478
+ "form_items": [],
4479
+ }
4480
+ orig_ref_to_new_ref: dict[str, str] = {}
4481
+
4482
+ # collect items in traversal order
4483
+ for item, _ in self.iterate_items(
4484
+ with_groups=True,
4485
+ traverse_pictures=True,
4486
+ included_content_layers={c for c in ContentLayer},
4487
+ ):
4488
+ key = item.self_ref.split("/")[1]
4489
+ is_body = key == "body"
4490
+ new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}"
4491
+ # register cref mapping:
4492
+ orig_ref_to_new_ref[item.self_ref] = new_cref
4493
+
4494
+ if not is_body:
4495
+ new_item = copy.deepcopy(item)
4496
+ new_item.children = []
4497
+
4498
+ # put item in the right list
4499
+ item_lists[key].append(new_item)
4500
+
4501
+ # update item's self reference
4502
+ new_item.self_ref = new_cref
4503
+
4504
+ if item.parent:
4505
+ # set item's parent
4506
+ new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
4507
+ new_item.parent = RefItem(cref=new_parent_cref)
4508
+
4509
+ # add item to parent's children
4510
+ path_components = new_parent_cref.split("/")
4511
+ num_components = len(path_components)
4512
+ parent_node: NodeItem
4513
+ if num_components == 3:
4514
+ _, parent_key, parent_index_str = path_components
4515
+ parent_index = int(parent_index_str)
4516
+ parent_node = item_lists[parent_key][parent_index]
4517
+ elif num_components == 2 and path_components[1] == "body":
4518
+ parent_node = new_body
4519
+ else:
4520
+ raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
4521
+ parent_node.children.append(RefItem(cref=new_cref))
4522
+
4523
+ # update document
4524
+ self.groups = item_lists["groups"] # type: ignore
4525
+ self.texts = item_lists["texts"] # type: ignore
4526
+ self.pictures = item_lists["pictures"] # type: ignore
4527
+ self.tables = item_lists["tables"] # type: ignore
4528
+ self.key_value_items = item_lists["key_value_items"] # type: ignore
4529
+ self.form_items = item_lists["form_items"] # type: ignore
4530
+ self.body = new_body
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.37.0
3
+ Version: 2.38.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -20,7 +20,7 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
20
20
  docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
21
21
  docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
22
22
  docling_core/transforms/chunker/hierarchical_chunker.py,sha256=7Fpwwsn2BoiR12KGPrn8fU1uuhqBLp85MRLMF0aIsL8,8281
23
- docling_core/transforms/chunker/hybrid_chunker.py,sha256=i4Yskms48XRUAVhec8pTGDP1dbrTEgc1pNh5fNXqfKQ,12317
23
+ docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
24
24
  docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
25
25
  docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
26
26
  docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
@@ -34,14 +34,14 @@ docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx9
34
34
  docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
35
35
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
36
36
  docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
37
- docling_core/transforms/visualizer/layout_visualizer.py,sha256=hpq7OnyBgGxt3iW3_aNy9KH_0kmKdgoiJIFPcA2SSHU,8040
38
- docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=yBra_W33bb16BxrTqP-ABu5NfRplTEJgu3dKdew3zKA,5601
37
+ docling_core/transforms/visualizer/layout_visualizer.py,sha256=zHzQTWcy-z1J2BcsjvakLkrp8pgStgnxhDl8YqIAotY,8035
38
+ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao39X3Dut0934NAjU3I4v3JN5VzzdjmoGRY,7776
39
39
  docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
40
40
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
41
41
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
42
- docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
42
+ docling_core/types/doc/__init__.py,sha256=pchsIq-9FH_kCTyuyDdB8L4yV77pmnxPwT7399xrqxI,1626
43
43
  docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
44
- docling_core/types/doc/document.py,sha256=JIrCXTeTYSbjTM1wt6kAbXF6QZ1OepC9vG2C3rO0j8I,153808
44
+ docling_core/types/doc/document.py,sha256=JPh-9MqfOxThP5njvXZAY8sxQyhiPJLjDsSJviggItc,156829
45
45
  docling_core/types/doc/labels.py,sha256=JiciRK7_DOkebsrfQ6PVCvS__TsKgWn1ANk84BeB14k,7359
46
46
  docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
47
47
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
@@ -74,9 +74,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
74
74
  docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
75
75
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
76
76
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
77
- docling_core-2.37.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
78
- docling_core-2.37.0.dist-info/METADATA,sha256=B0hyQog06wYqrKsB2jbeiAZ-Rk3Pl_uy2JH7Rws-9EY,6453
79
- docling_core-2.37.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
80
- docling_core-2.37.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
81
- docling_core-2.37.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
82
- docling_core-2.37.0.dist-info/RECORD,,
77
+ docling_core-2.38.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
78
+ docling_core-2.38.0.dist-info/METADATA,sha256=llcycAVzvc09CX0igt4VIGrGWT8UuMjnWN5rrQoEJ6s,6453
79
+ docling_core-2.38.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
80
+ docling_core-2.38.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
81
+ docling_core-2.38.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
82
+ docling_core-2.38.0.dist-info/RECORD,,