docling-core 2.28.1__tar.gz → 2.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (79):
  1. {docling_core-2.28.1 → docling_core-2.30.0}/PKG-INFO +1 -1
  2. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hierarchical_chunker.py +5 -5
  3. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hybrid_chunker.py +4 -4
  4. {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/common.py +1 -1
  5. {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/doctags.py +2 -2
  6. {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html.py +29 -3
  7. {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/markdown.py +2 -2
  8. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/layout_visualizer.py +33 -30
  9. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +52 -50
  10. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/document.py +59 -38
  11. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/labels.py +1 -0
  12. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/page.py +25 -4
  13. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/legacy.py +1 -1
  14. {docling_core-2.28.1 → docling_core-2.30.0}/pyproject.toml +1 -1
  15. {docling_core-2.28.1 → docling_core-2.30.0}/LICENSE +0 -0
  16. {docling_core-2.28.1 → docling_core-2.30.0}/README.md +0 -0
  17. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/__init__.py +0 -0
  18. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/cli/__init__.py +0 -0
  19. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/cli/view.py +0 -0
  20. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/experimental/__init__.py +0 -0
  21. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/py.typed +0 -0
  22. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  23. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  24. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  25. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  26. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  27. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  28. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  29. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  30. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/__init__.py +0 -0
  31. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  32. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/mapping.py +0 -0
  33. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/meta.py +0 -0
  34. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/package.py +0 -0
  35. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/__init__.py +0 -0
  36. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/__init__.py +0 -0
  37. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/base.py +0 -0
  38. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  39. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  40. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  41. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  42. {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/__init__.py +0 -0
  43. {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/base.py +0 -0
  44. {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html_styles.py +0 -0
  45. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  46. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/base.py +0 -0
  47. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/__init__.py +0 -0
  48. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/base.py +0 -0
  49. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/__init__.py +0 -0
  50. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/base.py +0 -0
  51. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/tokens.py +0 -0
  52. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/utils.py +0 -0
  53. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/gen/__init__.py +0 -0
  54. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/gen/generic.py +0 -0
  55. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/io/__init__.py +0 -0
  56. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  57. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/base.py +0 -0
  58. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  59. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  60. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  61. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/document.py +0 -0
  62. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  63. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/__init__.py +0 -0
  64. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/qa.py +0 -0
  65. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/qa_labels.py +0 -0
  66. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/__init__.py +0 -0
  67. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/attribute.py +0 -0
  68. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/base.py +0 -0
  69. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/predicate.py +0 -0
  70. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/record.py +0 -0
  71. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/statement.py +0 -0
  72. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/subject.py +0 -0
  73. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/__init__.py +0 -0
  74. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/alias.py +0 -0
  75. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/file.py +0 -0
  76. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/generate_docs.py +0 -0
  77. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/generate_jsonschema.py +0 -0
  78. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/validate.py +0 -0
  79. {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.28.1
3
+ Version: 2.30.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -14,19 +14,19 @@ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
14
14
  from pydantic import ConfigDict, Field, StringConstraints, field_validator
15
15
  from typing_extensions import Annotated, override
16
16
 
17
- from docling_core.experimental.serializer.base import (
17
+ from docling_core.search.package import VERSION_PATTERN
18
+ from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
19
+ from docling_core.transforms.serializer.base import (
18
20
  BaseDocSerializer,
19
21
  BaseSerializerProvider,
20
22
  BaseTableSerializer,
21
23
  SerializationResult,
22
24
  )
23
- from docling_core.experimental.serializer.common import create_ser_result
24
- from docling_core.experimental.serializer.markdown import (
25
+ from docling_core.transforms.serializer.common import create_ser_result
26
+ from docling_core.transforms.serializer.markdown import (
25
27
  MarkdownDocSerializer,
26
28
  MarkdownParams,
27
29
  )
28
- from docling_core.search.package import VERSION_PATTERN
29
- from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
30
30
  from docling_core.types import DoclingDocument as DLDocument
31
31
  from docling_core.types.doc.base import ImageRefMode
32
32
  from docling_core.types.doc.document import (
@@ -25,10 +25,6 @@ except ImportError:
25
25
  "`pip install 'docling-core[chunking-openai]'`"
26
26
  )
27
27
 
28
- from docling_core.experimental.serializer.base import (
29
- BaseDocSerializer,
30
- BaseSerializerProvider,
31
- )
32
28
  from docling_core.transforms.chunker import (
33
29
  BaseChunk,
34
30
  BaseChunker,
@@ -36,6 +32,10 @@ from docling_core.transforms.chunker import (
36
32
  DocMeta,
37
33
  HierarchicalChunker,
38
34
  )
35
+ from docling_core.transforms.serializer.base import (
36
+ BaseDocSerializer,
37
+ BaseSerializerProvider,
38
+ )
39
39
  from docling_core.types import DoclingDocument
40
40
 
41
41
 
@@ -14,7 +14,7 @@ from typing import Any, Iterable, Optional, Tuple, Union
14
14
  from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
15
15
  from typing_extensions import Self, override
16
16
 
17
- from docling_core.experimental.serializer.base import (
17
+ from docling_core.transforms.serializer.base import (
18
18
  BaseDocSerializer,
19
19
  BaseFallbackSerializer,
20
20
  BaseFormSerializer,
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Union
6
6
  from pydantic import BaseModel
7
7
  from typing_extensions import override
8
8
 
9
- from docling_core.experimental.serializer.base import (
9
+ from docling_core.transforms.serializer.base import (
10
10
  BaseDocSerializer,
11
11
  BaseFallbackSerializer,
12
12
  BaseFormSerializer,
@@ -18,7 +18,7 @@ from docling_core.experimental.serializer.base import (
18
18
  BaseTextSerializer,
19
19
  SerializationResult,
20
20
  )
21
- from docling_core.experimental.serializer.common import (
21
+ from docling_core.transforms.serializer.common import (
22
22
  CommonParams,
23
23
  DocSerializer,
24
24
  create_ser_result,
@@ -19,7 +19,7 @@ import latex2mathml.converter
19
19
  from pydantic import AnyUrl, BaseModel
20
20
  from typing_extensions import override
21
21
 
22
- from docling_core.experimental.serializer.base import (
22
+ from docling_core.transforms.serializer.base import (
23
23
  BaseDocSerializer,
24
24
  BaseFallbackSerializer,
25
25
  BaseFormSerializer,
@@ -31,12 +31,12 @@ from docling_core.experimental.serializer.base import (
31
31
  BaseTextSerializer,
32
32
  SerializationResult,
33
33
  )
34
- from docling_core.experimental.serializer.common import (
34
+ from docling_core.transforms.serializer.common import (
35
35
  CommonParams,
36
36
  DocSerializer,
37
37
  create_ser_result,
38
38
  )
39
- from docling_core.experimental.serializer.html_styles import (
39
+ from docling_core.transforms.serializer.html_styles import (
40
40
  _get_css_for_single_column,
41
41
  _get_css_for_split_page,
42
42
  )
@@ -370,6 +370,13 @@ class HTMLPictureSerializer(BasePictureSerializer):
370
370
  **kwargs: Any,
371
371
  ) -> SerializationResult:
372
372
  """Export picture to HTML format."""
373
+
374
+ def get_img_row(imgb64: str, ind: int) -> str:
375
+ row = '<tr><td style="border: 2px solid black; padding: 8px;">'
376
+ row += f'<img src="data:image/png;base64,{imgb64}" alt="image {ind}">'
377
+ row += "</td></tr>\n"
378
+ return row
379
+
373
380
  params = HTMLParams(**kwargs)
374
381
 
375
382
  res_parts: list[SerializationResult] = []
@@ -393,6 +400,22 @@ class HTMLPictureSerializer(BasePictureSerializer):
393
400
  and item.image.uri.scheme == "data"
394
401
  ):
395
402
  img_text = f'<img src="{item.image.uri}">'
403
+ elif len(item.prov) > 1: # more than 1 provenance
404
+
405
+ img_text = (
406
+ '<table style="border-collapse: collapse; width: 100%;">\n'
407
+ )
408
+ for ind, prov in enumerate(item.prov):
409
+ img = item.get_image(doc, prov_index=ind)
410
+
411
+ if img is not None:
412
+ imgb64 = item._image_to_base64(img)
413
+ img_text += get_img_row(imgb64=imgb64, ind=ind)
414
+ else:
415
+ _logger.warning("Could not get image")
416
+
417
+ img_text += "</table>\n"
418
+
396
419
  else:
397
420
  # get the item.image._pil or crop it out of the page-image
398
421
  img = item.get_image(doc)
@@ -400,6 +423,9 @@ class HTMLPictureSerializer(BasePictureSerializer):
400
423
  if img is not None:
401
424
  imgb64 = item._image_to_base64(img)
402
425
  img_text = f'<img src="data:image/png;base64,{imgb64}">'
426
+ else:
427
+ _logger.warning("Could not get image")
428
+
403
429
  elif params.image_mode == ImageRefMode.REFERENCED:
404
430
  if isinstance(item.image, ImageRef) and not (
405
431
  isinstance(item.image.uri, AnyUrl)
@@ -14,7 +14,7 @@ from pydantic import AnyUrl, BaseModel, PositiveInt
14
14
  from tabulate import tabulate
15
15
  from typing_extensions import override
16
16
 
17
- from docling_core.experimental.serializer.base import (
17
+ from docling_core.transforms.serializer.base import (
18
18
  BaseDocSerializer,
19
19
  BaseFallbackSerializer,
20
20
  BaseFormSerializer,
@@ -26,7 +26,7 @@ from docling_core.experimental.serializer.base import (
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
28
  )
29
- from docling_core.experimental.serializer.common import (
29
+ from docling_core.transforms.serializer.common import (
30
30
  CommonParams,
31
31
  DocSerializer,
32
32
  _PageBreakSerResult,
@@ -149,38 +149,41 @@ class LayoutVisualizer(BaseVisualizer):
149
149
  continue
150
150
  if len(elem.prov) == 0:
151
151
  continue # Skip elements without provenances
152
- prov = elem.prov[0]
153
- page_nr = prov.page_no
154
-
155
- if page_nr in my_images:
156
- image = my_images[page_nr]
157
- else:
158
- raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
159
-
160
- if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
161
- # complete previous drawing
162
- if prev_page_nr is not None and prev_image and clusters:
163
- self._draw_clusters(
164
- image=prev_image,
165
- clusters=clusters,
166
- scale_x=prev_image.width / doc.pages[prev_page_nr].size.width,
167
- scale_y=prev_image.height / doc.pages[prev_page_nr].size.height,
168
- )
169
- clusters = []
170
152
 
171
- tlo_bbox = prov.bbox.to_top_left_origin(
172
- page_height=doc.pages[prov.page_no].size.height
173
- )
174
- cluster = _TLCluster(
175
- id=idx,
176
- label=elem.label,
177
- brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
178
- cells=[],
179
- )
180
- clusters.append(cluster)
153
+ for prov in elem.prov:
154
+ page_nr = prov.page_no
155
+
156
+ if page_nr in my_images:
157
+ image = my_images[page_nr]
158
+ else:
159
+ raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
160
+
161
+ if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
162
+ # complete previous drawing
163
+ if prev_page_nr is not None and prev_image and clusters:
164
+ self._draw_clusters(
165
+ image=prev_image,
166
+ clusters=clusters,
167
+ scale_x=prev_image.width
168
+ / doc.pages[prev_page_nr].size.width,
169
+ scale_y=prev_image.height
170
+ / doc.pages[prev_page_nr].size.height,
171
+ )
172
+ clusters = []
173
+
174
+ tlo_bbox = prov.bbox.to_top_left_origin(
175
+ page_height=doc.pages[prov.page_no].size.height
176
+ )
177
+ cluster = _TLCluster(
178
+ id=idx,
179
+ label=elem.label,
180
+ brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
181
+ cells=[],
182
+ )
183
+ clusters.append(cluster)
181
184
 
182
- prev_page_nr = page_nr
183
- prev_image = image
185
+ prev_page_nr = page_nr
186
+ prev_image = image
184
187
 
185
188
  # complete last drawing
186
189
  if prev_page_nr is not None and prev_image and clusters:
@@ -77,57 +77,59 @@ class ReadingOrderVisualizer(BaseVisualizer):
77
77
  continue
78
78
  if len(elem.prov) == 0:
79
79
  continue # Skip elements without provenances
80
- prov = elem.prov[0]
81
- page_no = prov.page_no
82
- image = my_images.get(page_no)
83
-
84
- if image is None or prev_page is None or page_no > prev_page:
85
- # new page begins
86
- prev_page = page_no
87
- x0 = y0 = None
88
-
89
- if image is None:
90
- page_image = doc.pages[page_no].image
91
- if page_image is None or (pil_img := page_image.pil_image) is None:
92
- raise RuntimeError("Cannot visualize document without images")
93
- else:
94
- image = deepcopy(pil_img)
95
- my_images[page_no] = image
96
- draw = ImageDraw.Draw(image)
97
-
98
- # if prov.page_no not in true_doc.pages or prov.page_no != 1:
99
- # logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ")
100
- # continue
101
-
102
- tlo_bbox = prov.bbox.to_top_left_origin(
103
- page_height=doc.pages[prov.page_no].size.height
104
- )
105
- ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
106
- ro_bbox.l = round(ro_bbox.l * image.width) # noqa: E741
107
- ro_bbox.r = round(ro_bbox.r * image.width)
108
- ro_bbox.t = round(ro_bbox.t * image.height)
109
- ro_bbox.b = round(ro_bbox.b * image.height)
110
-
111
- if ro_bbox.b > ro_bbox.t:
112
- ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
113
-
114
- if x0 is None and y0 is None:
115
- x0 = (ro_bbox.l + ro_bbox.r) / 2.0
116
- y0 = (ro_bbox.b + ro_bbox.t) / 2.0
117
- else:
118
- assert x0 is not None
119
- assert y0 is not None
120
-
121
- x1 = (ro_bbox.l + ro_bbox.r) / 2.0
122
- y1 = (ro_bbox.b + ro_bbox.t) / 2.0
123
-
124
- draw = self._draw_arrow(
125
- draw=draw,
126
- arrow_coords=(x0, y0, x1, y1),
127
- line_width=2,
128
- color="red",
80
+
81
+ for prov in elem.prov:
82
+ page_no = prov.page_no
83
+ image = my_images.get(page_no)
84
+
85
+ if image is None or prev_page is None or page_no > prev_page:
86
+ # new page begins
87
+ prev_page = page_no
88
+ x0 = y0 = None
89
+
90
+ if image is None:
91
+ page_image = doc.pages[page_no].image
92
+ if (
93
+ page_image is None
94
+ or (pil_img := page_image.pil_image) is None
95
+ ):
96
+ raise RuntimeError(
97
+ "Cannot visualize document without images"
98
+ )
99
+ else:
100
+ image = deepcopy(pil_img)
101
+ my_images[page_no] = image
102
+ draw = ImageDraw.Draw(image)
103
+
104
+ tlo_bbox = prov.bbox.to_top_left_origin(
105
+ page_height=doc.pages[prov.page_no].size.height
129
106
  )
130
- x0, y0 = x1, y1
107
+ ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
108
+ ro_bbox.l = round(ro_bbox.l * image.width) # noqa: E741
109
+ ro_bbox.r = round(ro_bbox.r * image.width)
110
+ ro_bbox.t = round(ro_bbox.t * image.height)
111
+ ro_bbox.b = round(ro_bbox.b * image.height)
112
+
113
+ if ro_bbox.b > ro_bbox.t:
114
+ ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
115
+
116
+ if x0 is None and y0 is None:
117
+ x0 = (ro_bbox.l + ro_bbox.r) / 2.0
118
+ y0 = (ro_bbox.b + ro_bbox.t) / 2.0
119
+ else:
120
+ assert x0 is not None
121
+ assert y0 is not None
122
+
123
+ x1 = (ro_bbox.l + ro_bbox.r) / 2.0
124
+ y1 = (ro_bbox.b + ro_bbox.t) / 2.0
125
+
126
+ draw = self._draw_arrow(
127
+ draw=draw,
128
+ arrow_coords=(x0, y0, x1, y1),
129
+ line_width=2,
130
+ color="red",
131
+ )
132
+ x0, y0 = x1, y1
131
133
  return my_images
132
134
 
133
135
  @override
@@ -790,7 +790,9 @@ class DocItem(
790
790
 
791
791
  return location
792
792
 
793
- def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
793
+ def get_image(
794
+ self, doc: "DoclingDocument", prov_index: int = 0
795
+ ) -> Optional[PILImage.Image]:
794
796
  """Returns the image of this DocItem.
795
797
 
796
798
  The function returns None if this DocItem has no valid provenance or
@@ -800,7 +802,7 @@ class DocItem(
800
802
  if not len(self.prov):
801
803
  return None
802
804
 
803
- page = doc.pages.get(self.prov[0].page_no)
805
+ page = doc.pages.get(self.prov[prov_index].page_no)
804
806
  if page is None or page.size is None or page.image is None:
805
807
  return None
806
808
 
@@ -808,7 +810,7 @@ class DocItem(
808
810
  if not page_image:
809
811
  return None
810
812
  crop_bbox = (
811
- self.prov[0]
813
+ self.prov[prov_index]
812
814
  .bbox.to_top_left_origin(page_height=page.size.height)
813
815
  .scale_to_size(old_size=page.size, new_size=page.image.size)
814
816
  # .scaled(scale=page_image.height / page.size.height)
@@ -872,7 +874,7 @@ class TextItem(DocItem):
872
874
  :param add_content: bool: (Default value = True)
873
875
 
874
876
  """
875
- from docling_core.experimental.serializer.doctags import (
877
+ from docling_core.transforms.serializer.doctags import (
876
878
  DocTagsDocSerializer,
877
879
  DocTagsParams,
878
880
  )
@@ -930,7 +932,7 @@ class SectionHeaderItem(TextItem):
930
932
  :param add_content: bool: (Default value = True)
931
933
 
932
934
  """
933
- from docling_core.experimental.serializer.doctags import (
935
+ from docling_core.transforms.serializer.doctags import (
934
936
  DocTagsDocSerializer,
935
937
  DocTagsParams,
936
938
  )
@@ -973,7 +975,9 @@ class FloatingItem(DocItem):
973
975
  text += cap.resolve(doc).text
974
976
  return text
975
977
 
976
- def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
978
+ def get_image(
979
+ self, doc: "DoclingDocument", prov_index: int = 0
980
+ ) -> Optional[PILImage.Image]:
977
981
  """Returns the image corresponding to this FloatingItem.
978
982
 
979
983
  This function returns the PIL image from self.image if one is available.
@@ -985,7 +989,7 @@ class FloatingItem(DocItem):
985
989
  """
986
990
  if self.image is not None:
987
991
  return self.image.pil_image
988
- return super().get_image(doc=doc)
992
+ return super().get_image(doc=doc, prov_index=prov_index)
989
993
 
990
994
 
991
995
  class CodeItem(FloatingItem, TextItem):
@@ -1020,7 +1024,7 @@ class CodeItem(FloatingItem, TextItem):
1020
1024
  :param add_content: bool: (Default value = True)
1021
1025
 
1022
1026
  """
1023
- from docling_core.experimental.serializer.doctags import (
1027
+ from docling_core.transforms.serializer.doctags import (
1024
1028
  DocTagsDocSerializer,
1025
1029
  DocTagsParams,
1026
1030
  )
@@ -1073,7 +1077,7 @@ class PictureItem(FloatingItem):
1073
1077
  image_bytes = self.image._pil.tobytes()
1074
1078
 
1075
1079
  # Create a hash object (e.g., SHA-256)
1076
- hasher = hashlib.sha256()
1080
+ hasher = hashlib.sha256(usedforsecurity=False)
1077
1081
 
1078
1082
  # Feed the image bytes into the hash object
1079
1083
  hasher.update(image_bytes)
@@ -1091,7 +1095,7 @@ class PictureItem(FloatingItem):
1091
1095
  image_placeholder: str = "<!-- image -->",
1092
1096
  ) -> str:
1093
1097
  """Export picture to Markdown format."""
1094
- from docling_core.experimental.serializer.markdown import (
1098
+ from docling_core.transforms.serializer.markdown import (
1095
1099
  MarkdownDocSerializer,
1096
1100
  MarkdownParams,
1097
1101
  )
@@ -1118,7 +1122,7 @@ class PictureItem(FloatingItem):
1118
1122
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1119
1123
  ) -> str:
1120
1124
  """Export picture to HTML format."""
1121
- from docling_core.experimental.serializer.html import (
1125
+ from docling_core.transforms.serializer.html import (
1122
1126
  HTMLDocSerializer,
1123
1127
  HTMLParams,
1124
1128
  )
@@ -1159,7 +1163,7 @@ class PictureItem(FloatingItem):
1159
1163
  :param # not used at the moment
1160
1164
 
1161
1165
  """
1162
- from docling_core.experimental.serializer.doctags import (
1166
+ from docling_core.transforms.serializer.doctags import (
1163
1167
  DocTagsDocSerializer,
1164
1168
  DocTagsParams,
1165
1169
  )
@@ -1235,7 +1239,7 @@ class TableItem(FloatingItem):
1235
1239
  def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
1236
1240
  """Export the table as markdown."""
1237
1241
  if doc is not None:
1238
- from docling_core.experimental.serializer.markdown import (
1242
+ from docling_core.transforms.serializer.markdown import (
1239
1243
  MarkdownDocSerializer,
1240
1244
  )
1241
1245
 
@@ -1282,7 +1286,7 @@ class TableItem(FloatingItem):
1282
1286
  ) -> str:
1283
1287
  """Export the table as html."""
1284
1288
  if doc is not None:
1285
- from docling_core.experimental.serializer.html import HTMLDocSerializer
1289
+ from docling_core.transforms.serializer.html import HTMLDocSerializer
1286
1290
 
1287
1291
  serializer = HTMLDocSerializer(doc=doc)
1288
1292
  text = serializer.serialize(item=self).text
@@ -1414,7 +1418,7 @@ class TableItem(FloatingItem):
1414
1418
  :param add_caption: bool: (Default value = True)
1415
1419
 
1416
1420
  """
1417
- from docling_core.experimental.serializer.doctags import (
1421
+ from docling_core.transforms.serializer.doctags import (
1418
1422
  DocTagsDocSerializer,
1419
1423
  DocTagsParams,
1420
1424
  )
@@ -1512,7 +1516,7 @@ class KeyValueItem(FloatingItem):
1512
1516
  :param add_content: bool: (Default value = True)
1513
1517
 
1514
1518
  """
1515
- from docling_core.experimental.serializer.doctags import (
1519
+ from docling_core.transforms.serializer.doctags import (
1516
1520
  DocTagsDocSerializer,
1517
1521
  DocTagsParams,
1518
1522
  )
@@ -2657,16 +2661,25 @@ class DoclingDocument(BaseModel):
2657
2661
  if should_yield:
2658
2662
  yield root, my_stack
2659
2663
 
2660
- # Handle picture traversal - only traverse children if requested
2661
- if isinstance(root, PictureItem) and not traverse_pictures:
2662
- return
2663
-
2664
2664
  my_stack.append(-1)
2665
2665
 
2666
+ allowed_pic_refs: set[str] = (
2667
+ {r.cref for r in root.captions}
2668
+ if (root_is_picture := isinstance(root, PictureItem))
2669
+ else set()
2670
+ )
2671
+
2666
2672
  # Traverse children
2667
2673
  for child_ind, child_ref in enumerate(root.children):
2668
- my_stack[-1] = child_ind
2669
2674
  child = child_ref.resolve(self)
2675
+ if (
2676
+ root_is_picture
2677
+ and not traverse_pictures
2678
+ and isinstance(child, DocItem)
2679
+ and child.self_ref not in allowed_pic_refs
2680
+ ):
2681
+ continue
2682
+ my_stack[-1] = child_ind
2670
2683
 
2671
2684
  if isinstance(child, NodeItem):
2672
2685
  yield from self._iterate_items_with_stack(
@@ -2999,7 +3012,7 @@ class DoclingDocument(BaseModel):
2999
3012
  :returns: The exported Markdown representation.
3000
3013
  :rtype: str
3001
3014
  """
3002
- from docling_core.experimental.serializer.markdown import (
3015
+ from docling_core.transforms.serializer.markdown import (
3003
3016
  MarkdownDocSerializer,
3004
3017
  MarkdownParams,
3005
3018
  )
@@ -3153,7 +3166,7 @@ class DoclingDocument(BaseModel):
3153
3166
  split_page_view: bool = False,
3154
3167
  ) -> str:
3155
3168
  r"""Serialize to HTML."""
3156
- from docling_core.experimental.serializer.html import (
3169
+ from docling_core.transforms.serializer.html import (
3157
3170
  HTMLDocSerializer,
3158
3171
  HTMLOutputStyle,
3159
3172
  HTMLParams,
@@ -3195,9 +3208,9 @@ class DoclingDocument(BaseModel):
3195
3208
 
3196
3209
  return ser_res.text
3197
3210
 
3211
+ @staticmethod
3198
3212
  def load_from_doctags( # noqa: C901
3199
- self,
3200
- doctag_document: DocTagsDocument,
3213
+ doctag_document: DocTagsDocument, document_name: str = "Document"
3201
3214
  ) -> "DoclingDocument":
3202
3215
  r"""Load Docling document from lists of DocTags and Images."""
3203
3216
  # Maps the recognized tag to a Docling label.
@@ -3221,6 +3234,8 @@ class DoclingDocument(BaseModel):
3221
3234
  "key_value_region": DocItemLabel.KEY_VALUE_REGION,
3222
3235
  }
3223
3236
 
3237
+ doc = DoclingDocument(name=document_name)
3238
+
3224
3239
  def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
3225
3240
  """Extract <loc_...> coords from the chunk, normalized by / 500."""
3226
3241
  coords = re.findall(r"<loc_(\d+)>", text_chunk)
@@ -3244,7 +3259,7 @@ class DoclingDocument(BaseModel):
3244
3259
  caption_content = caption.group(1)
3245
3260
  bbox = extract_bounding_box(caption_content)
3246
3261
  caption_text = extract_inner_text(caption_content)
3247
- caption_item = self.add_text(
3262
+ caption_item = doc.add_text(
3248
3263
  label=DocItemLabel.CAPTION,
3249
3264
  text=caption_text,
3250
3265
  parent=None,
@@ -3567,7 +3582,7 @@ class DoclingDocument(BaseModel):
3567
3582
  pg_width = 1
3568
3583
  pg_height = 1
3569
3584
 
3570
- self.add_page(
3585
+ doc.add_page(
3571
3586
  page_no=page_no,
3572
3587
  size=Size(width=pg_width, height=pg_height),
3573
3588
  image=ImageRef.from_pil(image=image, dpi=72) if image else None,
@@ -3595,7 +3610,9 @@ class DoclingDocument(BaseModel):
3595
3610
  rf"{DocumentToken.UNORDERED_LIST.value}|"
3596
3611
  rf"{DocItemLabel.KEY_VALUE_REGION}|"
3597
3612
  rf"{DocumentToken.CHART.value}|"
3598
- rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
3613
+ rf"{DocumentToken.OTSL.value})>"
3614
+ rf"(?P<content>.*?)"
3615
+ rf"(?:(?P<closed></(?P=tag)>)|(?P<eof>$))"
3599
3616
  )
3600
3617
  pattern = re.compile(tag_pattern, re.DOTALL)
3601
3618
 
@@ -3605,6 +3622,10 @@ class DoclingDocument(BaseModel):
3605
3622
  tag_name = match.group("tag")
3606
3623
 
3607
3624
  bbox = extract_bounding_box(full_chunk) # Extracts first bbox
3625
+ if not match.group("closed"):
3626
+ # no closing tag; only the existence of the item is recovered
3627
+ full_chunk = f"<{tag_name}></{tag_name}>"
3628
+
3608
3629
  doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
3609
3630
 
3610
3631
  if tag_name == DocumentToken.OTSL.value:
@@ -3624,9 +3645,9 @@ class DoclingDocument(BaseModel):
3624
3645
  charspan=(0, 0),
3625
3646
  page_no=page_no,
3626
3647
  )
3627
- self.add_table(data=table_data, prov=prov, caption=caption)
3648
+ doc.add_table(data=table_data, prov=prov, caption=caption)
3628
3649
  else:
3629
- self.add_table(data=table_data, caption=caption)
3650
+ doc.add_table(data=table_data, caption=caption)
3630
3651
 
3631
3652
  elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
3632
3653
  caption, caption_bbox = extract_caption(full_chunk)
@@ -3646,7 +3667,7 @@ class DoclingDocument(BaseModel):
3646
3667
  int(bbox.b * im_height),
3647
3668
  )
3648
3669
  cropped_image = image.crop(crop_box)
3649
- pic = self.add_picture(
3670
+ pic = doc.add_picture(
3650
3671
  parent=None,
3651
3672
  image=ImageRef.from_pil(image=cropped_image, dpi=72),
3652
3673
  prov=(
@@ -3692,7 +3713,7 @@ class DoclingDocument(BaseModel):
3692
3713
  else:
3693
3714
  if bbox:
3694
3715
  # In case we don't have access to an binary of an image
3695
- pic = self.add_picture(
3716
+ pic = doc.add_picture(
3696
3717
  parent=None,
3697
3718
  prov=ProvenanceItem(
3698
3719
  bbox=bbox, charspan=(0, 0), page_no=page_no
@@ -3733,7 +3754,7 @@ class DoclingDocument(BaseModel):
3733
3754
  key_value_data, kv_item_prov = parse_key_value_item(
3734
3755
  full_chunk, image
3735
3756
  )
3736
- self.add_key_values(graph=key_value_data, prov=kv_item_prov)
3757
+ doc.add_key_values(graph=key_value_data, prov=kv_item_prov)
3737
3758
  elif tag_name in [
3738
3759
  DocumentToken.ORDERED_LIST.value,
3739
3760
  DocumentToken.UNORDERED_LIST.value,
@@ -3749,7 +3770,7 @@ class DoclingDocument(BaseModel):
3749
3770
  )
3750
3771
  li_pattern = re.compile(list_item_pattern, re.DOTALL)
3751
3772
  # Add list group:
3752
- new_list = self.add_group(label=list_label, name="list")
3773
+ new_list = doc.add_group(label=list_label, name="list")
3753
3774
  # Process list items
3754
3775
  for li_match in li_pattern.finditer(full_chunk):
3755
3776
  enum_value += 1
@@ -3760,7 +3781,7 @@ class DoclingDocument(BaseModel):
3760
3781
  li_bbox = extract_bounding_box(li_full_chunk) if image else None
3761
3782
  text_content = extract_inner_text(li_full_chunk)
3762
3783
  # Add list item
3763
- self.add_list_item(
3784
+ doc.add_list_item(
3764
3785
  marker=enum_marker,
3765
3786
  enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
3766
3787
  parent=new_list,
@@ -3792,13 +3813,13 @@ class DoclingDocument(BaseModel):
3792
3813
  if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
3793
3814
  content_layer = ContentLayer.FURNITURE
3794
3815
 
3795
- self.add_text(
3816
+ doc.add_text(
3796
3817
  label=doc_label,
3797
3818
  text=text_content,
3798
3819
  prov=element_prov,
3799
3820
  content_layer=content_layer,
3800
3821
  )
3801
- return self
3822
+ return doc
3802
3823
 
3803
3824
  @deprecated("Use save_as_doctags instead.")
3804
3825
  def save_as_document_tokens(self, *args, **kwargs):
@@ -3885,7 +3906,7 @@ class DoclingDocument(BaseModel):
3885
3906
  :returns: The content of the document formatted as a DocTags string.
3886
3907
  :rtype: str
3887
3908
  """
3888
- from docling_core.experimental.serializer.doctags import (
3909
+ from docling_core.transforms.serializer.doctags import (
3889
3910
  DocTagsDocSerializer,
3890
3911
  DocTagsParams,
3891
3912
  )
@@ -78,6 +78,7 @@ class GroupLabel(str, Enum):
78
78
  KEY_VALUE_AREA = "key_value_area"
79
79
  COMMENT_SECTION = "comment_section"
80
80
  INLINE = "inline"
81
+ PICTURE_AREA = "picture_area"
81
82
 
82
83
  def __str__(self):
83
84
  """Get string value."""
@@ -472,8 +472,27 @@ class SegmentedPage(BaseModel):
472
472
  word_cells: List[TextCell] = []
473
473
  textline_cells: List[TextCell] = []
474
474
 
475
+ # These flags are set to differentiate if above lists of this SegmentedPage
476
+ # are empty (page had no content) or if they have not been computed (e.g. textline_cells may be present
477
+ # but word_cells are not)
478
+ has_chars: bool = False
479
+ has_words: bool = False
480
+ has_lines: bool = False
481
+
475
482
  image: Optional[ImageRef] = None
476
483
 
484
+ @model_validator(mode="after")
485
+ def validate_page(self) -> "SegmentedPage":
486
+ """Validate page."""
487
+ if len(self.textline_cells) > 0:
488
+ self.has_lines = True
489
+ if len(self.word_cells) > 0:
490
+ self.has_words = True
491
+ if len(self.char_cells) > 0:
492
+ self.has_chars = True
493
+
494
+ return self
495
+
477
496
  def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
478
497
  """Iterate through text cells of the specified unit type.
479
498
 
@@ -579,13 +598,17 @@ class SegmentedPdfPage(SegmentedPage):
579
598
  with open(filename, "r", encoding="utf-8") as f:
580
599
  return cls.model_validate_json(f.read())
581
600
 
582
- def crop_text(self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0):
601
+ def crop_text(
602
+ self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
603
+ ) -> str:
583
604
  """Extract text from cells within the specified bounding box.
584
605
 
585
606
  Args:
586
607
  cell_unit: Type of text unit to extract
587
608
  bbox: Bounding box to extract from
588
609
  eps: Epsilon value for position comparison
610
+ Returns:
611
+ Extracted text from the cells
589
612
  """
590
613
  selection = []
591
614
  for page_cell in self.iterate_cells(cell_unit):
@@ -605,7 +628,6 @@ class SegmentedPdfPage(SegmentedPage):
605
628
 
606
629
  text = ""
607
630
  for i, cell in enumerate(selection):
608
-
609
631
  if i == 0:
610
632
  text += cell.text
611
633
  else:
@@ -619,6 +641,7 @@ class SegmentedPdfPage(SegmentedPage):
619
641
  else:
620
642
  text += " "
621
643
  text += cell.text
644
+ return text
622
645
 
623
646
  def export_to_textlines(
624
647
  self,
@@ -640,7 +663,6 @@ class SegmentedPdfPage(SegmentedPage):
640
663
  """
641
664
  lines: List[str] = []
642
665
  for cell in self.iterate_cells(cell_unit):
643
-
644
666
  line = ""
645
667
  if add_location:
646
668
  line += f"({cell.rect.r_x0:06.02f}, {cell.rect.r_y0:06.02f}) "
@@ -1104,7 +1126,6 @@ class SegmentedPdfPage(SegmentedPage):
1104
1126
 
1105
1127
  # Draw each rectangle by connecting its four points
1106
1128
  for line in self.lines:
1107
-
1108
1129
  line.to_top_left_origin(page_height=page_height)
1109
1130
  for segment in line.iterate_segments():
1110
1131
  draw.line(
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
47
47
 
48
48
 
49
49
  def _create_hash(string: str):
50
- hasher = hashlib.sha256()
50
+ hasher = hashlib.sha256(usedforsecurity=False)
51
51
  hasher.update(string.encode("utf-8"))
52
52
 
53
53
  return hasher.hexdigest()
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.28.1"
3
+ version = "2.30.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes