docling-core 2.28.0__tar.gz → 2.29.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (79)
  1. {docling_core-2.28.0 → docling_core-2.29.0}/PKG-INFO +1 -1
  2. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/hierarchical_chunker.py +5 -5
  3. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/hybrid_chunker.py +4 -4
  4. {docling_core-2.28.0/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/common.py +1 -1
  5. {docling_core-2.28.0/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/doctags.py +2 -2
  6. {docling_core-2.28.0/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/html.py +3 -3
  7. {docling_core-2.28.0/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/markdown.py +2 -2
  8. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/visualizer/layout_visualizer.py +49 -35
  9. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +52 -50
  10. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/doc/document.py +29 -27
  11. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/doc/page.py +6 -4
  12. {docling_core-2.28.0 → docling_core-2.29.0}/pyproject.toml +1 -1
  13. {docling_core-2.28.0 → docling_core-2.29.0}/LICENSE +0 -0
  14. {docling_core-2.28.0 → docling_core-2.29.0}/README.md +0 -0
  15. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/__init__.py +0 -0
  16. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/cli/__init__.py +0 -0
  17. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/cli/view.py +0 -0
  18. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/experimental/__init__.py +0 -0
  19. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/py.typed +0 -0
  20. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  21. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  22. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  23. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  24. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  25. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  26. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  27. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  28. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/search/__init__.py +0 -0
  29. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  30. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/search/mapping.py +0 -0
  31. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/search/meta.py +0 -0
  32. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/search/package.py +0 -0
  33. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/__init__.py +0 -0
  34. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/__init__.py +0 -0
  35. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/base.py +0 -0
  36. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  37. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  38. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  39. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  40. {docling_core-2.28.0/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/__init__.py +0 -0
  41. {docling_core-2.28.0/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/base.py +0 -0
  42. {docling_core-2.28.0/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/html_styles.py +0 -0
  43. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  44. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/transforms/visualizer/base.py +0 -0
  45. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/__init__.py +0 -0
  46. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/base.py +0 -0
  47. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/doc/__init__.py +0 -0
  48. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/doc/base.py +0 -0
  49. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/doc/labels.py +0 -0
  50. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/doc/tokens.py +0 -0
  51. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/doc/utils.py +0 -0
  52. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/gen/__init__.py +0 -0
  53. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/gen/generic.py +0 -0
  54. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/io/__init__.py +0 -0
  55. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  56. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/legacy_doc/base.py +0 -0
  57. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  58. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  59. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  60. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/legacy_doc/document.py +0 -0
  61. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  62. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/nlp/__init__.py +0 -0
  63. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/nlp/qa.py +0 -0
  64. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/nlp/qa_labels.py +0 -0
  65. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/rec/__init__.py +0 -0
  66. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/rec/attribute.py +0 -0
  67. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/rec/base.py +0 -0
  68. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/rec/predicate.py +0 -0
  69. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/rec/record.py +0 -0
  70. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/rec/statement.py +0 -0
  71. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/types/rec/subject.py +0 -0
  72. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/__init__.py +0 -0
  73. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/alias.py +0 -0
  74. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/file.py +0 -0
  75. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/generate_docs.py +0 -0
  76. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/generate_jsonschema.py +0 -0
  77. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/legacy.py +0 -0
  78. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/validate.py +0 -0
  79. {docling_core-2.28.0 → docling_core-2.29.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.28.0
3
+ Version: 2.29.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -14,19 +14,19 @@ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
14
14
  from pydantic import ConfigDict, Field, StringConstraints, field_validator
15
15
  from typing_extensions import Annotated, override
16
16
 
17
- from docling_core.experimental.serializer.base import (
17
+ from docling_core.search.package import VERSION_PATTERN
18
+ from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
19
+ from docling_core.transforms.serializer.base import (
18
20
  BaseDocSerializer,
19
21
  BaseSerializerProvider,
20
22
  BaseTableSerializer,
21
23
  SerializationResult,
22
24
  )
23
- from docling_core.experimental.serializer.common import create_ser_result
24
- from docling_core.experimental.serializer.markdown import (
25
+ from docling_core.transforms.serializer.common import create_ser_result
26
+ from docling_core.transforms.serializer.markdown import (
25
27
  MarkdownDocSerializer,
26
28
  MarkdownParams,
27
29
  )
28
- from docling_core.search.package import VERSION_PATTERN
29
- from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
30
30
  from docling_core.types import DoclingDocument as DLDocument
31
31
  from docling_core.types.doc.base import ImageRefMode
32
32
  from docling_core.types.doc.document import (
@@ -25,10 +25,6 @@ except ImportError:
25
25
  "`pip install 'docling-core[chunking-openai]'`"
26
26
  )
27
27
 
28
- from docling_core.experimental.serializer.base import (
29
- BaseDocSerializer,
30
- BaseSerializerProvider,
31
- )
32
28
  from docling_core.transforms.chunker import (
33
29
  BaseChunk,
34
30
  BaseChunker,
@@ -36,6 +32,10 @@ from docling_core.transforms.chunker import (
36
32
  DocMeta,
37
33
  HierarchicalChunker,
38
34
  )
35
+ from docling_core.transforms.serializer.base import (
36
+ BaseDocSerializer,
37
+ BaseSerializerProvider,
38
+ )
39
39
  from docling_core.types import DoclingDocument
40
40
 
41
41
 
@@ -14,7 +14,7 @@ from typing import Any, Iterable, Optional, Tuple, Union
14
14
  from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
15
15
  from typing_extensions import Self, override
16
16
 
17
- from docling_core.experimental.serializer.base import (
17
+ from docling_core.transforms.serializer.base import (
18
18
  BaseDocSerializer,
19
19
  BaseFallbackSerializer,
20
20
  BaseFormSerializer,
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Union
6
6
  from pydantic import BaseModel
7
7
  from typing_extensions import override
8
8
 
9
- from docling_core.experimental.serializer.base import (
9
+ from docling_core.transforms.serializer.base import (
10
10
  BaseDocSerializer,
11
11
  BaseFallbackSerializer,
12
12
  BaseFormSerializer,
@@ -18,7 +18,7 @@ from docling_core.experimental.serializer.base import (
18
18
  BaseTextSerializer,
19
19
  SerializationResult,
20
20
  )
21
- from docling_core.experimental.serializer.common import (
21
+ from docling_core.transforms.serializer.common import (
22
22
  CommonParams,
23
23
  DocSerializer,
24
24
  create_ser_result,
@@ -19,7 +19,7 @@ import latex2mathml.converter
19
19
  from pydantic import AnyUrl, BaseModel
20
20
  from typing_extensions import override
21
21
 
22
- from docling_core.experimental.serializer.base import (
22
+ from docling_core.transforms.serializer.base import (
23
23
  BaseDocSerializer,
24
24
  BaseFallbackSerializer,
25
25
  BaseFormSerializer,
@@ -31,12 +31,12 @@ from docling_core.experimental.serializer.base import (
31
31
  BaseTextSerializer,
32
32
  SerializationResult,
33
33
  )
34
- from docling_core.experimental.serializer.common import (
34
+ from docling_core.transforms.serializer.common import (
35
35
  CommonParams,
36
36
  DocSerializer,
37
37
  create_ser_result,
38
38
  )
39
- from docling_core.experimental.serializer.html_styles import (
39
+ from docling_core.transforms.serializer.html_styles import (
40
40
  _get_css_for_single_column,
41
41
  _get_css_for_split_page,
42
42
  )
@@ -14,7 +14,7 @@ from pydantic import AnyUrl, BaseModel, PositiveInt
14
14
  from tabulate import tabulate
15
15
  from typing_extensions import override
16
16
 
17
- from docling_core.experimental.serializer.base import (
17
+ from docling_core.transforms.serializer.base import (
18
18
  BaseDocSerializer,
19
19
  BaseFallbackSerializer,
20
20
  BaseFormSerializer,
@@ -26,7 +26,7 @@ from docling_core.experimental.serializer.base import (
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
28
  )
29
- from docling_core.experimental.serializer.common import (
29
+ from docling_core.transforms.serializer.common import (
30
30
  CommonParams,
31
31
  DocSerializer,
32
32
  _PageBreakSerResult,
@@ -123,7 +123,21 @@ class LayoutVisualizer(BaseVisualizer):
123
123
  ):
124
124
  """Draw the document clusters and optionaly the reading order."""
125
125
  clusters = []
126
- my_images = images or {}
126
+ my_images: dict[Optional[int], Image] = {}
127
+
128
+ if images is not None:
129
+ my_images = images
130
+
131
+ # Initialise `my_images` beforehand: sometimes, you have the
132
+ # page-images but no DocItems!
133
+ for page_nr, page in doc.pages.items():
134
+ page_image = doc.pages[page_nr].image
135
+ if page_image is None or (pil_img := page_image.pil_image) is None:
136
+ raise RuntimeError("Cannot visualize document without images")
137
+ elif page_nr not in my_images:
138
+ image = deepcopy(pil_img)
139
+ my_images[page_nr] = image
140
+
127
141
  prev_image = None
128
142
  prev_page_nr = None
129
143
  for idx, (elem, _) in enumerate(
@@ -135,41 +149,41 @@ class LayoutVisualizer(BaseVisualizer):
135
149
  continue
136
150
  if len(elem.prov) == 0:
137
151
  continue # Skip elements without provenances
138
- prov = elem.prov[0]
139
- page_nr = prov.page_no
140
- image = my_images.get(page_nr)
141
-
142
- if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
143
- # complete previous drawing
144
- if prev_page_nr is not None and prev_image and clusters:
145
- self._draw_clusters(
146
- image=prev_image,
147
- clusters=clusters,
148
- scale_x=prev_image.width / doc.pages[prev_page_nr].size.width,
149
- scale_y=prev_image.height / doc.pages[prev_page_nr].size.height,
150
- )
151
- clusters = []
152
-
153
- if image is None:
154
- page_image = doc.pages[page_nr].image
155
- if page_image is None or (pil_img := page_image.pil_image) is None:
156
- raise RuntimeError("Cannot visualize document without images")
157
- else:
158
- image = deepcopy(pil_img)
159
- my_images[page_nr] = image
160
- tlo_bbox = prov.bbox.to_top_left_origin(
161
- page_height=doc.pages[prov.page_no].size.height
162
- )
163
- cluster = _TLCluster(
164
- id=idx,
165
- label=elem.label,
166
- brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
167
- cells=[],
168
- )
169
- clusters.append(cluster)
170
152
 
171
- prev_page_nr = page_nr
172
- prev_image = image
153
+ for prov in elem.prov:
154
+ page_nr = prov.page_no
155
+
156
+ if page_nr in my_images:
157
+ image = my_images[page_nr]
158
+ else:
159
+ raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
160
+
161
+ if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
162
+ # complete previous drawing
163
+ if prev_page_nr is not None and prev_image and clusters:
164
+ self._draw_clusters(
165
+ image=prev_image,
166
+ clusters=clusters,
167
+ scale_x=prev_image.width
168
+ / doc.pages[prev_page_nr].size.width,
169
+ scale_y=prev_image.height
170
+ / doc.pages[prev_page_nr].size.height,
171
+ )
172
+ clusters = []
173
+
174
+ tlo_bbox = prov.bbox.to_top_left_origin(
175
+ page_height=doc.pages[prov.page_no].size.height
176
+ )
177
+ cluster = _TLCluster(
178
+ id=idx,
179
+ label=elem.label,
180
+ brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
181
+ cells=[],
182
+ )
183
+ clusters.append(cluster)
184
+
185
+ prev_page_nr = page_nr
186
+ prev_image = image
173
187
 
174
188
  # complete last drawing
175
189
  if prev_page_nr is not None and prev_image and clusters:
@@ -77,57 +77,59 @@ class ReadingOrderVisualizer(BaseVisualizer):
77
77
  continue
78
78
  if len(elem.prov) == 0:
79
79
  continue # Skip elements without provenances
80
- prov = elem.prov[0]
81
- page_no = prov.page_no
82
- image = my_images.get(page_no)
83
-
84
- if image is None or prev_page is None or page_no > prev_page:
85
- # new page begins
86
- prev_page = page_no
87
- x0 = y0 = None
88
-
89
- if image is None:
90
- page_image = doc.pages[page_no].image
91
- if page_image is None or (pil_img := page_image.pil_image) is None:
92
- raise RuntimeError("Cannot visualize document without images")
93
- else:
94
- image = deepcopy(pil_img)
95
- my_images[page_no] = image
96
- draw = ImageDraw.Draw(image)
97
-
98
- # if prov.page_no not in true_doc.pages or prov.page_no != 1:
99
- # logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ")
100
- # continue
101
-
102
- tlo_bbox = prov.bbox.to_top_left_origin(
103
- page_height=doc.pages[prov.page_no].size.height
104
- )
105
- ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
106
- ro_bbox.l = round(ro_bbox.l * image.width) # noqa: E741
107
- ro_bbox.r = round(ro_bbox.r * image.width)
108
- ro_bbox.t = round(ro_bbox.t * image.height)
109
- ro_bbox.b = round(ro_bbox.b * image.height)
110
-
111
- if ro_bbox.b > ro_bbox.t:
112
- ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
113
-
114
- if x0 is None and y0 is None:
115
- x0 = (ro_bbox.l + ro_bbox.r) / 2.0
116
- y0 = (ro_bbox.b + ro_bbox.t) / 2.0
117
- else:
118
- assert x0 is not None
119
- assert y0 is not None
120
-
121
- x1 = (ro_bbox.l + ro_bbox.r) / 2.0
122
- y1 = (ro_bbox.b + ro_bbox.t) / 2.0
123
-
124
- draw = self._draw_arrow(
125
- draw=draw,
126
- arrow_coords=(x0, y0, x1, y1),
127
- line_width=2,
128
- color="red",
80
+
81
+ for prov in elem.prov:
82
+ page_no = prov.page_no
83
+ image = my_images.get(page_no)
84
+
85
+ if image is None or prev_page is None or page_no > prev_page:
86
+ # new page begins
87
+ prev_page = page_no
88
+ x0 = y0 = None
89
+
90
+ if image is None:
91
+ page_image = doc.pages[page_no].image
92
+ if (
93
+ page_image is None
94
+ or (pil_img := page_image.pil_image) is None
95
+ ):
96
+ raise RuntimeError(
97
+ "Cannot visualize document without images"
98
+ )
99
+ else:
100
+ image = deepcopy(pil_img)
101
+ my_images[page_no] = image
102
+ draw = ImageDraw.Draw(image)
103
+
104
+ tlo_bbox = prov.bbox.to_top_left_origin(
105
+ page_height=doc.pages[prov.page_no].size.height
129
106
  )
130
- x0, y0 = x1, y1
107
+ ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
108
+ ro_bbox.l = round(ro_bbox.l * image.width) # noqa: E741
109
+ ro_bbox.r = round(ro_bbox.r * image.width)
110
+ ro_bbox.t = round(ro_bbox.t * image.height)
111
+ ro_bbox.b = round(ro_bbox.b * image.height)
112
+
113
+ if ro_bbox.b > ro_bbox.t:
114
+ ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
115
+
116
+ if x0 is None and y0 is None:
117
+ x0 = (ro_bbox.l + ro_bbox.r) / 2.0
118
+ y0 = (ro_bbox.b + ro_bbox.t) / 2.0
119
+ else:
120
+ assert x0 is not None
121
+ assert y0 is not None
122
+
123
+ x1 = (ro_bbox.l + ro_bbox.r) / 2.0
124
+ y1 = (ro_bbox.b + ro_bbox.t) / 2.0
125
+
126
+ draw = self._draw_arrow(
127
+ draw=draw,
128
+ arrow_coords=(x0, y0, x1, y1),
129
+ line_width=2,
130
+ color="red",
131
+ )
132
+ x0, y0 = x1, y1
131
133
  return my_images
132
134
 
133
135
  @override
@@ -872,7 +872,7 @@ class TextItem(DocItem):
872
872
  :param add_content: bool: (Default value = True)
873
873
 
874
874
  """
875
- from docling_core.experimental.serializer.doctags import (
875
+ from docling_core.transforms.serializer.doctags import (
876
876
  DocTagsDocSerializer,
877
877
  DocTagsParams,
878
878
  )
@@ -930,7 +930,7 @@ class SectionHeaderItem(TextItem):
930
930
  :param add_content: bool: (Default value = True)
931
931
 
932
932
  """
933
- from docling_core.experimental.serializer.doctags import (
933
+ from docling_core.transforms.serializer.doctags import (
934
934
  DocTagsDocSerializer,
935
935
  DocTagsParams,
936
936
  )
@@ -1020,7 +1020,7 @@ class CodeItem(FloatingItem, TextItem):
1020
1020
  :param add_content: bool: (Default value = True)
1021
1021
 
1022
1022
  """
1023
- from docling_core.experimental.serializer.doctags import (
1023
+ from docling_core.transforms.serializer.doctags import (
1024
1024
  DocTagsDocSerializer,
1025
1025
  DocTagsParams,
1026
1026
  )
@@ -1091,7 +1091,7 @@ class PictureItem(FloatingItem):
1091
1091
  image_placeholder: str = "<!-- image -->",
1092
1092
  ) -> str:
1093
1093
  """Export picture to Markdown format."""
1094
- from docling_core.experimental.serializer.markdown import (
1094
+ from docling_core.transforms.serializer.markdown import (
1095
1095
  MarkdownDocSerializer,
1096
1096
  MarkdownParams,
1097
1097
  )
@@ -1118,7 +1118,7 @@ class PictureItem(FloatingItem):
1118
1118
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1119
1119
  ) -> str:
1120
1120
  """Export picture to HTML format."""
1121
- from docling_core.experimental.serializer.html import (
1121
+ from docling_core.transforms.serializer.html import (
1122
1122
  HTMLDocSerializer,
1123
1123
  HTMLParams,
1124
1124
  )
@@ -1159,7 +1159,7 @@ class PictureItem(FloatingItem):
1159
1159
  :param # not used at the moment
1160
1160
 
1161
1161
  """
1162
- from docling_core.experimental.serializer.doctags import (
1162
+ from docling_core.transforms.serializer.doctags import (
1163
1163
  DocTagsDocSerializer,
1164
1164
  DocTagsParams,
1165
1165
  )
@@ -1235,7 +1235,7 @@ class TableItem(FloatingItem):
1235
1235
  def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
1236
1236
  """Export the table as markdown."""
1237
1237
  if doc is not None:
1238
- from docling_core.experimental.serializer.markdown import (
1238
+ from docling_core.transforms.serializer.markdown import (
1239
1239
  MarkdownDocSerializer,
1240
1240
  )
1241
1241
 
@@ -1282,7 +1282,7 @@ class TableItem(FloatingItem):
1282
1282
  ) -> str:
1283
1283
  """Export the table as html."""
1284
1284
  if doc is not None:
1285
- from docling_core.experimental.serializer.html import HTMLDocSerializer
1285
+ from docling_core.transforms.serializer.html import HTMLDocSerializer
1286
1286
 
1287
1287
  serializer = HTMLDocSerializer(doc=doc)
1288
1288
  text = serializer.serialize(item=self).text
@@ -1383,7 +1383,7 @@ class TableItem(FloatingItem):
1383
1383
  if add_cross_cell:
1384
1384
  body.append(str(TableToken.OTSL_XCEL.value))
1385
1385
  body.append(str(TableToken.OTSL_NL.value))
1386
- body_str = "".join(body)
1386
+ body_str = "".join(body)
1387
1387
  return body_str
1388
1388
 
1389
1389
  @deprecated("Use export_to_doctags() instead.")
@@ -1414,7 +1414,7 @@ class TableItem(FloatingItem):
1414
1414
  :param add_caption: bool: (Default value = True)
1415
1415
 
1416
1416
  """
1417
- from docling_core.experimental.serializer.doctags import (
1417
+ from docling_core.transforms.serializer.doctags import (
1418
1418
  DocTagsDocSerializer,
1419
1419
  DocTagsParams,
1420
1420
  )
@@ -1512,7 +1512,7 @@ class KeyValueItem(FloatingItem):
1512
1512
  :param add_content: bool: (Default value = True)
1513
1513
 
1514
1514
  """
1515
- from docling_core.experimental.serializer.doctags import (
1515
+ from docling_core.transforms.serializer.doctags import (
1516
1516
  DocTagsDocSerializer,
1517
1517
  DocTagsParams,
1518
1518
  )
@@ -2999,7 +2999,7 @@ class DoclingDocument(BaseModel):
2999
2999
  :returns: The exported Markdown representation.
3000
3000
  :rtype: str
3001
3001
  """
3002
- from docling_core.experimental.serializer.markdown import (
3002
+ from docling_core.transforms.serializer.markdown import (
3003
3003
  MarkdownDocSerializer,
3004
3004
  MarkdownParams,
3005
3005
  )
@@ -3153,7 +3153,7 @@ class DoclingDocument(BaseModel):
3153
3153
  split_page_view: bool = False,
3154
3154
  ) -> str:
3155
3155
  r"""Serialize to HTML."""
3156
- from docling_core.experimental.serializer.html import (
3156
+ from docling_core.transforms.serializer.html import (
3157
3157
  HTMLDocSerializer,
3158
3158
  HTMLOutputStyle,
3159
3159
  HTMLParams,
@@ -3195,9 +3195,9 @@ class DoclingDocument(BaseModel):
3195
3195
 
3196
3196
  return ser_res.text
3197
3197
 
3198
+ @staticmethod
3198
3199
  def load_from_doctags( # noqa: C901
3199
- self,
3200
- doctag_document: DocTagsDocument,
3200
+ doctag_document: DocTagsDocument, document_name: str = "Document"
3201
3201
  ) -> "DoclingDocument":
3202
3202
  r"""Load Docling document from lists of DocTags and Images."""
3203
3203
  # Maps the recognized tag to a Docling label.
@@ -3221,6 +3221,8 @@ class DoclingDocument(BaseModel):
3221
3221
  "key_value_region": DocItemLabel.KEY_VALUE_REGION,
3222
3222
  }
3223
3223
 
3224
+ doc = DoclingDocument(name=document_name)
3225
+
3224
3226
  def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
3225
3227
  """Extract <loc_...> coords from the chunk, normalized by / 500."""
3226
3228
  coords = re.findall(r"<loc_(\d+)>", text_chunk)
@@ -3244,7 +3246,7 @@ class DoclingDocument(BaseModel):
3244
3246
  caption_content = caption.group(1)
3245
3247
  bbox = extract_bounding_box(caption_content)
3246
3248
  caption_text = extract_inner_text(caption_content)
3247
- caption_item = self.add_text(
3249
+ caption_item = doc.add_text(
3248
3250
  label=DocItemLabel.CAPTION,
3249
3251
  text=caption_text,
3250
3252
  parent=None,
@@ -3567,7 +3569,7 @@ class DoclingDocument(BaseModel):
3567
3569
  pg_width = 1
3568
3570
  pg_height = 1
3569
3571
 
3570
- self.add_page(
3572
+ doc.add_page(
3571
3573
  page_no=page_no,
3572
3574
  size=Size(width=pg_width, height=pg_height),
3573
3575
  image=ImageRef.from_pil(image=image, dpi=72) if image else None,
@@ -3624,9 +3626,9 @@ class DoclingDocument(BaseModel):
3624
3626
  charspan=(0, 0),
3625
3627
  page_no=page_no,
3626
3628
  )
3627
- self.add_table(data=table_data, prov=prov, caption=caption)
3629
+ doc.add_table(data=table_data, prov=prov, caption=caption)
3628
3630
  else:
3629
- self.add_table(data=table_data, caption=caption)
3631
+ doc.add_table(data=table_data, caption=caption)
3630
3632
 
3631
3633
  elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
3632
3634
  caption, caption_bbox = extract_caption(full_chunk)
@@ -3646,7 +3648,7 @@ class DoclingDocument(BaseModel):
3646
3648
  int(bbox.b * im_height),
3647
3649
  )
3648
3650
  cropped_image = image.crop(crop_box)
3649
- pic = self.add_picture(
3651
+ pic = doc.add_picture(
3650
3652
  parent=None,
3651
3653
  image=ImageRef.from_pil(image=cropped_image, dpi=72),
3652
3654
  prov=(
@@ -3692,7 +3694,7 @@ class DoclingDocument(BaseModel):
3692
3694
  else:
3693
3695
  if bbox:
3694
3696
  # In case we don't have access to a binary of an image
3695
- pic = self.add_picture(
3697
+ pic = doc.add_picture(
3696
3698
  parent=None,
3697
3699
  prov=ProvenanceItem(
3698
3700
  bbox=bbox, charspan=(0, 0), page_no=page_no
@@ -3733,7 +3735,7 @@ class DoclingDocument(BaseModel):
3733
3735
  key_value_data, kv_item_prov = parse_key_value_item(
3734
3736
  full_chunk, image
3735
3737
  )
3736
- self.add_key_values(graph=key_value_data, prov=kv_item_prov)
3738
+ doc.add_key_values(graph=key_value_data, prov=kv_item_prov)
3737
3739
  elif tag_name in [
3738
3740
  DocumentToken.ORDERED_LIST.value,
3739
3741
  DocumentToken.UNORDERED_LIST.value,
@@ -3749,7 +3751,7 @@ class DoclingDocument(BaseModel):
3749
3751
  )
3750
3752
  li_pattern = re.compile(list_item_pattern, re.DOTALL)
3751
3753
  # Add list group:
3752
- new_list = self.add_group(label=list_label, name="list")
3754
+ new_list = doc.add_group(label=list_label, name="list")
3753
3755
  # Process list items
3754
3756
  for li_match in li_pattern.finditer(full_chunk):
3755
3757
  enum_value += 1
@@ -3760,7 +3762,7 @@ class DoclingDocument(BaseModel):
3760
3762
  li_bbox = extract_bounding_box(li_full_chunk) if image else None
3761
3763
  text_content = extract_inner_text(li_full_chunk)
3762
3764
  # Add list item
3763
- self.add_list_item(
3765
+ doc.add_list_item(
3764
3766
  marker=enum_marker,
3765
3767
  enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
3766
3768
  parent=new_list,
@@ -3792,13 +3794,13 @@ class DoclingDocument(BaseModel):
3792
3794
  if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
3793
3795
  content_layer = ContentLayer.FURNITURE
3794
3796
 
3795
- self.add_text(
3797
+ doc.add_text(
3796
3798
  label=doc_label,
3797
3799
  text=text_content,
3798
3800
  prov=element_prov,
3799
3801
  content_layer=content_layer,
3800
3802
  )
3801
- return self
3803
+ return doc
3802
3804
 
3803
3805
  @deprecated("Use save_as_doctags instead.")
3804
3806
  def save_as_document_tokens(self, *args, **kwargs):
@@ -3885,7 +3887,7 @@ class DoclingDocument(BaseModel):
3885
3887
  :returns: The content of the document formatted as a DocTags string.
3886
3888
  :rtype: str
3887
3889
  """
3888
- from docling_core.experimental.serializer.doctags import (
3890
+ from docling_core.transforms.serializer.doctags import (
3889
3891
  DocTagsDocSerializer,
3890
3892
  DocTagsParams,
3891
3893
  )
@@ -579,13 +579,17 @@ class SegmentedPdfPage(SegmentedPage):
579
579
  with open(filename, "r", encoding="utf-8") as f:
580
580
  return cls.model_validate_json(f.read())
581
581
 
582
- def crop_text(self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0):
582
+ def crop_text(
583
+ self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
584
+ ) -> str:
583
585
  """Extract text from cells within the specified bounding box.
584
586
 
585
587
  Args:
586
588
  cell_unit: Type of text unit to extract
587
589
  bbox: Bounding box to extract from
588
590
  eps: Epsilon value for position comparison
591
+ Returns:
592
+ Extracted text from the cells
589
593
  """
590
594
  selection = []
591
595
  for page_cell in self.iterate_cells(cell_unit):
@@ -605,7 +609,6 @@ class SegmentedPdfPage(SegmentedPage):
605
609
 
606
610
  text = ""
607
611
  for i, cell in enumerate(selection):
608
-
609
612
  if i == 0:
610
613
  text += cell.text
611
614
  else:
@@ -619,6 +622,7 @@ class SegmentedPdfPage(SegmentedPage):
619
622
  else:
620
623
  text += " "
621
624
  text += cell.text
625
+ return text
622
626
 
623
627
  def export_to_textlines(
624
628
  self,
@@ -640,7 +644,6 @@ class SegmentedPdfPage(SegmentedPage):
640
644
  """
641
645
  lines: List[str] = []
642
646
  for cell in self.iterate_cells(cell_unit):
643
-
644
647
  line = ""
645
648
  if add_location:
646
649
  line += f"({cell.rect.r_x0:06.02f}, {cell.rect.r_y0:06.02f}) "
@@ -1104,7 +1107,6 @@ class SegmentedPdfPage(SegmentedPage):
1104
1107
 
1105
1108
  # Draw each rectangle by connecting its four points
1106
1109
  for line in self.lines:
1107
-
1108
1110
  line.to_top_left_origin(page_height=page_height)
1109
1111
  for segment in line.iterate_segments():
1110
1112
  draw.line(
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.28.0"
3
+ version = "2.29.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes