docling-core 2.25.0__tar.gz → 2.26.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.25.0 → docling_core-2.26.1}/PKG-INFO +1 -1
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/base.py +29 -3
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/common.py +157 -71
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/doctags.py +88 -54
- docling_core-2.26.1/docling_core/experimental/serializer/html.py +941 -0
- docling_core-2.26.1/docling_core/experimental/serializer/html_styles.py +212 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/markdown.py +105 -63
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/chunker/base.py +8 -2
- docling_core-2.26.1/docling_core/transforms/chunker/hierarchical_chunker.py +262 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/document.py +702 -482
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/labels.py +2 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/page.py +12 -17
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/pyproject.toml +1 -1
- docling_core-2.25.0/docling_core/transforms/chunker/hierarchical_chunker.py +0 -241
- {docling_core-2.25.0 → docling_core-2.26.1}/LICENSE +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/README.md +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/py.typed +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/package.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/base.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/validators.py +0 -0
|
@@ -11,6 +11,7 @@ from typing import Optional, Union
|
|
|
11
11
|
from pydantic import AnyUrl, BaseModel
|
|
12
12
|
|
|
13
13
|
from docling_core.types.doc.document import (
|
|
14
|
+
DocItem,
|
|
14
15
|
DoclingDocument,
|
|
15
16
|
FloatingItem,
|
|
16
17
|
FormItem,
|
|
@@ -25,10 +26,19 @@ from docling_core.types.doc.document import (
|
|
|
25
26
|
)
|
|
26
27
|
|
|
27
28
|
|
|
29
|
+
class Span(BaseModel):
|
|
30
|
+
"""Class encapsulating fine-granular document span information."""
|
|
31
|
+
|
|
32
|
+
item: DocItem
|
|
33
|
+
# prov_idx: Optional[PositiveInt] = None # None to be interpreted as whole DocItem
|
|
34
|
+
|
|
35
|
+
|
|
28
36
|
class SerializationResult(BaseModel):
|
|
29
37
|
"""SerializationResult."""
|
|
30
38
|
|
|
31
|
-
text: str
|
|
39
|
+
text: str = ""
|
|
40
|
+
spans: list[Span] = []
|
|
41
|
+
# group: Optional[GroupItem] = None # set when result reflects specific group item
|
|
32
42
|
|
|
33
43
|
|
|
34
44
|
class BaseTextSerializer(ABC):
|
|
@@ -163,7 +173,9 @@ class BaseDocSerializer(ABC):
|
|
|
163
173
|
"""Base class for document serializers."""
|
|
164
174
|
|
|
165
175
|
@abstractmethod
|
|
166
|
-
def serialize(
|
|
176
|
+
def serialize(
|
|
177
|
+
self, *, item: Optional[NodeItem] = None, **kwargs
|
|
178
|
+
) -> SerializationResult:
|
|
167
179
|
"""Run the serialization."""
|
|
168
180
|
...
|
|
169
181
|
|
|
@@ -222,6 +234,20 @@ class BaseDocSerializer(ABC):
|
|
|
222
234
|
...
|
|
223
235
|
|
|
224
236
|
@abstractmethod
|
|
225
|
-
def get_excluded_refs(self, **kwargs) ->
|
|
237
|
+
def get_excluded_refs(self, **kwargs) -> set[str]:
|
|
226
238
|
"""Get references to excluded items."""
|
|
227
239
|
...
|
|
240
|
+
|
|
241
|
+
@abstractmethod
|
|
242
|
+
def requires_page_break(self) -> bool:
|
|
243
|
+
"""Whether to add page breaks."""
|
|
244
|
+
...
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class BaseSerializerProvider(ABC):
|
|
248
|
+
"""Base class for document serializer providers."""
|
|
249
|
+
|
|
250
|
+
@abstractmethod
|
|
251
|
+
def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
|
|
252
|
+
"""Get the associated serializer."""
|
|
253
|
+
...
|
|
@@ -4,12 +4,12 @@
|
|
|
4
4
|
#
|
|
5
5
|
|
|
6
6
|
"""Define base classes for serialization."""
|
|
7
|
+
import re
|
|
7
8
|
import sys
|
|
8
9
|
from abc import abstractmethod
|
|
9
|
-
from copy import deepcopy
|
|
10
10
|
from functools import cached_property
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import Any, Optional, Union
|
|
12
|
+
from typing import Any, Iterable, Optional, Tuple, Union
|
|
13
13
|
|
|
14
14
|
from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
|
|
15
15
|
from typing_extensions import Self, override
|
|
@@ -25,6 +25,7 @@ from docling_core.experimental.serializer.base import (
|
|
|
25
25
|
BaseTableSerializer,
|
|
26
26
|
BaseTextSerializer,
|
|
27
27
|
SerializationResult,
|
|
28
|
+
Span,
|
|
28
29
|
)
|
|
29
30
|
from docling_core.types.doc.document import (
|
|
30
31
|
DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
@@ -49,6 +50,81 @@ _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
|
|
|
49
50
|
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
|
|
50
51
|
|
|
51
52
|
|
|
53
|
+
class _PageBreakNode(NodeItem):
|
|
54
|
+
"""Page break node."""
|
|
55
|
+
|
|
56
|
+
prev_page: int
|
|
57
|
+
next_page: int
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class _PageBreakSerResult(SerializationResult):
|
|
61
|
+
"""Page break serialization result."""
|
|
62
|
+
|
|
63
|
+
node: _PageBreakNode
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _iterate_items(
|
|
67
|
+
doc: DoclingDocument,
|
|
68
|
+
layers: Optional[set[ContentLayer]],
|
|
69
|
+
node: Optional[NodeItem] = None,
|
|
70
|
+
traverse_pictures: bool = False,
|
|
71
|
+
add_page_breaks: bool = False,
|
|
72
|
+
):
|
|
73
|
+
prev_page_nr: Optional[int] = None
|
|
74
|
+
page_break_i = 0
|
|
75
|
+
for item, _ in doc.iterate_items(
|
|
76
|
+
root=node,
|
|
77
|
+
with_groups=True,
|
|
78
|
+
included_content_layers=layers,
|
|
79
|
+
traverse_pictures=traverse_pictures,
|
|
80
|
+
):
|
|
81
|
+
if isinstance(item, DocItem):
|
|
82
|
+
if item.prov:
|
|
83
|
+
page_no = item.prov[0].page_no
|
|
84
|
+
if add_page_breaks and (prev_page_nr is None or page_no > prev_page_nr):
|
|
85
|
+
if prev_page_nr is not None: # close previous range
|
|
86
|
+
yield _PageBreakNode(
|
|
87
|
+
self_ref=f"#/pb/{page_break_i}",
|
|
88
|
+
prev_page=prev_page_nr,
|
|
89
|
+
next_page=page_no,
|
|
90
|
+
)
|
|
91
|
+
page_break_i += 1
|
|
92
|
+
prev_page_nr = page_no
|
|
93
|
+
yield item
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def create_ser_result(
|
|
97
|
+
*,
|
|
98
|
+
text: str = "",
|
|
99
|
+
span_source: Union[DocItem, list[SerializationResult]] = [],
|
|
100
|
+
) -> SerializationResult:
|
|
101
|
+
"""Function for creating `SerializationResult` instances.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
text: the text to use. Defaults to "".
|
|
105
|
+
span_source: the item or list of results to use as span source. Defaults to [].
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
The created `SerializationResult`.
|
|
109
|
+
"""
|
|
110
|
+
spans: list[Span]
|
|
111
|
+
if isinstance(span_source, DocItem):
|
|
112
|
+
spans = [Span(item=span_source)]
|
|
113
|
+
else:
|
|
114
|
+
results: list[SerializationResult] = span_source
|
|
115
|
+
spans = []
|
|
116
|
+
span_ids: set[str] = set()
|
|
117
|
+
for ser_res in results:
|
|
118
|
+
for span in ser_res.spans:
|
|
119
|
+
if (span_id := span.item.self_ref) not in span_ids:
|
|
120
|
+
span_ids.add(span_id)
|
|
121
|
+
spans.append(span)
|
|
122
|
+
return SerializationResult(
|
|
123
|
+
text=text,
|
|
124
|
+
spans=spans,
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
|
|
52
128
|
class CommonParams(BaseModel):
|
|
53
129
|
"""Common serialization parameters."""
|
|
54
130
|
|
|
@@ -95,7 +171,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
95
171
|
|
|
96
172
|
params: CommonParams = CommonParams()
|
|
97
173
|
|
|
98
|
-
_excluded_refs_cache: dict[str,
|
|
174
|
+
_excluded_refs_cache: dict[str, set[str]] = {}
|
|
99
175
|
|
|
100
176
|
@computed_field # type: ignore[misc]
|
|
101
177
|
@cached_property
|
|
@@ -113,19 +189,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
113
189
|
return refs
|
|
114
190
|
|
|
115
191
|
@override
|
|
116
|
-
def get_excluded_refs(self, **kwargs) ->
|
|
192
|
+
def get_excluded_refs(self, **kwargs) -> set[str]:
|
|
117
193
|
"""References to excluded items."""
|
|
118
194
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
119
195
|
params_json = params.model_dump_json()
|
|
120
196
|
refs = self._excluded_refs_cache.get(params_json)
|
|
121
197
|
if refs is None:
|
|
122
|
-
refs =
|
|
198
|
+
refs = {
|
|
123
199
|
item.self_ref
|
|
124
|
-
for ix,
|
|
125
|
-
|
|
126
|
-
|
|
200
|
+
for ix, item in enumerate(
|
|
201
|
+
_iterate_items(
|
|
202
|
+
doc=self.doc,
|
|
127
203
|
traverse_pictures=True,
|
|
128
|
-
|
|
204
|
+
layers=params.layers,
|
|
129
205
|
)
|
|
130
206
|
)
|
|
131
207
|
if (
|
|
@@ -145,56 +221,21 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
145
221
|
)
|
|
146
222
|
)
|
|
147
223
|
)
|
|
148
|
-
|
|
224
|
+
}
|
|
149
225
|
self._excluded_refs_cache[params_json] = refs
|
|
150
226
|
return refs
|
|
151
227
|
|
|
152
228
|
@abstractmethod
|
|
153
|
-
def
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
@abstractmethod
|
|
158
|
-
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
|
|
229
|
+
def serialize_doc(
|
|
230
|
+
self, *, parts: list[SerializationResult], **kwargs
|
|
231
|
+
) -> SerializationResult:
|
|
159
232
|
"""Serialize a document out of its parts."""
|
|
160
233
|
...
|
|
161
234
|
|
|
162
235
|
def _serialize_body(self) -> SerializationResult:
|
|
163
236
|
"""Serialize the document body."""
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
starts: list[int] = []
|
|
167
|
-
for ix, (item, _) in enumerate(
|
|
168
|
-
self.doc.iterate_items(
|
|
169
|
-
with_groups=True,
|
|
170
|
-
traverse_pictures=True,
|
|
171
|
-
included_content_layers=self.params.layers,
|
|
172
|
-
)
|
|
173
|
-
):
|
|
174
|
-
if isinstance(item, DocItem):
|
|
175
|
-
if item.prov:
|
|
176
|
-
if last_page is None or item.prov[0].page_no > last_page:
|
|
177
|
-
starts.append(ix)
|
|
178
|
-
last_page = item.prov[0].page_no
|
|
179
|
-
page_ranges = [
|
|
180
|
-
(
|
|
181
|
-
(starts[i] if i > 0 else 0),
|
|
182
|
-
(starts[i + 1] if i < len(starts) - 1 else sys.maxsize),
|
|
183
|
-
)
|
|
184
|
-
for i, _ in enumerate(starts)
|
|
185
|
-
] or [
|
|
186
|
-
(0, sys.maxsize)
|
|
187
|
-
] # use whole range if no pages detected
|
|
188
|
-
|
|
189
|
-
page_results: list[SerializationResult] = []
|
|
190
|
-
for page_range in page_ranges:
|
|
191
|
-
params_to_pass = deepcopy(self.params)
|
|
192
|
-
params_to_pass.start_idx = page_range[0]
|
|
193
|
-
params_to_pass.stop_idx = page_range[1]
|
|
194
|
-
subparts = self.get_parts(**params_to_pass.model_dump())
|
|
195
|
-
page_res = self.serialize_page(subparts)
|
|
196
|
-
page_results.append(page_res)
|
|
197
|
-
res = self.serialize_doc(page_results)
|
|
237
|
+
subparts = self.get_parts()
|
|
238
|
+
res = self.serialize_doc(parts=subparts)
|
|
198
239
|
return res
|
|
199
240
|
|
|
200
241
|
@override
|
|
@@ -209,7 +250,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
209
250
|
) -> SerializationResult:
|
|
210
251
|
"""Serialize a given node."""
|
|
211
252
|
my_visited: set[str] = visited if visited is not None else set()
|
|
212
|
-
|
|
253
|
+
my_kwargs = self.params.merge_with_patch(patch=kwargs).model_dump()
|
|
254
|
+
empty_res = create_ser_result()
|
|
213
255
|
if item is None or item == self.doc.body:
|
|
214
256
|
if self.doc.body.self_ref not in my_visited:
|
|
215
257
|
my_visited.add(self.doc.body.self_ref)
|
|
@@ -217,6 +259,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
217
259
|
else:
|
|
218
260
|
return empty_res
|
|
219
261
|
|
|
262
|
+
my_visited.add(item.self_ref)
|
|
263
|
+
|
|
220
264
|
########
|
|
221
265
|
# groups
|
|
222
266
|
########
|
|
@@ -228,7 +272,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
228
272
|
list_level=list_level,
|
|
229
273
|
is_inline_scope=is_inline_scope,
|
|
230
274
|
visited=my_visited,
|
|
231
|
-
**
|
|
275
|
+
**my_kwargs,
|
|
232
276
|
)
|
|
233
277
|
elif isinstance(item, InlineGroup):
|
|
234
278
|
part = self.inline_serializer.serialize(
|
|
@@ -237,7 +281,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
237
281
|
doc=self.doc,
|
|
238
282
|
list_level=list_level,
|
|
239
283
|
visited=my_visited,
|
|
240
|
-
**
|
|
284
|
+
**my_kwargs,
|
|
241
285
|
)
|
|
242
286
|
###########
|
|
243
287
|
# doc items
|
|
@@ -253,7 +297,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
253
297
|
doc_serializer=self,
|
|
254
298
|
doc=self.doc,
|
|
255
299
|
is_inline_scope=is_inline_scope,
|
|
256
|
-
**
|
|
300
|
+
**my_kwargs,
|
|
257
301
|
)
|
|
258
302
|
if item.self_ref not in self.get_excluded_refs(**kwargs)
|
|
259
303
|
else empty_res
|
|
@@ -263,7 +307,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
263
307
|
item=item,
|
|
264
308
|
doc_serializer=self,
|
|
265
309
|
doc=self.doc,
|
|
266
|
-
**
|
|
310
|
+
**my_kwargs,
|
|
267
311
|
)
|
|
268
312
|
elif isinstance(item, PictureItem):
|
|
269
313
|
part = self.picture_serializer.serialize(
|
|
@@ -271,28 +315,33 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
271
315
|
doc_serializer=self,
|
|
272
316
|
doc=self.doc,
|
|
273
317
|
visited=my_visited,
|
|
274
|
-
**
|
|
318
|
+
**my_kwargs,
|
|
275
319
|
)
|
|
276
320
|
elif isinstance(item, KeyValueItem):
|
|
277
321
|
part = self.key_value_serializer.serialize(
|
|
278
322
|
item=item,
|
|
279
323
|
doc_serializer=self,
|
|
280
324
|
doc=self.doc,
|
|
281
|
-
**
|
|
325
|
+
**my_kwargs,
|
|
282
326
|
)
|
|
283
327
|
elif isinstance(item, FormItem):
|
|
284
328
|
part = self.form_serializer.serialize(
|
|
285
329
|
item=item,
|
|
286
330
|
doc_serializer=self,
|
|
287
331
|
doc=self.doc,
|
|
288
|
-
**
|
|
332
|
+
**my_kwargs,
|
|
333
|
+
)
|
|
334
|
+
elif isinstance(item, _PageBreakNode):
|
|
335
|
+
part = _PageBreakSerResult(
|
|
336
|
+
text=self._create_page_break(node=item),
|
|
337
|
+
node=item,
|
|
289
338
|
)
|
|
290
339
|
else:
|
|
291
340
|
part = self.fallback_serializer.serialize(
|
|
292
341
|
item=item,
|
|
293
342
|
doc_serializer=self,
|
|
294
343
|
doc=self.doc,
|
|
295
|
-
**
|
|
344
|
+
**my_kwargs,
|
|
296
345
|
)
|
|
297
346
|
return part
|
|
298
347
|
|
|
@@ -312,18 +361,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
312
361
|
parts: list[SerializationResult] = []
|
|
313
362
|
my_visited: set[str] = visited if visited is not None else set()
|
|
314
363
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
364
|
+
|
|
365
|
+
for node in _iterate_items(
|
|
366
|
+
node=item,
|
|
367
|
+
doc=self.doc,
|
|
368
|
+
layers=params.layers,
|
|
369
|
+
add_page_breaks=self.requires_page_break(),
|
|
320
370
|
):
|
|
321
|
-
if
|
|
371
|
+
if node.self_ref in my_visited:
|
|
322
372
|
continue
|
|
323
373
|
else:
|
|
324
|
-
my_visited.add(
|
|
374
|
+
my_visited.add(node.self_ref)
|
|
325
375
|
part = self.serialize(
|
|
326
|
-
item=
|
|
376
|
+
item=node,
|
|
327
377
|
list_level=list_level,
|
|
328
378
|
is_inline_scope=is_inline_scope,
|
|
329
379
|
visited=my_visited,
|
|
@@ -393,15 +443,51 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
393
443
|
) -> SerializationResult:
|
|
394
444
|
"""Serialize the item's captions."""
|
|
395
445
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
446
|
+
results: list[SerializationResult] = []
|
|
396
447
|
if DocItemLabel.CAPTION in params.labels:
|
|
397
|
-
|
|
398
|
-
it.text
|
|
448
|
+
results = [
|
|
449
|
+
create_ser_result(text=it.text, span_source=it)
|
|
399
450
|
for cap in item.captions
|
|
400
451
|
if isinstance(it := cap.resolve(self.doc), TextItem)
|
|
401
452
|
and it.self_ref not in self.get_excluded_refs(**kwargs)
|
|
402
453
|
]
|
|
403
|
-
text_res = params.caption_delim.join(
|
|
454
|
+
text_res = params.caption_delim.join([r.text for r in results])
|
|
404
455
|
text_res = self.post_process(text=text_res)
|
|
405
456
|
else:
|
|
406
457
|
text_res = ""
|
|
407
|
-
return
|
|
458
|
+
return create_ser_result(text=text_res, span_source=results)
|
|
459
|
+
|
|
460
|
+
def _get_applicable_pages(self) -> Optional[list[int]]:
|
|
461
|
+
pages = {
|
|
462
|
+
item.prov[0].page_no: ...
|
|
463
|
+
for ix, (item, _) in enumerate(
|
|
464
|
+
self.doc.iterate_items(
|
|
465
|
+
with_groups=True,
|
|
466
|
+
included_content_layers=self.params.layers,
|
|
467
|
+
traverse_pictures=True,
|
|
468
|
+
)
|
|
469
|
+
)
|
|
470
|
+
if (
|
|
471
|
+
isinstance(item, DocItem)
|
|
472
|
+
and item.prov
|
|
473
|
+
and (
|
|
474
|
+
self.params.pages is None
|
|
475
|
+
or item.prov[0].page_no in self.params.pages
|
|
476
|
+
)
|
|
477
|
+
and ix >= self.params.start_idx
|
|
478
|
+
and ix < self.params.stop_idx
|
|
479
|
+
)
|
|
480
|
+
}
|
|
481
|
+
return [p for p in pages] or None
|
|
482
|
+
|
|
483
|
+
def _create_page_break(self, node: _PageBreakNode) -> str:
|
|
484
|
+
return f"#_#_DOCLING_DOC_PAGE_BREAK_{node.prev_page}_{node.next_page}_#_#"
|
|
485
|
+
|
|
486
|
+
def _get_page_breaks(self, text: str) -> Iterable[Tuple[str, int, int]]:
|
|
487
|
+
pattern = r"#_#_DOCLING_DOC_PAGE_BREAK_(\d+)_(\d+)_#_#"
|
|
488
|
+
matches = re.finditer(pattern, text)
|
|
489
|
+
for match in matches:
|
|
490
|
+
full_match = match.group(0)
|
|
491
|
+
prev_page_nr = int(match.group(1))
|
|
492
|
+
next_page_nr = int(match.group(2))
|
|
493
|
+
yield (full_match, prev_page_nr, next_page_nr)
|