docling-core 2.23.3__tar.gz → 2.24.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.23.3 → docling_core-2.24.1}/PKG-INFO +1 -1
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/experimental/serializer/base.py +2 -2
- docling_core-2.24.1/docling_core/experimental/serializer/common.py +407 -0
- docling_core-2.24.1/docling_core/experimental/serializer/doctags.py +492 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/experimental/serializer/markdown.py +70 -41
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/doc/document.py +412 -418
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/doc/page.py +28 -9
- docling_core-2.24.1/docling_core/types/doc/tokens.py +292 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/pyproject.toml +1 -1
- docling_core-2.23.3/docling_core/experimental/serializer/common.py +0 -353
- docling_core-2.23.3/docling_core/types/doc/tokens.py +0 -126
- {docling_core-2.23.3 → docling_core-2.24.1}/LICENSE +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/README.md +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/py.typed +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/search/package.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/base.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.23.3 → docling_core-2.24.1}/docling_core/utils/validators.py +0 -0
|
@@ -197,7 +197,7 @@ class BaseDocSerializer(ABC):
|
|
|
197
197
|
@abstractmethod
|
|
198
198
|
def get_parts(
|
|
199
199
|
self,
|
|
200
|
-
|
|
200
|
+
item: Optional[NodeItem] = None,
|
|
201
201
|
**kwargs,
|
|
202
202
|
) -> list[SerializationResult]:
|
|
203
203
|
"""Get the components to be combined for serializing this node."""
|
|
@@ -222,6 +222,6 @@ class BaseDocSerializer(ABC):
|
|
|
222
222
|
...
|
|
223
223
|
|
|
224
224
|
@abstractmethod
|
|
225
|
-
def get_excluded_refs(self) -> list[str]:
|
|
225
|
+
def get_excluded_refs(self, **kwargs) -> list[str]:
|
|
226
226
|
"""Get references to excluded items."""
|
|
227
227
|
...
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2025
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define base classes for serialization."""
|
|
7
|
+
import sys
|
|
8
|
+
from abc import abstractmethod
|
|
9
|
+
from copy import deepcopy
|
|
10
|
+
from functools import cached_property
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Optional, Union
|
|
13
|
+
|
|
14
|
+
from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
|
|
15
|
+
from typing_extensions import Self, override
|
|
16
|
+
|
|
17
|
+
from docling_core.experimental.serializer.base import (
|
|
18
|
+
BaseDocSerializer,
|
|
19
|
+
BaseFallbackSerializer,
|
|
20
|
+
BaseFormSerializer,
|
|
21
|
+
BaseInlineSerializer,
|
|
22
|
+
BaseKeyValueSerializer,
|
|
23
|
+
BaseListSerializer,
|
|
24
|
+
BasePictureSerializer,
|
|
25
|
+
BaseTableSerializer,
|
|
26
|
+
BaseTextSerializer,
|
|
27
|
+
SerializationResult,
|
|
28
|
+
)
|
|
29
|
+
from docling_core.types.doc.document import (
|
|
30
|
+
DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
31
|
+
ContentLayer,
|
|
32
|
+
DocItem,
|
|
33
|
+
DoclingDocument,
|
|
34
|
+
FloatingItem,
|
|
35
|
+
Formatting,
|
|
36
|
+
FormItem,
|
|
37
|
+
InlineGroup,
|
|
38
|
+
KeyValueItem,
|
|
39
|
+
NodeItem,
|
|
40
|
+
OrderedList,
|
|
41
|
+
PictureItem,
|
|
42
|
+
TableItem,
|
|
43
|
+
TextItem,
|
|
44
|
+
UnorderedList,
|
|
45
|
+
)
|
|
46
|
+
from docling_core.types.doc.labels import DocItemLabel
|
|
47
|
+
|
|
48
|
+
# Labels that are serialized unless the caller narrows them down.
_DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
# By default every content layer is eligible for serialization.
_DEFAULT_LAYERS = set(ContentLayer)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class CommonParams(BaseModel):
    """Serialization settings shared by all serializers."""

    # The allowlists below are evaluated per node, not recursively: when a list
    # group node falls outside the range while some of its children are inside,
    # those children are still serialized.
    labels: set[DocItemLabel] = _DEFAULT_LABELS
    layers: set[ContentLayer] = _DEFAULT_LAYERS
    pages: Optional[set[int]] = None  # None means all pages are allowed

    # Slice-like bounds: start_idx is inclusive, stop_idx is exclusive.
    start_idx: NonNegativeInt = 0
    stop_idx: NonNegativeInt = sys.maxsize

    include_formatting: bool = True
    include_hyperlinks: bool = True
    caption_delim: str = " "

    def merge_with_patch(self, patch: dict[str, Any]) -> Self:
        """Build a new instance from the fields of self overridden by the patch dict."""
        merged: dict[str, Any] = self.model_dump()
        # entries of the patch win over the current field values
        merged.update(patch)
        return self.model_validate(merged)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class DocSerializer(BaseModel, BaseDocSerializer):
    """Class for document serializers.

    Holds the document together with one serializer per node kind and
    dispatches every node to the matching serializer. Subclasses provide
    `serialize_page` and `serialize_doc` and may override the formatting hooks
    (`serialize_bold`, `serialize_italic`, ...), which are no-ops here.
    """

    class Config:
        """Pydantic config."""

        arbitrary_types_allowed = True
        extra = "forbid"

    doc: DoclingDocument

    # serializers for the individual doc item kinds
    text_serializer: BaseTextSerializer
    table_serializer: BaseTableSerializer
    picture_serializer: BasePictureSerializer
    key_value_serializer: BaseKeyValueSerializer
    form_serializer: BaseFormSerializer
    fallback_serializer: BaseFallbackSerializer

    # serializers for the group kinds
    list_serializer: BaseListSerializer
    inline_serializer: BaseInlineSerializer

    params: CommonParams = CommonParams()

    # results of get_excluded_refs(), keyed by the JSON dump of the effective params
    _excluded_refs_cache: dict[str, list[str]] = {}

    @computed_field  # type: ignore[misc]
    @cached_property
    def _captions_of_some_item(self) -> set[str]:
        """Refs of all items used as a caption by some floating item.

        The document is traversed across all content layers; the result is
        computed on first access and cached afterwards.
        """
        layers = set(ContentLayer)  # TODO review
        return {
            cap.cref
            for item, _ in self.doc.iterate_items(
                with_groups=True,
                traverse_pictures=True,
                included_content_layers=layers,
            )
            if isinstance(item, FloatingItem)
            for cap in item.captions
        }

    @override
    def get_excluded_refs(self, **kwargs) -> list[str]:
        """References to excluded items.

        The effective parameters are `self.params` patched with `kwargs`. A node
        is excluded when its traversal index lies outside
        `[start_idx, stop_idx)`, or when it is a `DocItem` and its label or
        content layer is not allowed, or when pages are restricted and the item
        has no provenance or its first provenance is on a page that is not
        allowed.

        Results are cached per effective parameter set; the returned list is the
        cached object itself, so callers should not mutate it.
        """
        params = self.params.merge_with_patch(patch=kwargs)
        params_json = params.model_dump_json()
        refs = self._excluded_refs_cache.get(params_json)
        if refs is None:
            refs = []
            for ix, (item, _) in enumerate(
                self.doc.iterate_items(
                    with_groups=True,
                    traverse_pictures=True,
                    included_content_layers=params.layers,
                )
            ):
                # slice check applies to every node, groups included
                excluded = ix < params.start_idx or ix >= params.stop_idx
                # label / layer / page checks only apply to doc items
                if not excluded and isinstance(item, DocItem):
                    excluded = (
                        item.label not in params.labels
                        or item.content_layer not in params.layers
                        or (
                            params.pages is not None
                            and (
                                (not item.prov)
                                or item.prov[0].page_no not in params.pages
                            )
                        )
                    )
                if excluded:
                    refs.append(item.self_ref)
            self._excluded_refs_cache[params_json] = refs
        return refs

    @abstractmethod
    def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
        """Serialize a page out of its parts."""
        ...

    @abstractmethod
    def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
        """Serialize a document out of its pages."""
        ...

    def _serialize_body(self, **kwargs) -> SerializationResult:
        """Serialize the document body page by page.

        Args:
            **kwargs: per-call overrides; common parameters are merged on top of
                `self.params`, and all entries are forwarded to `get_parts`.

        Returns:
            The result of `serialize_doc` applied to the per-page results.
        """
        params = self.params.merge_with_patch(patch=kwargs)

        # find page ranges if available; otherwise regard whole doc as a single page
        last_page: Optional[int] = None
        starts: list[int] = []
        for ix, (item, _) in enumerate(
            self.doc.iterate_items(
                with_groups=True,
                traverse_pictures=True,
                included_content_layers=params.layers,
            )
        ):
            if not isinstance(item, DocItem) or not item.prov:
                continue
            page_no = item.prov[0].page_no
            # a new page starts at the first item located beyond the last seen page
            if last_page is None or page_no > last_page:
                starts.append(ix)
                last_page = page_no

        if starts:
            # the first page begins at index 0 so that leading nodes are not lost;
            # the last page is open-ended
            lower_bounds = [0, *starts[1:]]
            upper_bounds = [*starts[1:], sys.maxsize]
            page_ranges = list(zip(lower_bounds, upper_bounds))
        else:
            page_ranges = [(0, sys.maxsize)]  # use whole range if no pages detected

        # NOTE(review): each page range replaces any configured start_idx / stop_idx;
        # confirm whether a caller-supplied slice should be intersected here instead.
        page_results: list[SerializationResult] = []
        for start_idx, stop_idx in page_ranges:
            page_kwargs = {
                **kwargs,
                **params.model_dump(),
                "start_idx": start_idx,
                "stop_idx": stop_idx,
            }
            subparts = self.get_parts(**page_kwargs)
            page_results.append(self.serialize_page(subparts))
        return self.serialize_doc(page_results)

    @override
    def serialize(
        self,
        *,
        item: Optional[NodeItem] = None,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize a given node.

        Args:
            item: node to serialize; None or the document body means the whole
                document.
            list_level: list nesting level passed on to the group serializers.
            is_inline_scope: inline flag passed on to the list and text
                serializers.
            visited: refs of already visited items; a new set is used if None.
            **kwargs: additional arguments forwarded to the selected serializer.

        Returns:
            The serialization result; its text is empty for an already visited
            body, for text items acting as captions and for excluded text items.
        """
        my_visited: set[str] = visited if visited is not None else set()
        empty_res = SerializationResult(text="")
        if item is None or item == self.doc.body:
            if self.doc.body.self_ref in my_visited:
                return empty_res
            my_visited.add(self.doc.body.self_ref)
            # forward the overrides so that they also apply to the whole document
            return self._serialize_body(**kwargs)

        ########
        # groups
        ########
        if isinstance(item, (UnorderedList, OrderedList)):
            part = self.list_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                list_level=list_level,
                is_inline_scope=is_inline_scope,
                visited=my_visited,
                **kwargs,
            )
        elif isinstance(item, InlineGroup):
            part = self.inline_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                list_level=list_level,
                visited=my_visited,
                **kwargs,
            )
        ###########
        # doc items
        ###########
        elif isinstance(item, TextItem):
            if item.self_ref in self._captions_of_some_item:
                # those captions will be handled by the floating item holding them
                return empty_res
            if item.self_ref in self.get_excluded_refs(**kwargs):
                part = empty_res
            else:
                part = self.text_serializer.serialize(
                    item=item,
                    doc_serializer=self,
                    doc=self.doc,
                    is_inline_scope=is_inline_scope,
                    **kwargs,
                )
        elif isinstance(item, TableItem):
            part = self.table_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        elif isinstance(item, PictureItem):
            part = self.picture_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                visited=my_visited,
                **kwargs,
            )
        elif isinstance(item, KeyValueItem):
            part = self.key_value_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        elif isinstance(item, FormItem):
            part = self.form_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        else:
            part = self.fallback_serializer.serialize(
                item=item,
                doc_serializer=self,
                doc=self.doc,
                **kwargs,
            )
        return part

    # making some assumptions about the kwargs it can pass
    @override
    def get_parts(
        self,
        item: Optional[NodeItem] = None,
        *,
        traverse_pictures: bool = False,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> list[SerializationResult]:
        """Get the components to be combined for serializing this node.

        Args:
            item: root of the traversal, passed as `root` to the document
                iteration.
            traverse_pictures: whether the traversal descends into pictures.
            list_level: forwarded to `serialize` for every visited node.
            is_inline_scope: forwarded to `serialize` for every visited node.
            visited: refs of already visited items; it is updated in place, a
                new set is used if None.
            **kwargs: parameter overrides, also forwarded to `serialize`.

        Returns:
            The results with non-empty text, in traversal order.
        """
        parts: list[SerializationResult] = []
        my_visited: set[str] = visited if visited is not None else set()
        params = self.params.merge_with_patch(patch=kwargs)
        for node, _ in self.doc.iterate_items(
            root=item,
            with_groups=True,
            traverse_pictures=traverse_pictures,
            included_content_layers=params.layers,
        ):
            if node.self_ref in my_visited:
                continue
            # mark before serializing so that nested calls do not revisit the node
            my_visited.add(node.self_ref)
            part = self.serialize(
                item=node,
                list_level=list_level,
                is_inline_scope=is_inline_scope,
                visited=my_visited,
                **kwargs,
            )
            if part.text:
                parts.append(part)
        return parts

    @override
    def post_process(
        self,
        text: str,
        *,
        formatting: Optional[Formatting] = None,
        hyperlink: Optional[Union[AnyUrl, Path]] = None,
        **kwargs,
    ) -> str:
        """Apply some text post-processing steps.

        Formatting hooks are applied in the order bold, italic, underline,
        strikethrough (only if `include_formatting` is set), followed by the
        hyperlink hook (only if `include_hyperlinks` is set).
        """
        params = self.params.merge_with_patch(patch=kwargs)
        res = text
        if params.include_formatting and formatting:
            if formatting.bold:
                res = self.serialize_bold(text=res)
            if formatting.italic:
                res = self.serialize_italic(text=res)
            if formatting.underline:
                res = self.serialize_underline(text=res)
            if formatting.strikethrough:
                res = self.serialize_strikethrough(text=res)
        if params.include_hyperlinks and hyperlink:
            res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
        return res

    @override
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Hook for bold formatting serialization; returns the text unchanged."""
        return text

    @override
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Hook for italic formatting serialization; returns the text unchanged."""
        return text

    @override
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Hook for underline formatting serialization; returns the text unchanged."""
        return text

    @override
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Hook for strikethrough formatting serialization; returns the text unchanged."""
        return text

    @override
    def serialize_hyperlink(
        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
    ) -> str:
        """Hook for hyperlink serialization; returns the text unchanged."""
        return text

    @override
    def serialize_captions(
        self,
        item: FloatingItem,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the item's captions.

        The texts of the item's non-excluded caption text items are joined with
        `caption_delim` and post-processed. The result is empty if the caption
        label is not among the allowed labels.
        """
        params = self.params.merge_with_patch(patch=kwargs)
        if DocItemLabel.CAPTION not in params.labels:
            return SerializationResult(text="")

        # looked up once, as it does not depend on the individual caption
        excluded_refs = self.get_excluded_refs(**kwargs)
        text_parts: list[str] = [
            it.text
            for cap in item.captions
            if isinstance(it := cap.resolve(self.doc), TextItem)
            and it.self_ref not in excluded_refs
        ]
        text_res = params.caption_delim.join(text_parts)
        text_res = self.post_process(text=text_res)
        return SerializationResult(text=text_res)
|