docling-core 2.24.1__tar.gz → 2.26.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.24.1 → docling_core-2.26.0}/PKG-INFO +1 -1
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/experimental/serializer/base.py +23 -2
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/experimental/serializer/common.py +79 -34
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/experimental/serializer/doctags.py +83 -47
- docling_core-2.26.0/docling_core/experimental/serializer/html.py +931 -0
- docling_core-2.26.0/docling_core/experimental/serializer/html_styles.py +212 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/experimental/serializer/markdown.py +95 -57
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/transforms/chunker/base.py +8 -2
- docling_core-2.26.0/docling_core/transforms/chunker/hierarchical_chunker.py +262 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/doc/base.py +4 -1
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/doc/document.py +738 -490
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/doc/labels.py +2 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/doc/page.py +12 -17
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/pyproject.toml +1 -1
- docling_core-2.24.1/docling_core/transforms/chunker/hierarchical_chunker.py +0 -241
- {docling_core-2.24.1 → docling_core-2.26.0}/LICENSE +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/README.md +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/py.typed +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/search/package.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/base.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.24.1 → docling_core-2.26.0}/docling_core/utils/validators.py +0 -0
|
@@ -11,6 +11,7 @@ from typing import Optional, Union
|
|
|
11
11
|
from pydantic import AnyUrl, BaseModel
|
|
12
12
|
|
|
13
13
|
from docling_core.types.doc.document import (
|
|
14
|
+
DocItem,
|
|
14
15
|
DoclingDocument,
|
|
15
16
|
FloatingItem,
|
|
16
17
|
FormItem,
|
|
@@ -25,10 +26,19 @@ from docling_core.types.doc.document import (
|
|
|
25
26
|
)
|
|
26
27
|
|
|
27
28
|
|
|
29
|
+
class Span(BaseModel):
|
|
30
|
+
"""Class encapsulating fine-granular document span information."""
|
|
31
|
+
|
|
32
|
+
item: DocItem
|
|
33
|
+
# prov_idx: Optional[PositiveInt] = None # None to be interpreted as whole DocItem
|
|
34
|
+
|
|
35
|
+
|
|
28
36
|
class SerializationResult(BaseModel):
|
|
29
37
|
"""SerializationResult."""
|
|
30
38
|
|
|
31
|
-
text: str
|
|
39
|
+
text: str = ""
|
|
40
|
+
spans: list[Span] = []
|
|
41
|
+
# group: Optional[GroupItem] = None # set when result reflects specific group item
|
|
32
42
|
|
|
33
43
|
|
|
34
44
|
class BaseTextSerializer(ABC):
|
|
@@ -163,7 +173,9 @@ class BaseDocSerializer(ABC):
|
|
|
163
173
|
"""Base class for document serializers."""
|
|
164
174
|
|
|
165
175
|
@abstractmethod
|
|
166
|
-
def serialize(
|
|
176
|
+
def serialize(
|
|
177
|
+
self, *, item: Optional[NodeItem] = None, **kwargs
|
|
178
|
+
) -> SerializationResult:
|
|
167
179
|
"""Run the serialization."""
|
|
168
180
|
...
|
|
169
181
|
|
|
@@ -225,3 +237,12 @@ class BaseDocSerializer(ABC):
|
|
|
225
237
|
def get_excluded_refs(self, **kwargs) -> list[str]:
|
|
226
238
|
"""Get references to excluded items."""
|
|
227
239
|
...
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class BaseSerializerProvider(ABC):
|
|
243
|
+
"""Base class for document serializer providers."""
|
|
244
|
+
|
|
245
|
+
@abstractmethod
|
|
246
|
+
def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
|
|
247
|
+
"""Get the associated serializer."""
|
|
248
|
+
...
|
|
@@ -25,6 +25,7 @@ from docling_core.experimental.serializer.base import (
|
|
|
25
25
|
BaseTableSerializer,
|
|
26
26
|
BaseTextSerializer,
|
|
27
27
|
SerializationResult,
|
|
28
|
+
Span,
|
|
28
29
|
)
|
|
29
30
|
from docling_core.types.doc.document import (
|
|
30
31
|
DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
@@ -49,6 +50,38 @@ _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
|
|
|
49
50
|
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
|
|
50
51
|
|
|
51
52
|
|
|
53
|
+
def create_ser_result(
|
|
54
|
+
*,
|
|
55
|
+
text: str = "",
|
|
56
|
+
span_source: Union[DocItem, list[SerializationResult]] = [],
|
|
57
|
+
) -> SerializationResult:
|
|
58
|
+
"""Function for creating `SerializationResult` instances.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
text: the text to use. Defaults to "".
|
|
62
|
+
span_source: the item or list of results to use as span source. Defaults to [].
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
The created `SerializationResult`.
|
|
66
|
+
"""
|
|
67
|
+
spans: list[Span]
|
|
68
|
+
if isinstance(span_source, DocItem):
|
|
69
|
+
spans = [Span(item=span_source)]
|
|
70
|
+
else:
|
|
71
|
+
results: list[SerializationResult] = span_source
|
|
72
|
+
spans = []
|
|
73
|
+
span_ids: set[str] = set()
|
|
74
|
+
for ser_res in results:
|
|
75
|
+
for span in ser_res.spans:
|
|
76
|
+
if (span_id := span.item.self_ref) not in span_ids:
|
|
77
|
+
span_ids.add(span_id)
|
|
78
|
+
spans.append(span)
|
|
79
|
+
return SerializationResult(
|
|
80
|
+
text=text,
|
|
81
|
+
spans=spans,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
52
85
|
class CommonParams(BaseModel):
|
|
53
86
|
"""Common serialization parameters."""
|
|
54
87
|
|
|
@@ -150,20 +183,26 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
150
183
|
return refs
|
|
151
184
|
|
|
152
185
|
@abstractmethod
|
|
153
|
-
def serialize_page(
|
|
186
|
+
def serialize_page(
|
|
187
|
+
self, *, parts: list[SerializationResult], **kwargs
|
|
188
|
+
) -> SerializationResult:
|
|
154
189
|
"""Serialize a page out of its parts."""
|
|
155
190
|
...
|
|
156
191
|
|
|
157
192
|
@abstractmethod
|
|
158
|
-
def serialize_doc(
|
|
193
|
+
def serialize_doc(
|
|
194
|
+
self, *, pages: dict[Optional[int], SerializationResult], **kwargs
|
|
195
|
+
) -> SerializationResult:
|
|
159
196
|
"""Serialize a document out of its pages."""
|
|
160
197
|
...
|
|
161
198
|
|
|
162
199
|
def _serialize_body(self) -> SerializationResult:
|
|
163
200
|
"""Serialize the document body."""
|
|
164
201
|
# find page ranges if available; otherwise regard whole doc as a single page
|
|
165
|
-
|
|
166
|
-
|
|
202
|
+
prev_start: int = 0
|
|
203
|
+
prev_page_nr: Optional[int] = None
|
|
204
|
+
range_by_page_nr: dict[Optional[int], tuple[int, int]] = {}
|
|
205
|
+
|
|
167
206
|
for ix, (item, _) in enumerate(
|
|
168
207
|
self.doc.iterate_items(
|
|
169
208
|
with_groups=True,
|
|
@@ -173,28 +212,30 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
173
212
|
):
|
|
174
213
|
if isinstance(item, DocItem):
|
|
175
214
|
if item.prov:
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
215
|
+
page_no = item.prov[0].page_no
|
|
216
|
+
if prev_page_nr is None or page_no > prev_page_nr:
|
|
217
|
+
if prev_page_nr is not None: # close previous range
|
|
218
|
+
range_by_page_nr[prev_page_nr] = (prev_start, ix)
|
|
219
|
+
|
|
220
|
+
prev_start = ix
|
|
221
|
+
# could alternatively always start 1st page from 0:
|
|
222
|
+
# prev_start = ix if prev_page_nr is not None else 0
|
|
223
|
+
|
|
224
|
+
prev_page_nr = page_no
|
|
225
|
+
|
|
226
|
+
# close last (and single if no pages) range
|
|
227
|
+
range_by_page_nr[prev_page_nr] = (prev_start, sys.maxsize)
|
|
188
228
|
|
|
189
|
-
page_results:
|
|
190
|
-
for
|
|
229
|
+
page_results: dict[Optional[int], SerializationResult] = {}
|
|
230
|
+
for page_nr in range_by_page_nr:
|
|
231
|
+
page_range = range_by_page_nr[page_nr]
|
|
191
232
|
params_to_pass = deepcopy(self.params)
|
|
192
233
|
params_to_pass.start_idx = page_range[0]
|
|
193
234
|
params_to_pass.stop_idx = page_range[1]
|
|
194
235
|
subparts = self.get_parts(**params_to_pass.model_dump())
|
|
195
|
-
page_res = self.serialize_page(subparts)
|
|
196
|
-
page_results
|
|
197
|
-
res = self.serialize_doc(page_results)
|
|
236
|
+
page_res = self.serialize_page(parts=subparts)
|
|
237
|
+
page_results[page_nr] = page_res
|
|
238
|
+
res = self.serialize_doc(pages=page_results)
|
|
198
239
|
return res
|
|
199
240
|
|
|
200
241
|
@override
|
|
@@ -209,7 +250,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
209
250
|
) -> SerializationResult:
|
|
210
251
|
"""Serialize a given node."""
|
|
211
252
|
my_visited: set[str] = visited if visited is not None else set()
|
|
212
|
-
|
|
253
|
+
my_kwargs = self.params.merge_with_patch(patch=kwargs).model_dump()
|
|
254
|
+
empty_res = create_ser_result()
|
|
213
255
|
if item is None or item == self.doc.body:
|
|
214
256
|
if self.doc.body.self_ref not in my_visited:
|
|
215
257
|
my_visited.add(self.doc.body.self_ref)
|
|
@@ -217,6 +259,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
217
259
|
else:
|
|
218
260
|
return empty_res
|
|
219
261
|
|
|
262
|
+
my_visited.add(item.self_ref)
|
|
263
|
+
|
|
220
264
|
########
|
|
221
265
|
# groups
|
|
222
266
|
########
|
|
@@ -228,7 +272,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
228
272
|
list_level=list_level,
|
|
229
273
|
is_inline_scope=is_inline_scope,
|
|
230
274
|
visited=my_visited,
|
|
231
|
-
**
|
|
275
|
+
**my_kwargs,
|
|
232
276
|
)
|
|
233
277
|
elif isinstance(item, InlineGroup):
|
|
234
278
|
part = self.inline_serializer.serialize(
|
|
@@ -237,7 +281,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
237
281
|
doc=self.doc,
|
|
238
282
|
list_level=list_level,
|
|
239
283
|
visited=my_visited,
|
|
240
|
-
**
|
|
284
|
+
**my_kwargs,
|
|
241
285
|
)
|
|
242
286
|
###########
|
|
243
287
|
# doc items
|
|
@@ -253,7 +297,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
253
297
|
doc_serializer=self,
|
|
254
298
|
doc=self.doc,
|
|
255
299
|
is_inline_scope=is_inline_scope,
|
|
256
|
-
**
|
|
300
|
+
**my_kwargs,
|
|
257
301
|
)
|
|
258
302
|
if item.self_ref not in self.get_excluded_refs(**kwargs)
|
|
259
303
|
else empty_res
|
|
@@ -263,7 +307,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
263
307
|
item=item,
|
|
264
308
|
doc_serializer=self,
|
|
265
309
|
doc=self.doc,
|
|
266
|
-
**
|
|
310
|
+
**my_kwargs,
|
|
267
311
|
)
|
|
268
312
|
elif isinstance(item, PictureItem):
|
|
269
313
|
part = self.picture_serializer.serialize(
|
|
@@ -271,28 +315,28 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
271
315
|
doc_serializer=self,
|
|
272
316
|
doc=self.doc,
|
|
273
317
|
visited=my_visited,
|
|
274
|
-
**
|
|
318
|
+
**my_kwargs,
|
|
275
319
|
)
|
|
276
320
|
elif isinstance(item, KeyValueItem):
|
|
277
321
|
part = self.key_value_serializer.serialize(
|
|
278
322
|
item=item,
|
|
279
323
|
doc_serializer=self,
|
|
280
324
|
doc=self.doc,
|
|
281
|
-
**
|
|
325
|
+
**my_kwargs,
|
|
282
326
|
)
|
|
283
327
|
elif isinstance(item, FormItem):
|
|
284
328
|
part = self.form_serializer.serialize(
|
|
285
329
|
item=item,
|
|
286
330
|
doc_serializer=self,
|
|
287
331
|
doc=self.doc,
|
|
288
|
-
**
|
|
332
|
+
**my_kwargs,
|
|
289
333
|
)
|
|
290
334
|
else:
|
|
291
335
|
part = self.fallback_serializer.serialize(
|
|
292
336
|
item=item,
|
|
293
337
|
doc_serializer=self,
|
|
294
338
|
doc=self.doc,
|
|
295
|
-
**
|
|
339
|
+
**my_kwargs,
|
|
296
340
|
)
|
|
297
341
|
return part
|
|
298
342
|
|
|
@@ -393,15 +437,16 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
393
437
|
) -> SerializationResult:
|
|
394
438
|
"""Serialize the item's captions."""
|
|
395
439
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
440
|
+
results: list[SerializationResult] = []
|
|
396
441
|
if DocItemLabel.CAPTION in params.labels:
|
|
397
|
-
|
|
398
|
-
it.text
|
|
442
|
+
results = [
|
|
443
|
+
create_ser_result(text=it.text, span_source=it)
|
|
399
444
|
for cap in item.captions
|
|
400
445
|
if isinstance(it := cap.resolve(self.doc), TextItem)
|
|
401
446
|
and it.self_ref not in self.get_excluded_refs(**kwargs)
|
|
402
447
|
]
|
|
403
|
-
text_res = params.caption_delim.join(
|
|
448
|
+
text_res = params.caption_delim.join([r.text for r in results])
|
|
404
449
|
text_res = self.post_process(text=text_res)
|
|
405
450
|
else:
|
|
406
451
|
text_res = ""
|
|
407
|
-
return
|
|
452
|
+
return create_ser_result(text=text_res, span_source=results)
|
|
@@ -18,7 +18,11 @@ from docling_core.experimental.serializer.base import (
|
|
|
18
18
|
BaseTextSerializer,
|
|
19
19
|
SerializationResult,
|
|
20
20
|
)
|
|
21
|
-
from docling_core.experimental.serializer.common import
|
|
21
|
+
from docling_core.experimental.serializer.common import (
|
|
22
|
+
CommonParams,
|
|
23
|
+
DocSerializer,
|
|
24
|
+
create_ser_result,
|
|
25
|
+
)
|
|
22
26
|
from docling_core.types.doc.document import (
|
|
23
27
|
CodeItem,
|
|
24
28
|
DocItem,
|
|
@@ -33,10 +37,12 @@ from docling_core.types.doc.document import (
|
|
|
33
37
|
PictureClassificationData,
|
|
34
38
|
PictureItem,
|
|
35
39
|
PictureMoleculeData,
|
|
40
|
+
PictureTabularChartData,
|
|
36
41
|
TableItem,
|
|
37
42
|
TextItem,
|
|
38
43
|
UnorderedList,
|
|
39
44
|
)
|
|
45
|
+
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
|
|
40
46
|
from docling_core.types.doc.tokens import DocumentToken
|
|
41
47
|
|
|
42
48
|
|
|
@@ -135,7 +141,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
135
141
|
text_res = "".join(parts)
|
|
136
142
|
if wrap_tag is not None:
|
|
137
143
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
138
|
-
return
|
|
144
|
+
return create_ser_result(text=text_res, span_source=item)
|
|
139
145
|
|
|
140
146
|
|
|
141
147
|
class DocTagsTableSerializer(BaseTableSerializer):
|
|
@@ -153,7 +159,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
153
159
|
"""Serializes the passed item."""
|
|
154
160
|
params = DocTagsParams(**kwargs)
|
|
155
161
|
|
|
156
|
-
|
|
162
|
+
res_parts: list[SerializationResult] = []
|
|
157
163
|
|
|
158
164
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
159
165
|
if params.add_location:
|
|
@@ -162,7 +168,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
162
168
|
xsize=params.xsize,
|
|
163
169
|
ysize=params.ysize,
|
|
164
170
|
)
|
|
165
|
-
|
|
171
|
+
res_parts.append(create_ser_result(text=loc_text, span_source=item))
|
|
166
172
|
|
|
167
173
|
otsl_text = item.export_to_otsl(
|
|
168
174
|
doc=doc,
|
|
@@ -171,18 +177,18 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
171
177
|
xsize=params.xsize,
|
|
172
178
|
ysize=params.ysize,
|
|
173
179
|
)
|
|
174
|
-
|
|
180
|
+
res_parts.append(create_ser_result(text=otsl_text, span_source=item))
|
|
175
181
|
|
|
176
182
|
if params.add_caption:
|
|
177
|
-
|
|
178
|
-
if
|
|
179
|
-
|
|
183
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
184
|
+
if cap_res.text:
|
|
185
|
+
res_parts.append(cap_res)
|
|
180
186
|
|
|
181
|
-
text_res = "".join(
|
|
187
|
+
text_res = "".join([r.text for r in res_parts])
|
|
182
188
|
if text_res:
|
|
183
189
|
text_res = _wrap(text=text_res, wrap_tag=DocumentToken.OTSL.value)
|
|
184
190
|
|
|
185
|
-
return
|
|
191
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
186
192
|
|
|
187
193
|
|
|
188
194
|
class DocTagsPictureSerializer(BasePictureSerializer):
|
|
@@ -199,7 +205,8 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
199
205
|
) -> SerializationResult:
|
|
200
206
|
"""Serializes the passed item."""
|
|
201
207
|
params = DocTagsParams(**kwargs)
|
|
202
|
-
|
|
208
|
+
res_parts: list[SerializationResult] = []
|
|
209
|
+
is_chart = False
|
|
203
210
|
|
|
204
211
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
205
212
|
body = ""
|
|
@@ -217,6 +224,16 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
217
224
|
]
|
|
218
225
|
if len(classifications) > 0:
|
|
219
226
|
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
227
|
+
if predicted_class in [
|
|
228
|
+
PictureClassificationLabel.PIE_CHART,
|
|
229
|
+
PictureClassificationLabel.BAR_CHART,
|
|
230
|
+
PictureClassificationLabel.STACKED_BAR_CHART,
|
|
231
|
+
PictureClassificationLabel.LINE_CHART,
|
|
232
|
+
PictureClassificationLabel.FLOW_CHART,
|
|
233
|
+
PictureClassificationLabel.SCATTER_CHART,
|
|
234
|
+
PictureClassificationLabel.HEATMAP,
|
|
235
|
+
]:
|
|
236
|
+
is_chart = True
|
|
220
237
|
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
221
238
|
|
|
222
239
|
smiles_annotations = [
|
|
@@ -226,20 +243,35 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
226
243
|
body += _wrap(
|
|
227
244
|
text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value
|
|
228
245
|
)
|
|
229
|
-
|
|
246
|
+
|
|
247
|
+
tabular_chart_annotations = [
|
|
248
|
+
ann
|
|
249
|
+
for ann in item.annotations
|
|
250
|
+
if isinstance(ann, PictureTabularChartData)
|
|
251
|
+
]
|
|
252
|
+
if len(tabular_chart_annotations) > 0:
|
|
253
|
+
temp_doc = DoclingDocument(name="temp")
|
|
254
|
+
temp_table = temp_doc.add_table(
|
|
255
|
+
data=tabular_chart_annotations[0].chart_data
|
|
256
|
+
)
|
|
257
|
+
otsl_content = temp_table.export_to_otsl(
|
|
258
|
+
temp_doc, add_cell_location=False
|
|
259
|
+
)
|
|
260
|
+
body += otsl_content
|
|
261
|
+
res_parts.append(create_ser_result(text=body, span_source=item))
|
|
230
262
|
|
|
231
263
|
if params.add_caption:
|
|
232
|
-
|
|
233
|
-
if
|
|
234
|
-
|
|
264
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
265
|
+
if cap_res.text:
|
|
266
|
+
res_parts.append(cap_res)
|
|
235
267
|
|
|
236
|
-
text_res = "".join(
|
|
268
|
+
text_res = "".join([r.text for r in res_parts])
|
|
237
269
|
if text_res:
|
|
238
270
|
token = DocumentToken.create_token_name_from_doc_item_label(
|
|
239
|
-
label=
|
|
271
|
+
label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
|
|
240
272
|
)
|
|
241
273
|
text_res = _wrap(text=text_res, wrap_tag=token)
|
|
242
|
-
return
|
|
274
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
243
275
|
|
|
244
276
|
|
|
245
277
|
class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
@@ -256,8 +288,8 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
256
288
|
) -> SerializationResult:
|
|
257
289
|
"""Serializes the passed item."""
|
|
258
290
|
params = DocTagsParams(**kwargs)
|
|
259
|
-
|
|
260
291
|
body = ""
|
|
292
|
+
results: list[SerializationResult] = []
|
|
261
293
|
|
|
262
294
|
page_no = 1
|
|
263
295
|
if len(item.prov) > 0:
|
|
@@ -302,14 +334,16 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
302
334
|
tok = f"{cell.label.value}_{cell.cell_id}"
|
|
303
335
|
cell_txt = _wrap(text=cell_txt, wrap_tag=tok)
|
|
304
336
|
body += cell_txt
|
|
337
|
+
results.append(create_ser_result(text=body, span_source=item))
|
|
305
338
|
|
|
306
339
|
if params.add_caption:
|
|
307
|
-
|
|
308
|
-
if
|
|
309
|
-
|
|
340
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
341
|
+
if cap_res.text:
|
|
342
|
+
results.append(cap_res)
|
|
310
343
|
|
|
344
|
+
body = "".join([r.text for r in results])
|
|
311
345
|
body = _wrap(body, DocumentToken.KEY_VALUE_REGION.value)
|
|
312
|
-
return
|
|
346
|
+
return create_ser_result(text=body, span_source=results)
|
|
313
347
|
|
|
314
348
|
|
|
315
349
|
class DocTagsFormSerializer(BaseFormSerializer):
|
|
@@ -326,8 +360,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
|
|
|
326
360
|
) -> SerializationResult:
|
|
327
361
|
"""Serializes the passed item."""
|
|
328
362
|
# TODO add actual implementation
|
|
329
|
-
|
|
330
|
-
return SerializationResult(text=text_res)
|
|
363
|
+
return create_ser_result()
|
|
331
364
|
|
|
332
365
|
|
|
333
366
|
class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
@@ -348,7 +381,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
348
381
|
**kwargs,
|
|
349
382
|
) -> SerializationResult:
|
|
350
383
|
"""Serializes the passed item."""
|
|
351
|
-
my_visited = visited
|
|
384
|
+
my_visited = visited if visited is not None else set()
|
|
352
385
|
params = DocTagsParams(**kwargs)
|
|
353
386
|
parts = doc_serializer.get_parts(
|
|
354
387
|
item=item,
|
|
@@ -361,8 +394,9 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
361
394
|
if parts:
|
|
362
395
|
text_res = delim.join(
|
|
363
396
|
[
|
|
364
|
-
|
|
397
|
+
t
|
|
365
398
|
for p in parts
|
|
399
|
+
if (t := _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value))
|
|
366
400
|
]
|
|
367
401
|
)
|
|
368
402
|
text_res = f"{text_res}{delim}"
|
|
@@ -374,7 +408,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
374
408
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
375
409
|
else:
|
|
376
410
|
text_res = ""
|
|
377
|
-
return
|
|
411
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
378
412
|
|
|
379
413
|
|
|
380
414
|
class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
@@ -392,7 +426,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
392
426
|
**kwargs,
|
|
393
427
|
) -> SerializationResult:
|
|
394
428
|
"""Serializes the passed item."""
|
|
395
|
-
my_visited = visited
|
|
429
|
+
my_visited = visited if visited is not None else set()
|
|
396
430
|
params = DocTagsParams(**kwargs)
|
|
397
431
|
parts = doc_serializer.get_parts(
|
|
398
432
|
item=item,
|
|
@@ -407,7 +441,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
407
441
|
if text_res:
|
|
408
442
|
text_res = f"{text_res}{delim}"
|
|
409
443
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
410
|
-
return
|
|
444
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
411
445
|
|
|
412
446
|
|
|
413
447
|
class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
@@ -423,8 +457,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
423
457
|
**kwargs,
|
|
424
458
|
) -> SerializationResult:
|
|
425
459
|
"""Serializes the passed item."""
|
|
426
|
-
|
|
427
|
-
return SerializationResult(text=text_res)
|
|
460
|
+
return create_ser_result()
|
|
428
461
|
|
|
429
462
|
|
|
430
463
|
class DocTagsDocSerializer(DocSerializer):
|
|
@@ -443,24 +476,28 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
443
476
|
params: DocTagsParams = DocTagsParams()
|
|
444
477
|
|
|
445
478
|
@override
|
|
446
|
-
def serialize_page(
|
|
479
|
+
def serialize_page(
|
|
480
|
+
self, *, parts: list[SerializationResult], **kwargs
|
|
481
|
+
) -> SerializationResult:
|
|
447
482
|
"""Serialize a page out of its parts."""
|
|
448
483
|
delim = _get_delim(params=self.params)
|
|
449
|
-
text_res = delim.join([p.text for p in parts])
|
|
450
|
-
return
|
|
484
|
+
text_res = delim.join([p.text for p in parts if p.text])
|
|
485
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
451
486
|
|
|
452
487
|
@override
|
|
453
|
-
def serialize_doc(
|
|
488
|
+
def serialize_doc(
|
|
489
|
+
self, *, pages: dict[Optional[int], SerializationResult], **kwargs
|
|
490
|
+
) -> SerializationResult:
|
|
454
491
|
"""Serialize a document out of its pages."""
|
|
455
492
|
delim = _get_delim(params=self.params)
|
|
456
493
|
if self.params.add_page_break:
|
|
457
494
|
page_sep = f"{delim}<{DocumentToken.PAGE_BREAK.value}>{delim}"
|
|
458
|
-
content = page_sep.join([
|
|
495
|
+
content = page_sep.join([text for k in pages if (text := pages[k].text)])
|
|
459
496
|
else:
|
|
460
|
-
content = self.serialize_page(parts=pages).text
|
|
497
|
+
content = self.serialize_page(parts=list(pages.values())).text
|
|
461
498
|
wrap_tag = DocumentToken.DOCUMENT.value
|
|
462
499
|
text_res = f"<{wrap_tag}>{content}{delim}</{wrap_tag}>"
|
|
463
|
-
return
|
|
500
|
+
return create_ser_result(text=text_res, span_source=list(pages.values()))
|
|
464
501
|
|
|
465
502
|
@override
|
|
466
503
|
def serialize_captions(
|
|
@@ -470,11 +507,10 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
470
507
|
) -> SerializationResult:
|
|
471
508
|
"""Serialize the item's captions."""
|
|
472
509
|
params = DocTagsParams(**kwargs)
|
|
473
|
-
|
|
474
|
-
|
|
510
|
+
results: list[SerializationResult] = []
|
|
475
511
|
if item.captions:
|
|
476
|
-
|
|
477
|
-
if
|
|
512
|
+
cap_res = super().serialize_captions(item, **kwargs)
|
|
513
|
+
if cap_res.text:
|
|
478
514
|
if params.add_location:
|
|
479
515
|
for caption in item.captions:
|
|
480
516
|
if caption.cref not in self.get_excluded_refs(**kwargs):
|
|
@@ -484,9 +520,9 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
484
520
|
xsize=params.xsize,
|
|
485
521
|
ysize=params.ysize,
|
|
486
522
|
)
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
text_res = "".join(
|
|
523
|
+
results.append(create_ser_result(text=loc_txt))
|
|
524
|
+
results.append(cap_res)
|
|
525
|
+
text_res = "".join([r.text for r in results])
|
|
490
526
|
if text_res:
|
|
491
527
|
text_res = _wrap(text=text_res, wrap_tag=DocumentToken.CAPTION.value)
|
|
492
|
-
return
|
|
528
|
+
return create_ser_result(text=text_res, span_source=results)
|