docling-core 2.26.0__py3-none-any.whl → 2.26.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of docling-core has been flagged as potentially problematic.
- docling_core/experimental/serializer/base.py +6 -1
- docling_core/experimental/serializer/common.py +105 -64
- docling_core/experimental/serializer/doctags.py +13 -15
- docling_core/experimental/serializer/html.py +28 -18
- docling_core/experimental/serializer/markdown.py +17 -13
- {docling_core-2.26.0.dist-info → docling_core-2.26.1.dist-info}/METADATA +1 -1
- {docling_core-2.26.0.dist-info → docling_core-2.26.1.dist-info}/RECORD +10 -10
- {docling_core-2.26.0.dist-info → docling_core-2.26.1.dist-info}/LICENSE +0 -0
- {docling_core-2.26.0.dist-info → docling_core-2.26.1.dist-info}/WHEEL +0 -0
- {docling_core-2.26.0.dist-info → docling_core-2.26.1.dist-info}/entry_points.txt +0 -0

docling_core/experimental/serializer/base.py

@@ -234,10 +234,15 @@ class BaseDocSerializer(ABC):
         ...
 
     @abstractmethod
-    def get_excluded_refs(self, **kwargs) ->
+    def get_excluded_refs(self, **kwargs) -> set[str]:
         """Get references to excluded items."""
         ...
 
+    @abstractmethod
+    def requires_page_break(self) -> bool:
+        """Whether to add page breaks."""
+        ...
+
 
 class BaseSerializerProvider(ABC):
     """Base class for document serializer providers."""
docling_core/experimental/serializer/common.py

@@ -4,12 +4,12 @@
 #
 
 """Define base classes for serialization."""
+import re
 import sys
 from abc import abstractmethod
-from copy import deepcopy
 from functools import cached_property
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Iterable, Optional, Tuple, Union
 
 from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
 from typing_extensions import Self, override

@@ -50,6 +50,49 @@ _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
 _DEFAULT_LAYERS = {cl for cl in ContentLayer}
 
 
+class _PageBreakNode(NodeItem):
+    """Page break node."""
+
+    prev_page: int
+    next_page: int
+
+
+class _PageBreakSerResult(SerializationResult):
+    """Page break serialization result."""
+
+    node: _PageBreakNode
+
+
+def _iterate_items(
+    doc: DoclingDocument,
+    layers: Optional[set[ContentLayer]],
+    node: Optional[NodeItem] = None,
+    traverse_pictures: bool = False,
+    add_page_breaks: bool = False,
+):
+    prev_page_nr: Optional[int] = None
+    page_break_i = 0
+    for item, _ in doc.iterate_items(
+        root=node,
+        with_groups=True,
+        included_content_layers=layers,
+        traverse_pictures=traverse_pictures,
+    ):
+        if isinstance(item, DocItem):
+            if item.prov:
+                page_no = item.prov[0].page_no
+                if add_page_breaks and (prev_page_nr is None or page_no > prev_page_nr):
+                    if prev_page_nr is not None:  # close previous range
+                        yield _PageBreakNode(
+                            self_ref=f"#/pb/{page_break_i}",
+                            prev_page=prev_page_nr,
+                            next_page=page_no,
+                        )
+                        page_break_i += 1
+                    prev_page_nr = page_no
+        yield item
+
+
 def create_ser_result(
     *,
     text: str = "",
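The new `_iterate_items` helper interleaves synthetic page-break nodes into the normal item stream whenever the page number of the next provenance-bearing item increases. Below is a minimal standalone sketch of that interleaving idea, using plain tuples instead of the docling-core `NodeItem`/`DocItem` types; all names in it are illustrative, not part of the library.

from typing import Iterable, Optional, Tuple


def interleave_page_breaks(
    items: Iterable[Tuple[str, int]],
) -> Iterable[Tuple[str, Optional[int], Optional[int]]]:
    """Yield a synthetic ("page_break", prev, next) entry whenever the page number increases."""
    prev_page: Optional[int] = None
    for text, page in items:
        if prev_page is not None and page > prev_page:
            # emit a boundary marker before the first item of the new page
            yield ("page_break", prev_page, page)
        prev_page = page
        yield (text, page, None)


if __name__ == "__main__":
    stream = [("heading", 1), ("paragraph", 1), ("table", 2), ("paragraph", 3)]
    for entry in interleave_page_breaks(stream):
        print(entry)

Running this prints the two items of page 1, a ("page_break", 1, 2) marker, the page-2 item, a ("page_break", 2, 3) marker, and the page-3 item, which mirrors how the serializer sees its part stream.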
@@ -128,7 +171,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
 
     params: CommonParams = CommonParams()
 
-    _excluded_refs_cache: dict[str,
+    _excluded_refs_cache: dict[str, set[str]] = {}
 
     @computed_field  # type: ignore[misc]
     @cached_property

@@ -146,19 +189,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
         return refs
 
     @override
-    def get_excluded_refs(self, **kwargs) ->
+    def get_excluded_refs(self, **kwargs) -> set[str]:
         """References to excluded items."""
         params = self.params.merge_with_patch(patch=kwargs)
         params_json = params.model_dump_json()
         refs = self._excluded_refs_cache.get(params_json)
         if refs is None:
-            refs =
+            refs = {
                 item.self_ref
-                for ix,
-
-
+                for ix, item in enumerate(
+                    _iterate_items(
+                        doc=self.doc,
                         traverse_pictures=True,
-
+                        layers=params.layers,
                     )
                 )
                 if (

@@ -178,64 +221,21 @@ class DocSerializer(BaseModel, BaseDocSerializer):
                         )
                     )
                 )
-
+            }
             self._excluded_refs_cache[params_json] = refs
         return refs
 
-    @abstractmethod
-    def serialize_page(
-        self, *, parts: list[SerializationResult], **kwargs
-    ) -> SerializationResult:
-        """Serialize a page out of its parts."""
-        ...
-
     @abstractmethod
     def serialize_doc(
-        self, *,
+        self, *, parts: list[SerializationResult], **kwargs
     ) -> SerializationResult:
         """Serialize a document out of its pages."""
         ...
 
     def _serialize_body(self) -> SerializationResult:
         """Serialize the document body."""
-
-
-        prev_page_nr: Optional[int] = None
-        range_by_page_nr: dict[Optional[int], tuple[int, int]] = {}
-
-        for ix, (item, _) in enumerate(
-            self.doc.iterate_items(
-                with_groups=True,
-                traverse_pictures=True,
-                included_content_layers=self.params.layers,
-            )
-        ):
-            if isinstance(item, DocItem):
-                if item.prov:
-                    page_no = item.prov[0].page_no
-                    if prev_page_nr is None or page_no > prev_page_nr:
-                        if prev_page_nr is not None:  # close previous range
-                            range_by_page_nr[prev_page_nr] = (prev_start, ix)
-
-                        prev_start = ix
-                        # could alternatively always start 1st page from 0:
-                        # prev_start = ix if prev_page_nr is not None else 0
-
-                        prev_page_nr = page_no
-
-        # close last (and single if no pages) range
-        range_by_page_nr[prev_page_nr] = (prev_start, sys.maxsize)
-
-        page_results: dict[Optional[int], SerializationResult] = {}
-        for page_nr in range_by_page_nr:
-            page_range = range_by_page_nr[page_nr]
-            params_to_pass = deepcopy(self.params)
-            params_to_pass.start_idx = page_range[0]
-            params_to_pass.stop_idx = page_range[1]
-            subparts = self.get_parts(**params_to_pass.model_dump())
-            page_res = self.serialize_page(parts=subparts)
-            page_results[page_nr] = page_res
-        res = self.serialize_doc(pages=page_results)
+        subparts = self.get_parts()
+        res = self.serialize_doc(parts=subparts)
         return res
 
     @override

@@ -331,6 +331,11 @@ class DocSerializer(BaseModel, BaseDocSerializer):
                 doc=self.doc,
                 **my_kwargs,
             )
+        elif isinstance(item, _PageBreakNode):
+            part = _PageBreakSerResult(
+                text=self._create_page_break(node=item),
+                node=item,
+            )
         else:
             part = self.fallback_serializer.serialize(
                 item=item,

@@ -356,18 +361,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
         parts: list[SerializationResult] = []
         my_visited: set[str] = visited if visited is not None else set()
         params = self.params.merge_with_patch(patch=kwargs)
-
-
-
-
-
+
+        for node in _iterate_items(
+            node=item,
+            doc=self.doc,
+            layers=params.layers,
+            add_page_breaks=self.requires_page_break(),
         ):
-            if
+            if node.self_ref in my_visited:
                 continue
             else:
-                my_visited.add(
+                my_visited.add(node.self_ref)
             part = self.serialize(
-                item=
+                item=node,
                 list_level=list_level,
                 is_inline_scope=is_inline_scope,
                 visited=my_visited,

@@ -450,3 +456,38 @@ class DocSerializer(BaseModel, BaseDocSerializer):
         else:
             text_res = ""
         return create_ser_result(text=text_res, span_source=results)
+
+    def _get_applicable_pages(self) -> Optional[list[int]]:
+        pages = {
+            item.prov[0].page_no: ...
+            for ix, (item, _) in enumerate(
+                self.doc.iterate_items(
+                    with_groups=True,
+                    included_content_layers=self.params.layers,
+                    traverse_pictures=True,
+                )
+            )
+            if (
+                isinstance(item, DocItem)
+                and item.prov
+                and (
+                    self.params.pages is None
+                    or item.prov[0].page_no in self.params.pages
+                )
+                and ix >= self.params.start_idx
+                and ix < self.params.stop_idx
+            )
+        }
+        return [p for p in pages] or None
+
+    def _create_page_break(self, node: _PageBreakNode) -> str:
+        return f"#_#_DOCLING_DOC_PAGE_BREAK_{node.prev_page}_{node.next_page}_#_#"
+
+    def _get_page_breaks(self, text: str) -> Iterable[Tuple[str, int, int]]:
+        pattern = r"#_#_DOCLING_DOC_PAGE_BREAK_(\d+)_(\d+)_#_#"
+        matches = re.finditer(pattern, text)
+        for match in matches:
+            full_match = match.group(0)
+            prev_page_nr = int(match.group(1))
+            next_page_nr = int(match.group(2))
+            yield (full_match, prev_page_nr, next_page_nr)
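The base serializer now marks page boundaries with an internal placeholder string (`_create_page_break`) and later locates them with a regex (`_get_page_breaks`) so that format-specific subclasses can substitute their own separators. Here is a minimal standalone sketch of that round trip; only the marker pattern is taken from the diff above, the helper name and the Markdown-style replacement are illustrative.

import re
from typing import Iterable, Tuple

# marker format and regex as introduced in common.py above
_MARKER = "#_#_DOCLING_DOC_PAGE_BREAK_{prev}_{next}_#_#"
_PATTERN = r"#_#_DOCLING_DOC_PAGE_BREAK_(\d+)_(\d+)_#_#"


def find_page_breaks(text: str) -> Iterable[Tuple[str, int, int]]:
    """Yield (full placeholder, previous page, next page) for each marker in the text."""
    for match in re.finditer(_PATTERN, text):
        yield match.group(0), int(match.group(1)), int(match.group(2))


text = "page one text " + _MARKER.format(prev=1, next=2) + " page two text"
for full, prev_page, next_page in find_page_breaks(text):
    # a format-specific serializer would swap the marker for its own separator
    text = text.replace(full, "<!-- page break -->")
print(text)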
docling_core/experimental/serializer/doctags.py

@@ -476,28 +476,21 @@ class DocTagsDocSerializer(DocSerializer):
     params: DocTagsParams = DocTagsParams()
 
     @override
-    def
+    def serialize_doc(
         self, *, parts: list[SerializationResult], **kwargs
     ) -> SerializationResult:
-        """Serialize a
+        """Serialize a document out of its pages."""
         delim = _get_delim(params=self.params)
         text_res = delim.join([p.text for p in parts if p.text])
-        return create_ser_result(text=text_res, span_source=parts)
 
-    @override
-    def serialize_doc(
-        self, *, pages: dict[Optional[int], SerializationResult], **kwargs
-    ) -> SerializationResult:
-        """Serialize a document out of its pages."""
-        delim = _get_delim(params=self.params)
         if self.params.add_page_break:
-            page_sep = f"
-
-
-
+            page_sep = f"<{DocumentToken.PAGE_BREAK.value}>"
+            for full_match, _, _ in self._get_page_breaks(text=text_res):
+                text_res = text_res.replace(full_match, page_sep)
+
         wrap_tag = DocumentToken.DOCUMENT.value
-        text_res = f"<{wrap_tag}>{
-        return create_ser_result(text=text_res, span_source=
+        text_res = f"<{wrap_tag}>{text_res}{delim}</{wrap_tag}>"
+        return create_ser_result(text=text_res, span_source=parts)
 
     @override
     def serialize_captions(

@@ -526,3 +519,8 @@ class DocTagsDocSerializer(DocSerializer):
         if text_res:
             text_res = _wrap(text=text_res, wrap_tag=DocumentToken.CAPTION.value)
         return create_ser_result(text=text_res, span_source=results)
+
+    @override
+    def requires_page_break(self):
+        """Whether to add page breaks."""
+        return self.params.add_page_break
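With `requires_page_break` wired to `DocTagsParams.add_page_break`, enabling that flag makes the DocTags output carry a `DocumentToken.PAGE_BREAK` tag between pages. A hedged usage sketch follows, assuming the serializer's usual entry point (`serialize()` on a loaded `DoclingDocument`); the JSON path is a placeholder.

from docling_core.experimental.serializer.doctags import (
    DocTagsDocSerializer,
    DocTagsParams,
)
from docling_core.types.doc.document import DoclingDocument

# "my_doc.json" is a placeholder for a previously saved DoclingDocument
doc = DoclingDocument.load_from_json("my_doc.json")

serializer = DocTagsDocSerializer(
    doc=doc,
    params=DocTagsParams(add_page_break=True),  # interleave page-break tokens
)
print(serializer.serialize().text)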
docling_core/experimental/serializer/html.py

@@ -765,37 +765,42 @@ class HTMLDocSerializer(DocSerializer):
         """Apply HTML-specific hyperlink serialization."""
         return f'<a href="{str(hyperlink)}">{text}</a>'
 
-    @override
-    def serialize_page(
-        self, parts: list[SerializationResult], **kwargs
-    ) -> SerializationResult:
-        """Serialize a page out of its parts."""
-        # Join all parts with newlines
-        body_content = "\n".join([p.text for p in parts if p.text])
-        return create_ser_result(
-            text=f"<div class='page'>\n{body_content}\n</div>",
-            span_source=parts,
-        )
-
     @override
     def serialize_doc(
-        self,
+        self, parts: list[SerializationResult], **kwargs
     ) -> SerializationResult:
         """Serialize a document out of its pages."""
         # Create HTML structure
         html_parts = [
             "<!DOCTYPE html>",
+            "<html>",
             self._generate_head(),
             "<body>",
         ]
 
         if self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
+            html_content = "\n".join([p.text for p in parts if p.text])
+            next_page: Optional[int] = None
+            prev_full_match_end = 0
+            pages = {}
+            for full_match, prev_page, next_page in self._get_page_breaks(html_content):
+                this_match_start = html_content.find(full_match)
+                pages[prev_page] = html_content[prev_full_match_end:this_match_start]
+                prev_full_match_end = this_match_start + len(full_match)
+
+            # capture last page
+            if next_page is not None:
+                pages[next_page] = html_content[prev_full_match_end:]
+
             html_parts.append("<table>")
             html_parts.append("<tbody>")
 
+            applicable_pages = self._get_applicable_pages()
             for page_no, page in pages.items():
 
                 if isinstance(page_no, int):
+                    if applicable_pages is not None and page_no not in applicable_pages:
+                        continue
                     page_img = self.doc.pages[page_no].image
 
                     html_parts.append("<tr>")

@@ -831,7 +836,7 @@ class HTMLDocSerializer(DocSerializer):
                     html_parts.append("</td>")
 
                     html_parts.append("<td>")
-                    html_parts.append(page
+                    html_parts.append(f"<div class='page'>\n{page}\n</div>")
                     html_parts.append("</td>")
 
                     html_parts.append("</tr>")

@@ -845,9 +850,9 @@ class HTMLDocSerializer(DocSerializer):
 
         elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
             # Add all pages
-            for
-
-
+            html_content = "\n".join([p.text for p in parts if p.text])
+            html_content = f"<div class='page'>\n{html_content}\n</div>"
+            html_parts.append(html_content)
         else:
             raise ValueError(f"unknown output-style: {self.params.output_style}")
 

@@ -857,7 +862,7 @@ class HTMLDocSerializer(DocSerializer):
         # Join with newlines
         html_content = "\n".join(html_parts)
 
-        return create_ser_result(text=html_content, span_source=
+        return create_ser_result(text=html_content, span_source=parts)
 
     @override
     def serialize_captions(

@@ -929,3 +934,8 @@ class HTMLDocSerializer(DocSerializer):
     def _get_default_css(self) -> str:
         """Return default CSS styles for the HTML document."""
         return "<style></style>"
+
+    @override
+    def requires_page_break(self):
+        """Whether to add page breaks."""
+        return self.params.output_style == HTMLOutputStyle.SPLIT_PAGE
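For HTML, page breaks are only requested when the split-page output style is selected; the serializer then slices the body at the markers and renders each page next to its page image in a two-column table. A hedged usage sketch, assuming `HTMLParams` is the name of the params model and that it and `HTMLOutputStyle` are importable from the same module (neither name is confirmed by this diff):

from docling_core.experimental.serializer.html import (
    HTMLDocSerializer,
    HTMLOutputStyle,
    HTMLParams,  # assumed name of the HTML serializer's params model
)
from docling_core.types.doc.document import DoclingDocument

doc = DoclingDocument.load_from_json("my_doc.json")  # placeholder path

serializer = HTMLDocSerializer(
    doc=doc,
    params=HTMLParams(output_style=HTMLOutputStyle.SPLIT_PAGE),
)
html = serializer.serialize().text  # one <div class='page'> block per detected page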
docling_core/experimental/serializer/markdown.py

@@ -29,6 +29,7 @@ from docling_core.experimental.serializer.base import (
 from docling_core.experimental.serializer.common import (
     CommonParams,
     DocSerializer,
+    _PageBreakSerResult,
     create_ser_result,
 )
 from docling_core.types.doc.base import ImageRefMode

@@ -375,7 +376,11 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
             (
                 c.text
                 if c.text and c.text[0] == " "
-                else
+                else (
+                    f"{indent_str}"
+                    f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}"  # noqa: E501
+                    f"{c.text}"
+                )
             )
             for i, c in enumerate(my_parts)
         ]

@@ -404,6 +409,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
             list_level=list_level,
             is_inline_scope=True,
             visited=my_visited,
+            **kwargs,
         )
         text_res = " ".join([p.text for p in parts if p.text])
         return create_ser_result(text=text_res, span_source=parts)

@@ -516,21 +522,19 @@ class MarkdownDocSerializer(DocSerializer):
         return res
 
     @override
-    def
+    def serialize_doc(
         self, *, parts: list[SerializationResult], **kwargs
     ) -> SerializationResult:
-        """Serialize a
+        """Serialize a document out of its parts."""
         text_res = "\n\n".join([p.text for p in parts if p.text])
+        if self.params.page_break_placeholder:
+            page_sep = self.params.page_break_placeholder or ""
+            for full_match, _, _ in self._get_page_breaks(text=text_res):
+                text_res = text_res.replace(full_match, page_sep)
+
         return create_ser_result(text=text_res, span_source=parts)
 
     @override
-    def
-
-
-        """Serialize a document out of its pages."""
-        if self.params.page_break_placeholder is not None:
-            sep = f"\n\n{self.params.page_break_placeholder}\n\n"
-            text_res = sep.join([text for k in pages if (text := pages[k].text)])
-            return create_ser_result(text=text_res, span_source=list(pages.values()))
-        else:
-            return self.serialize_page(parts=list(pages.values()))
+    def requires_page_break(self):
+        """Whether to add page breaks."""
+        return self.params.page_break_placeholder is not None
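On the Markdown side, setting `page_break_placeholder` both turns on page-break emission (via `requires_page_break`) and defines the text substituted for each internal marker. A hedged usage sketch, assuming the params model is named `MarkdownParams` (not shown in this diff) and the usual `serialize()` entry point:

from docling_core.experimental.serializer.markdown import (
    MarkdownDocSerializer,
    MarkdownParams,  # assumed name of the Markdown serializer's params model
)
from docling_core.types.doc.document import DoclingDocument

doc = DoclingDocument.load_from_json("my_doc.json")  # placeholder path

serializer = MarkdownDocSerializer(
    doc=doc,
    params=MarkdownParams(page_break_placeholder="<!-- page break -->"),
)
md = serializer.serialize().text  # pages separated by the chosen placeholder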
{docling_core-2.26.0.dist-info → docling_core-2.26.1.dist-info}/RECORD

@@ -3,12 +3,12 @@ docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,
 docling_core/cli/view.py,sha256=gwxSBYhGqwznMR8pdXaEuAh2bjFD5X_g11xFYSgFgtM,1764
 docling_core/experimental/__init__.py,sha256=XnAVSUHbA6OFhNSpoYqSD3u83-xVaUaki1DIKFw69Ew,99
 docling_core/experimental/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
-docling_core/experimental/serializer/base.py,sha256=
-docling_core/experimental/serializer/common.py,sha256=
-docling_core/experimental/serializer/doctags.py,sha256=
-docling_core/experimental/serializer/html.py,sha256=
+docling_core/experimental/serializer/base.py,sha256=1sD1v5rWC4MT_Y6BWpMDjUAwuEqC0TR9YjQJZlhPt50,5901
+docling_core/experimental/serializer/common.py,sha256=z80B2BzUdDfp_HgZ1KA64vK-oV07jcgxLs1XyCgp7sI,16152
+docling_core/experimental/serializer/doctags.py,sha256=e97FJHh77x--g2t1O2YprBzF8lkihn_xOr59EjnR7ag,17794
+docling_core/experimental/serializer/html.py,sha256=MZz3WXzWjYD1YUOl6AfkXfukw5N5bQORYU3x0Nl5U_w,31895
 docling_core/experimental/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
-docling_core/experimental/serializer/markdown.py,sha256=
+docling_core/experimental/serializer/markdown.py,sha256=5bvONhaA1EdAD0c3WlWfr2x2KmRaSZd8muG-91XVHgc,17733
 docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
 docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686

@@ -65,8 +65,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
 docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
 docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.26.
-docling_core-2.26.
-docling_core-2.26.
-docling_core-2.26.
-docling_core-2.26.
+docling_core-2.26.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.26.1.dist-info/METADATA,sha256=yWNbBrlNTINX_i36vIS0CZ8dFYVFuHsbzB2vs_d3ADw,5843
+docling_core-2.26.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.26.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
+docling_core-2.26.1.dist-info/RECORD,,
LICENSE, WHEEL and entry_points.txt: files without changes.