docling-core 2.25.0__py3-none-any.whl → 2.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +23 -2
- docling_core/experimental/serializer/common.py +79 -34
- docling_core/experimental/serializer/doctags.py +83 -47
- docling_core/experimental/serializer/html.py +931 -0
- docling_core/experimental/serializer/html_styles.py +212 -0
- docling_core/experimental/serializer/markdown.py +95 -57
- docling_core/transforms/chunker/base.py +8 -2
- docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
- docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- docling_core/types/doc/document.py +702 -482
- docling_core/types/doc/labels.py +2 -0
- docling_core/types/doc/page.py +12 -17
- docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/METADATA +1 -1
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/RECORD +18 -16
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/LICENSE +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/WHEEL +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""HTML styles for different export modes."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _get_css_with_no_styling() -> str:
|
|
5
|
+
"""Return default CSS styles for the HTML document."""
|
|
6
|
+
return "<style></style>"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_css_for_split_page() -> str:
|
|
10
|
+
"""Return default CSS styles for the HTML document."""
|
|
11
|
+
return """<style>
|
|
12
|
+
html {
|
|
13
|
+
background-color: #e1e1e1;
|
|
14
|
+
font-family: Arial, sans-serif;
|
|
15
|
+
line-height: 1.6;
|
|
16
|
+
}
|
|
17
|
+
img {
|
|
18
|
+
min-width: 500px;
|
|
19
|
+
max-width: 100%;
|
|
20
|
+
}
|
|
21
|
+
table {
|
|
22
|
+
border-collapse: collapse;
|
|
23
|
+
border: 0px solid #fff;
|
|
24
|
+
width: 100%;
|
|
25
|
+
}
|
|
26
|
+
td {
|
|
27
|
+
vertical-align: top;
|
|
28
|
+
}
|
|
29
|
+
.page {
|
|
30
|
+
background-color: white;
|
|
31
|
+
margin-top:15px;
|
|
32
|
+
padding: 30px;
|
|
33
|
+
border: 1px solid black;
|
|
34
|
+
width:100%;
|
|
35
|
+
max-width:1000px;
|
|
36
|
+
box-shadow: 0 0 10px rgba(0,0,0,0.5);
|
|
37
|
+
}
|
|
38
|
+
.page figure {
|
|
39
|
+
text-align: center;
|
|
40
|
+
}
|
|
41
|
+
.page img {
|
|
42
|
+
max-width: 900px;
|
|
43
|
+
min-width: auto;
|
|
44
|
+
}
|
|
45
|
+
.page table {
|
|
46
|
+
border-collapse: collapse;
|
|
47
|
+
margin: 1em 0;
|
|
48
|
+
width: 100%;
|
|
49
|
+
}
|
|
50
|
+
.page table td {
|
|
51
|
+
border: 1px solid #ddd;
|
|
52
|
+
padding: 8px;
|
|
53
|
+
text-align: left;
|
|
54
|
+
}
|
|
55
|
+
.page table th {
|
|
56
|
+
border: 1px solid #ddd;
|
|
57
|
+
padding: 8px;
|
|
58
|
+
text-align: left;
|
|
59
|
+
background-color: #f2f2f2;
|
|
60
|
+
font-weight: bold;
|
|
61
|
+
}
|
|
62
|
+
.page table caption {
|
|
63
|
+
color: #666;
|
|
64
|
+
font-style: italic;
|
|
65
|
+
margin-top: 0.5em;
|
|
66
|
+
padding: 8px;
|
|
67
|
+
margin-top: 5px;
|
|
68
|
+
margin-bottom: 5px;
|
|
69
|
+
}
|
|
70
|
+
.page figcaption {
|
|
71
|
+
color: #666;
|
|
72
|
+
font-style: italic;
|
|
73
|
+
margin-top: 0.5em;
|
|
74
|
+
padding: 8px;
|
|
75
|
+
margin-top: 5px;
|
|
76
|
+
margin-bottom: 5px;
|
|
77
|
+
}
|
|
78
|
+
code {
|
|
79
|
+
background-color: rgb(228, 228, 228);
|
|
80
|
+
border: 1px solid darkgray;
|
|
81
|
+
padding: 10px;
|
|
82
|
+
display: inline-block;
|
|
83
|
+
font-family: monospace;
|
|
84
|
+
max-width:980px;
|
|
85
|
+
word-wrap: normal;
|
|
86
|
+
white-space: pre-wrap;
|
|
87
|
+
word-wrap: break-word;
|
|
88
|
+
/*overflow-wrap: break-word;*/
|
|
89
|
+
}
|
|
90
|
+
</style>
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _get_css_for_single_column() -> str:
|
|
95
|
+
"""Return CSS styles for the single-column HTML document."""
|
|
96
|
+
return """<style>
|
|
97
|
+
html {
|
|
98
|
+
background-color: #f5f5f5;
|
|
99
|
+
font-family: Arial, sans-serif;
|
|
100
|
+
line-height: 1.6;
|
|
101
|
+
}
|
|
102
|
+
body {
|
|
103
|
+
max-width: 800px;
|
|
104
|
+
margin: 0 auto;
|
|
105
|
+
padding: 2rem;
|
|
106
|
+
background-color: white;
|
|
107
|
+
box-shadow: 0 0 10px rgba(0,0,0,0.1);
|
|
108
|
+
}
|
|
109
|
+
h1, h2, h3, h4, h5, h6 {
|
|
110
|
+
color: #333;
|
|
111
|
+
margin-top: 1.5em;
|
|
112
|
+
margin-bottom: 0.5em;
|
|
113
|
+
}
|
|
114
|
+
h1 {
|
|
115
|
+
font-size: 2em;
|
|
116
|
+
border-bottom: 1px solid #eee;
|
|
117
|
+
padding-bottom: 0.3em;
|
|
118
|
+
}
|
|
119
|
+
table {
|
|
120
|
+
border-collapse: collapse;
|
|
121
|
+
margin: 1em 0;
|
|
122
|
+
width: 100%;
|
|
123
|
+
}
|
|
124
|
+
th, td {
|
|
125
|
+
border: 1px solid #ddd;
|
|
126
|
+
padding: 8px;
|
|
127
|
+
text-align: left;
|
|
128
|
+
}
|
|
129
|
+
th {
|
|
130
|
+
background-color: #f2f2f2;
|
|
131
|
+
font-weight: bold;
|
|
132
|
+
}
|
|
133
|
+
figure {
|
|
134
|
+
margin: 1.5em 0;
|
|
135
|
+
text-align: center;
|
|
136
|
+
}
|
|
137
|
+
figcaption {
|
|
138
|
+
color: #666;
|
|
139
|
+
font-style: italic;
|
|
140
|
+
margin-top: 0.5em;
|
|
141
|
+
}
|
|
142
|
+
img {
|
|
143
|
+
max-width: 100%;
|
|
144
|
+
height: auto;
|
|
145
|
+
}
|
|
146
|
+
pre {
|
|
147
|
+
background-color: #f6f8fa;
|
|
148
|
+
border-radius: 3px;
|
|
149
|
+
padding: 1em;
|
|
150
|
+
overflow: auto;
|
|
151
|
+
}
|
|
152
|
+
code {
|
|
153
|
+
font-family: monospace;
|
|
154
|
+
background-color: #f6f8fa;
|
|
155
|
+
padding: 0.2em 0.4em;
|
|
156
|
+
border-radius: 3px;
|
|
157
|
+
}
|
|
158
|
+
pre code {
|
|
159
|
+
background-color: transparent;
|
|
160
|
+
padding: 0;
|
|
161
|
+
}
|
|
162
|
+
.formula {
|
|
163
|
+
text-align: center;
|
|
164
|
+
padding: 0.5em;
|
|
165
|
+
margin: 1em 0;
|
|
166
|
+
background-color: #f9f9f9;
|
|
167
|
+
}
|
|
168
|
+
.formula-not-decoded {
|
|
169
|
+
text-align: center;
|
|
170
|
+
padding: 0.5em;
|
|
171
|
+
margin: 1em 0;
|
|
172
|
+
background: repeating-linear-gradient(
|
|
173
|
+
45deg,
|
|
174
|
+
#f0f0f0,
|
|
175
|
+
#f0f0f0 10px,
|
|
176
|
+
#f9f9f9 10px,
|
|
177
|
+
#f9f9f9 20px
|
|
178
|
+
);
|
|
179
|
+
}
|
|
180
|
+
.page-break {
|
|
181
|
+
page-break-after: always;
|
|
182
|
+
border-top: 1px dashed #ccc;
|
|
183
|
+
margin: 2em 0;
|
|
184
|
+
}
|
|
185
|
+
.key-value-region {
|
|
186
|
+
background-color: #f9f9f9;
|
|
187
|
+
padding: 1em;
|
|
188
|
+
border-radius: 4px;
|
|
189
|
+
margin: 1em 0;
|
|
190
|
+
}
|
|
191
|
+
.key-value-region dt {
|
|
192
|
+
font-weight: bold;
|
|
193
|
+
}
|
|
194
|
+
.key-value-region dd {
|
|
195
|
+
margin-left: 1em;
|
|
196
|
+
margin-bottom: 0.5em;
|
|
197
|
+
}
|
|
198
|
+
.form-container {
|
|
199
|
+
border: 1px solid #ddd;
|
|
200
|
+
padding: 1em;
|
|
201
|
+
border-radius: 4px;
|
|
202
|
+
margin: 1em 0;
|
|
203
|
+
}
|
|
204
|
+
.form-item {
|
|
205
|
+
margin-bottom: 0.5em;
|
|
206
|
+
}
|
|
207
|
+
.image-classification {
|
|
208
|
+
font-size: 0.9em;
|
|
209
|
+
color: #666;
|
|
210
|
+
margin-top: 0.5em;
|
|
211
|
+
}
|
|
212
|
+
</style>"""
|
|
@@ -26,7 +26,11 @@ from docling_core.experimental.serializer.base import (
|
|
|
26
26
|
BaseTextSerializer,
|
|
27
27
|
SerializationResult,
|
|
28
28
|
)
|
|
29
|
-
from docling_core.experimental.serializer.common import
|
|
29
|
+
from docling_core.experimental.serializer.common import (
|
|
30
|
+
CommonParams,
|
|
31
|
+
DocSerializer,
|
|
32
|
+
create_ser_result,
|
|
33
|
+
)
|
|
30
34
|
from docling_core.types.doc.base import ImageRefMode
|
|
31
35
|
from docling_core.types.doc.document import (
|
|
32
36
|
CodeItem,
|
|
@@ -43,6 +47,7 @@ from docling_core.types.doc.document import (
|
|
|
43
47
|
NodeItem,
|
|
44
48
|
OrderedList,
|
|
45
49
|
PictureItem,
|
|
50
|
+
PictureTabularChartData,
|
|
46
51
|
SectionHeaderItem,
|
|
47
52
|
TableItem,
|
|
48
53
|
TextItem,
|
|
@@ -57,10 +62,12 @@ class MarkdownParams(CommonParams):
|
|
|
57
62
|
layers: set[ContentLayer] = {ContentLayer.BODY}
|
|
58
63
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
|
|
59
64
|
image_placeholder: str = "<!-- image -->"
|
|
65
|
+
enable_chart_tables: bool = True
|
|
60
66
|
indent: int = 4
|
|
61
67
|
wrap_width: Optional[PositiveInt] = None
|
|
62
68
|
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
|
|
63
69
|
escape_underscores: bool = True
|
|
70
|
+
escape_html: bool = True
|
|
64
71
|
|
|
65
72
|
|
|
66
73
|
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
@@ -78,46 +85,51 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
78
85
|
) -> SerializationResult:
|
|
79
86
|
"""Serializes the passed item."""
|
|
80
87
|
params = MarkdownParams(**kwargs)
|
|
81
|
-
|
|
88
|
+
res_parts: list[SerializationResult] = []
|
|
82
89
|
escape_html = True
|
|
83
90
|
escape_underscores = True
|
|
84
91
|
if isinstance(item, TitleItem):
|
|
85
|
-
|
|
92
|
+
text_part = f"# {item.text}"
|
|
86
93
|
elif isinstance(item, SectionHeaderItem):
|
|
87
|
-
|
|
94
|
+
text_part = f"{(item.level + 1) * '#'} {item.text}"
|
|
88
95
|
elif isinstance(item, CodeItem):
|
|
89
|
-
|
|
96
|
+
text_part = (
|
|
97
|
+
f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
98
|
+
)
|
|
90
99
|
escape_html = False
|
|
91
100
|
escape_underscores = False
|
|
92
101
|
elif isinstance(item, FormulaItem):
|
|
93
102
|
if item.text:
|
|
94
|
-
|
|
103
|
+
text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
|
|
95
104
|
elif item.orig:
|
|
96
|
-
|
|
105
|
+
text_part = "<!-- formula-not-decoded -->"
|
|
97
106
|
else:
|
|
98
|
-
|
|
107
|
+
text_part = ""
|
|
99
108
|
escape_html = False
|
|
100
109
|
escape_underscores = False
|
|
101
110
|
elif params.wrap_width:
|
|
102
|
-
|
|
111
|
+
text_part = textwrap.fill(item.text, width=params.wrap_width)
|
|
103
112
|
else:
|
|
104
|
-
|
|
105
|
-
|
|
113
|
+
text_part = item.text
|
|
114
|
+
|
|
115
|
+
if text_part:
|
|
116
|
+
text_res = create_ser_result(text=text_part, span_source=item)
|
|
117
|
+
res_parts.append(text_res)
|
|
106
118
|
|
|
107
119
|
if isinstance(item, FloatingItem):
|
|
108
|
-
|
|
109
|
-
if
|
|
110
|
-
|
|
120
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
121
|
+
if cap_res.text:
|
|
122
|
+
res_parts.append(cap_res)
|
|
111
123
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
text=
|
|
124
|
+
text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
|
|
125
|
+
text = doc_serializer.post_process(
|
|
126
|
+
text=text,
|
|
115
127
|
escape_html=escape_html,
|
|
116
128
|
escape_underscores=escape_underscores,
|
|
117
129
|
formatting=item.formatting,
|
|
118
130
|
hyperlink=item.hyperlink,
|
|
119
131
|
)
|
|
120
|
-
return
|
|
132
|
+
return create_ser_result(text=text, span_source=res_parts)
|
|
121
133
|
|
|
122
134
|
|
|
123
135
|
class MarkdownTableSerializer(BaseTableSerializer):
|
|
@@ -133,14 +145,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
133
145
|
**kwargs,
|
|
134
146
|
) -> SerializationResult:
|
|
135
147
|
"""Serializes the passed item."""
|
|
136
|
-
|
|
148
|
+
res_parts: list[SerializationResult] = []
|
|
137
149
|
|
|
138
150
|
cap_res = doc_serializer.serialize_captions(
|
|
139
151
|
item=item,
|
|
140
152
|
**kwargs,
|
|
141
153
|
)
|
|
142
154
|
if cap_res.text:
|
|
143
|
-
|
|
155
|
+
res_parts.append(cap_res)
|
|
144
156
|
|
|
145
157
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
146
158
|
rows = [
|
|
@@ -165,11 +177,11 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
165
177
|
else:
|
|
166
178
|
table_text = ""
|
|
167
179
|
if table_text:
|
|
168
|
-
|
|
180
|
+
res_parts.append(create_ser_result(text=table_text, span_source=item))
|
|
169
181
|
|
|
170
|
-
text_res = "\n\n".join(
|
|
182
|
+
text_res = "\n\n".join([r.text for r in res_parts])
|
|
171
183
|
|
|
172
|
-
return
|
|
184
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
173
185
|
|
|
174
186
|
|
|
175
187
|
class MarkdownPictureSerializer(BasePictureSerializer):
|
|
@@ -187,14 +199,14 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
187
199
|
"""Serializes the passed item."""
|
|
188
200
|
params = MarkdownParams(**kwargs)
|
|
189
201
|
|
|
190
|
-
|
|
202
|
+
res_parts: list[SerializationResult] = []
|
|
191
203
|
|
|
192
204
|
cap_res = doc_serializer.serialize_captions(
|
|
193
205
|
item=item,
|
|
194
206
|
**kwargs,
|
|
195
207
|
)
|
|
196
208
|
if cap_res.text:
|
|
197
|
-
|
|
209
|
+
res_parts.append(cap_res)
|
|
198
210
|
|
|
199
211
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
200
212
|
img_res = self._serialize_image_part(
|
|
@@ -204,11 +216,28 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
204
216
|
image_placeholder=params.image_placeholder,
|
|
205
217
|
)
|
|
206
218
|
if img_res.text:
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
219
|
+
res_parts.append(img_res)
|
|
220
|
+
|
|
221
|
+
if params.enable_chart_tables:
|
|
222
|
+
# Check if picture has attached PictureTabularChartData
|
|
223
|
+
tabular_chart_annotations = [
|
|
224
|
+
ann
|
|
225
|
+
for ann in item.annotations
|
|
226
|
+
if isinstance(ann, PictureTabularChartData)
|
|
227
|
+
]
|
|
228
|
+
if len(tabular_chart_annotations) > 0:
|
|
229
|
+
temp_doc = DoclingDocument(name="temp")
|
|
230
|
+
temp_table = temp_doc.add_table(
|
|
231
|
+
data=tabular_chart_annotations[0].chart_data
|
|
232
|
+
)
|
|
233
|
+
md_table_content = temp_table.export_to_markdown(temp_doc)
|
|
234
|
+
if len(md_table_content) > 0:
|
|
235
|
+
res_parts.append(
|
|
236
|
+
create_ser_result(text=md_table_content, span_source=item)
|
|
237
|
+
)
|
|
238
|
+
text_res = "\n\n".join([r.text for r in res_parts])
|
|
210
239
|
|
|
211
|
-
return
|
|
240
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
212
241
|
|
|
213
242
|
def _serialize_image_part(
|
|
214
243
|
self,
|
|
@@ -255,7 +284,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
255
284
|
else:
|
|
256
285
|
text_res = image_placeholder
|
|
257
286
|
|
|
258
|
-
return
|
|
287
|
+
return create_ser_result(text=text_res, span_source=item)
|
|
259
288
|
|
|
260
289
|
|
|
261
290
|
class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
|
|
@@ -272,12 +301,13 @@ class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
272
301
|
) -> SerializationResult:
|
|
273
302
|
"""Serializes the passed item."""
|
|
274
303
|
# TODO add actual implementation
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
304
|
+
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
305
|
+
return create_ser_result(
|
|
306
|
+
text="<!-- missing-key-value-item -->",
|
|
307
|
+
span_source=item,
|
|
308
|
+
)
|
|
309
|
+
else:
|
|
310
|
+
return create_ser_result()
|
|
281
311
|
|
|
282
312
|
|
|
283
313
|
class MarkdownFormSerializer(BaseFormSerializer):
|
|
@@ -294,12 +324,13 @@ class MarkdownFormSerializer(BaseFormSerializer):
|
|
|
294
324
|
) -> SerializationResult:
|
|
295
325
|
"""Serializes the passed item."""
|
|
296
326
|
# TODO add actual implementation
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
327
|
+
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
328
|
+
return create_ser_result(
|
|
329
|
+
text="<!-- missing-form-item -->",
|
|
330
|
+
span_source=item,
|
|
331
|
+
)
|
|
332
|
+
else:
|
|
333
|
+
return create_ser_result()
|
|
303
334
|
|
|
304
335
|
|
|
305
336
|
class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
@@ -319,7 +350,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
319
350
|
) -> SerializationResult:
|
|
320
351
|
"""Serializes the passed item."""
|
|
321
352
|
params = MarkdownParams(**kwargs)
|
|
322
|
-
my_visited = visited
|
|
353
|
+
my_visited = visited if visited is not None else set()
|
|
323
354
|
parts = doc_serializer.get_parts(
|
|
324
355
|
item=item,
|
|
325
356
|
list_level=list_level + 1,
|
|
@@ -332,6 +363,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
332
363
|
for p in parts:
|
|
333
364
|
if p.text and p.text[0] == " " and my_parts:
|
|
334
365
|
my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
|
|
366
|
+
my_parts[-1].spans.extend(p.spans)
|
|
335
367
|
else:
|
|
336
368
|
my_parts.append(p)
|
|
337
369
|
|
|
@@ -348,7 +380,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
348
380
|
for i, c in enumerate(my_parts)
|
|
349
381
|
]
|
|
350
382
|
)
|
|
351
|
-
return
|
|
383
|
+
return create_ser_result(text=text_res, span_source=my_parts)
|
|
352
384
|
|
|
353
385
|
|
|
354
386
|
class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
@@ -366,7 +398,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
|
366
398
|
**kwargs,
|
|
367
399
|
) -> SerializationResult:
|
|
368
400
|
"""Serializes the passed item."""
|
|
369
|
-
my_visited = visited
|
|
401
|
+
my_visited = visited if visited is not None else set()
|
|
370
402
|
parts = doc_serializer.get_parts(
|
|
371
403
|
item=item,
|
|
372
404
|
list_level=list_level,
|
|
@@ -374,7 +406,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
|
374
406
|
visited=my_visited,
|
|
375
407
|
)
|
|
376
408
|
text_res = " ".join([p.text for p in parts if p.text])
|
|
377
|
-
return
|
|
409
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
378
410
|
|
|
379
411
|
|
|
380
412
|
class MarkdownFallbackSerializer(BaseFallbackSerializer):
|
|
@@ -391,10 +423,12 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
|
|
|
391
423
|
) -> SerializationResult:
|
|
392
424
|
"""Serializes the passed item."""
|
|
393
425
|
if isinstance(item, DocItem):
|
|
394
|
-
|
|
426
|
+
return create_ser_result(
|
|
427
|
+
text="<!-- missing-text -->",
|
|
428
|
+
span_source=item,
|
|
429
|
+
)
|
|
395
430
|
else:
|
|
396
|
-
|
|
397
|
-
return SerializationResult(text=text_res)
|
|
431
|
+
return create_ser_result()
|
|
398
432
|
|
|
399
433
|
|
|
400
434
|
class MarkdownDocSerializer(DocSerializer):
|
|
@@ -472,7 +506,7 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
472
506
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
473
507
|
if escape_underscores and params.escape_underscores:
|
|
474
508
|
res = self._escape_underscores(text)
|
|
475
|
-
if escape_html:
|
|
509
|
+
if escape_html and params.escape_html:
|
|
476
510
|
res = html.escape(res, quote=False)
|
|
477
511
|
res = super().post_process(
|
|
478
512
|
text=res,
|
|
@@ -482,17 +516,21 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
482
516
|
return res
|
|
483
517
|
|
|
484
518
|
@override
|
|
485
|
-
def serialize_page(
|
|
519
|
+
def serialize_page(
|
|
520
|
+
self, *, parts: list[SerializationResult], **kwargs
|
|
521
|
+
) -> SerializationResult:
|
|
486
522
|
"""Serialize a page out of its parts."""
|
|
487
|
-
text_res = "\n\n".join([p.text for p in parts])
|
|
488
|
-
return
|
|
523
|
+
text_res = "\n\n".join([p.text for p in parts if p.text])
|
|
524
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
489
525
|
|
|
490
526
|
@override
|
|
491
|
-
def serialize_doc(
|
|
527
|
+
def serialize_doc(
|
|
528
|
+
self, *, pages: dict[Optional[int], SerializationResult], **kwargs
|
|
529
|
+
) -> SerializationResult:
|
|
492
530
|
"""Serialize a document out of its pages."""
|
|
493
531
|
if self.params.page_break_placeholder is not None:
|
|
494
532
|
sep = f"\n\n{self.params.page_break_placeholder}\n\n"
|
|
495
|
-
text_res = sep.join([
|
|
496
|
-
return
|
|
533
|
+
text_res = sep.join([text for k in pages if (text := pages[k].text)])
|
|
534
|
+
return create_ser_result(text=text_res, span_source=list(pages.values()))
|
|
497
535
|
else:
|
|
498
|
-
return self.serialize_page(parts=pages)
|
|
536
|
+
return self.serialize_page(parts=list(pages.values()))
|
|
@@ -9,6 +9,7 @@ from abc import ABC, abstractmethod
|
|
|
9
9
|
from typing import Any, ClassVar, Iterator
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel
|
|
12
|
+
from typing_extensions import deprecated
|
|
12
13
|
|
|
13
14
|
from docling_core.types.doc import DoclingDocument as DLDocument
|
|
14
15
|
|
|
@@ -65,8 +66,8 @@ class BaseChunker(BaseModel, ABC):
|
|
|
65
66
|
"""
|
|
66
67
|
raise NotImplementedError()
|
|
67
68
|
|
|
68
|
-
def
|
|
69
|
-
"""
|
|
69
|
+
def contextualize(self, chunk: BaseChunk) -> str:
|
|
70
|
+
"""Contextualize the given chunk. This implementation is embedding-targeted.
|
|
70
71
|
|
|
71
72
|
Args:
|
|
72
73
|
chunk: chunk to serialize
|
|
@@ -93,3 +94,8 @@ class BaseChunker(BaseModel, ABC):
|
|
|
93
94
|
items.append(chunk.text)
|
|
94
95
|
|
|
95
96
|
return self.delim.join(items)
|
|
97
|
+
|
|
98
|
+
@deprecated("Use contextualize() instead.")
|
|
99
|
+
def serialize(self, chunk: BaseChunk) -> str:
|
|
100
|
+
"""Contextualize the given chunk. This implementation is embedding-targeted."""
|
|
101
|
+
return self.contextualize(chunk=chunk)
|