docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +29 -3
- docling_core/experimental/serializer/common.py +157 -71
- docling_core/experimental/serializer/doctags.py +88 -54
- docling_core/experimental/serializer/html.py +941 -0
- docling_core/experimental/serializer/html_styles.py +212 -0
- docling_core/experimental/serializer/markdown.py +105 -63
- docling_core/transforms/chunker/base.py +8 -2
- docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
- docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- docling_core/types/doc/document.py +702 -482
- docling_core/types/doc/labels.py +2 -0
- docling_core/types/doc/page.py +12 -17
- docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/METADATA +1 -1
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/RECORD +18 -16
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/LICENSE +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/WHEEL +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""HTML styles for different export modes."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _get_css_with_no_styling() -> str:
|
|
5
|
+
"""Return default CSS styles for the HTML document."""
|
|
6
|
+
return "<style></style>"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_css_for_split_page() -> str:
|
|
10
|
+
"""Return default CSS styles for the HTML document."""
|
|
11
|
+
return """<style>
|
|
12
|
+
html {
|
|
13
|
+
background-color: #e1e1e1;
|
|
14
|
+
font-family: Arial, sans-serif;
|
|
15
|
+
line-height: 1.6;
|
|
16
|
+
}
|
|
17
|
+
img {
|
|
18
|
+
min-width: 500px;
|
|
19
|
+
max-width: 100%;
|
|
20
|
+
}
|
|
21
|
+
table {
|
|
22
|
+
border-collapse: collapse;
|
|
23
|
+
border: 0px solid #fff;
|
|
24
|
+
width: 100%;
|
|
25
|
+
}
|
|
26
|
+
td {
|
|
27
|
+
vertical-align: top;
|
|
28
|
+
}
|
|
29
|
+
.page {
|
|
30
|
+
background-color: white;
|
|
31
|
+
margin-top:15px;
|
|
32
|
+
padding: 30px;
|
|
33
|
+
border: 1px solid black;
|
|
34
|
+
width:100%;
|
|
35
|
+
max-width:1000px;
|
|
36
|
+
box-shadow: 0 0 10px rgba(0,0,0,0.5);
|
|
37
|
+
}
|
|
38
|
+
.page figure {
|
|
39
|
+
text-align: center;
|
|
40
|
+
}
|
|
41
|
+
.page img {
|
|
42
|
+
max-width: 900px;
|
|
43
|
+
min-width: auto;
|
|
44
|
+
}
|
|
45
|
+
.page table {
|
|
46
|
+
border-collapse: collapse;
|
|
47
|
+
margin: 1em 0;
|
|
48
|
+
width: 100%;
|
|
49
|
+
}
|
|
50
|
+
.page table td {
|
|
51
|
+
border: 1px solid #ddd;
|
|
52
|
+
padding: 8px;
|
|
53
|
+
text-align: left;
|
|
54
|
+
}
|
|
55
|
+
.page table th {
|
|
56
|
+
border: 1px solid #ddd;
|
|
57
|
+
padding: 8px;
|
|
58
|
+
text-align: left;
|
|
59
|
+
background-color: #f2f2f2;
|
|
60
|
+
font-weight: bold;
|
|
61
|
+
}
|
|
62
|
+
.page table caption {
|
|
63
|
+
color: #666;
|
|
64
|
+
font-style: italic;
|
|
65
|
+
margin-top: 0.5em;
|
|
66
|
+
padding: 8px;
|
|
67
|
+
margin-top: 5px;
|
|
68
|
+
margin-bottom: 5px;
|
|
69
|
+
}
|
|
70
|
+
.page figcaption {
|
|
71
|
+
color: #666;
|
|
72
|
+
font-style: italic;
|
|
73
|
+
margin-top: 0.5em;
|
|
74
|
+
padding: 8px;
|
|
75
|
+
margin-top: 5px;
|
|
76
|
+
margin-bottom: 5px;
|
|
77
|
+
}
|
|
78
|
+
code {
|
|
79
|
+
background-color: rgb(228, 228, 228);
|
|
80
|
+
border: 1px solid darkgray;
|
|
81
|
+
padding: 10px;
|
|
82
|
+
display: inline-block;
|
|
83
|
+
font-family: monospace;
|
|
84
|
+
max-width:980px;
|
|
85
|
+
word-wrap: normal;
|
|
86
|
+
white-space: pre-wrap;
|
|
87
|
+
word-wrap: break-word;
|
|
88
|
+
/*overflow-wrap: break-word;*/
|
|
89
|
+
}
|
|
90
|
+
</style>
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _get_css_for_single_column() -> str:
|
|
95
|
+
"""Return CSS styles for the single-column HTML document."""
|
|
96
|
+
return """<style>
|
|
97
|
+
html {
|
|
98
|
+
background-color: #f5f5f5;
|
|
99
|
+
font-family: Arial, sans-serif;
|
|
100
|
+
line-height: 1.6;
|
|
101
|
+
}
|
|
102
|
+
body {
|
|
103
|
+
max-width: 800px;
|
|
104
|
+
margin: 0 auto;
|
|
105
|
+
padding: 2rem;
|
|
106
|
+
background-color: white;
|
|
107
|
+
box-shadow: 0 0 10px rgba(0,0,0,0.1);
|
|
108
|
+
}
|
|
109
|
+
h1, h2, h3, h4, h5, h6 {
|
|
110
|
+
color: #333;
|
|
111
|
+
margin-top: 1.5em;
|
|
112
|
+
margin-bottom: 0.5em;
|
|
113
|
+
}
|
|
114
|
+
h1 {
|
|
115
|
+
font-size: 2em;
|
|
116
|
+
border-bottom: 1px solid #eee;
|
|
117
|
+
padding-bottom: 0.3em;
|
|
118
|
+
}
|
|
119
|
+
table {
|
|
120
|
+
border-collapse: collapse;
|
|
121
|
+
margin: 1em 0;
|
|
122
|
+
width: 100%;
|
|
123
|
+
}
|
|
124
|
+
th, td {
|
|
125
|
+
border: 1px solid #ddd;
|
|
126
|
+
padding: 8px;
|
|
127
|
+
text-align: left;
|
|
128
|
+
}
|
|
129
|
+
th {
|
|
130
|
+
background-color: #f2f2f2;
|
|
131
|
+
font-weight: bold;
|
|
132
|
+
}
|
|
133
|
+
figure {
|
|
134
|
+
margin: 1.5em 0;
|
|
135
|
+
text-align: center;
|
|
136
|
+
}
|
|
137
|
+
figcaption {
|
|
138
|
+
color: #666;
|
|
139
|
+
font-style: italic;
|
|
140
|
+
margin-top: 0.5em;
|
|
141
|
+
}
|
|
142
|
+
img {
|
|
143
|
+
max-width: 100%;
|
|
144
|
+
height: auto;
|
|
145
|
+
}
|
|
146
|
+
pre {
|
|
147
|
+
background-color: #f6f8fa;
|
|
148
|
+
border-radius: 3px;
|
|
149
|
+
padding: 1em;
|
|
150
|
+
overflow: auto;
|
|
151
|
+
}
|
|
152
|
+
code {
|
|
153
|
+
font-family: monospace;
|
|
154
|
+
background-color: #f6f8fa;
|
|
155
|
+
padding: 0.2em 0.4em;
|
|
156
|
+
border-radius: 3px;
|
|
157
|
+
}
|
|
158
|
+
pre code {
|
|
159
|
+
background-color: transparent;
|
|
160
|
+
padding: 0;
|
|
161
|
+
}
|
|
162
|
+
.formula {
|
|
163
|
+
text-align: center;
|
|
164
|
+
padding: 0.5em;
|
|
165
|
+
margin: 1em 0;
|
|
166
|
+
background-color: #f9f9f9;
|
|
167
|
+
}
|
|
168
|
+
.formula-not-decoded {
|
|
169
|
+
text-align: center;
|
|
170
|
+
padding: 0.5em;
|
|
171
|
+
margin: 1em 0;
|
|
172
|
+
background: repeating-linear-gradient(
|
|
173
|
+
45deg,
|
|
174
|
+
#f0f0f0,
|
|
175
|
+
#f0f0f0 10px,
|
|
176
|
+
#f9f9f9 10px,
|
|
177
|
+
#f9f9f9 20px
|
|
178
|
+
);
|
|
179
|
+
}
|
|
180
|
+
.page-break {
|
|
181
|
+
page-break-after: always;
|
|
182
|
+
border-top: 1px dashed #ccc;
|
|
183
|
+
margin: 2em 0;
|
|
184
|
+
}
|
|
185
|
+
.key-value-region {
|
|
186
|
+
background-color: #f9f9f9;
|
|
187
|
+
padding: 1em;
|
|
188
|
+
border-radius: 4px;
|
|
189
|
+
margin: 1em 0;
|
|
190
|
+
}
|
|
191
|
+
.key-value-region dt {
|
|
192
|
+
font-weight: bold;
|
|
193
|
+
}
|
|
194
|
+
.key-value-region dd {
|
|
195
|
+
margin-left: 1em;
|
|
196
|
+
margin-bottom: 0.5em;
|
|
197
|
+
}
|
|
198
|
+
.form-container {
|
|
199
|
+
border: 1px solid #ddd;
|
|
200
|
+
padding: 1em;
|
|
201
|
+
border-radius: 4px;
|
|
202
|
+
margin: 1em 0;
|
|
203
|
+
}
|
|
204
|
+
.form-item {
|
|
205
|
+
margin-bottom: 0.5em;
|
|
206
|
+
}
|
|
207
|
+
.image-classification {
|
|
208
|
+
font-size: 0.9em;
|
|
209
|
+
color: #666;
|
|
210
|
+
margin-top: 0.5em;
|
|
211
|
+
}
|
|
212
|
+
</style>"""
|
|
@@ -26,7 +26,12 @@ from docling_core.experimental.serializer.base import (
|
|
|
26
26
|
BaseTextSerializer,
|
|
27
27
|
SerializationResult,
|
|
28
28
|
)
|
|
29
|
-
from docling_core.experimental.serializer.common import
|
|
29
|
+
from docling_core.experimental.serializer.common import (
|
|
30
|
+
CommonParams,
|
|
31
|
+
DocSerializer,
|
|
32
|
+
_PageBreakSerResult,
|
|
33
|
+
create_ser_result,
|
|
34
|
+
)
|
|
30
35
|
from docling_core.types.doc.base import ImageRefMode
|
|
31
36
|
from docling_core.types.doc.document import (
|
|
32
37
|
CodeItem,
|
|
@@ -43,6 +48,7 @@ from docling_core.types.doc.document import (
|
|
|
43
48
|
NodeItem,
|
|
44
49
|
OrderedList,
|
|
45
50
|
PictureItem,
|
|
51
|
+
PictureTabularChartData,
|
|
46
52
|
SectionHeaderItem,
|
|
47
53
|
TableItem,
|
|
48
54
|
TextItem,
|
|
@@ -57,10 +63,12 @@ class MarkdownParams(CommonParams):
|
|
|
57
63
|
layers: set[ContentLayer] = {ContentLayer.BODY}
|
|
58
64
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
|
|
59
65
|
image_placeholder: str = "<!-- image -->"
|
|
66
|
+
enable_chart_tables: bool = True
|
|
60
67
|
indent: int = 4
|
|
61
68
|
wrap_width: Optional[PositiveInt] = None
|
|
62
69
|
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
|
|
63
70
|
escape_underscores: bool = True
|
|
71
|
+
escape_html: bool = True
|
|
64
72
|
|
|
65
73
|
|
|
66
74
|
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
@@ -78,46 +86,51 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
78
86
|
) -> SerializationResult:
|
|
79
87
|
"""Serializes the passed item."""
|
|
80
88
|
params = MarkdownParams(**kwargs)
|
|
81
|
-
|
|
89
|
+
res_parts: list[SerializationResult] = []
|
|
82
90
|
escape_html = True
|
|
83
91
|
escape_underscores = True
|
|
84
92
|
if isinstance(item, TitleItem):
|
|
85
|
-
|
|
93
|
+
text_part = f"# {item.text}"
|
|
86
94
|
elif isinstance(item, SectionHeaderItem):
|
|
87
|
-
|
|
95
|
+
text_part = f"{(item.level + 1) * '#'} {item.text}"
|
|
88
96
|
elif isinstance(item, CodeItem):
|
|
89
|
-
|
|
97
|
+
text_part = (
|
|
98
|
+
f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
99
|
+
)
|
|
90
100
|
escape_html = False
|
|
91
101
|
escape_underscores = False
|
|
92
102
|
elif isinstance(item, FormulaItem):
|
|
93
103
|
if item.text:
|
|
94
|
-
|
|
104
|
+
text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
|
|
95
105
|
elif item.orig:
|
|
96
|
-
|
|
106
|
+
text_part = "<!-- formula-not-decoded -->"
|
|
97
107
|
else:
|
|
98
|
-
|
|
108
|
+
text_part = ""
|
|
99
109
|
escape_html = False
|
|
100
110
|
escape_underscores = False
|
|
101
111
|
elif params.wrap_width:
|
|
102
|
-
|
|
112
|
+
text_part = textwrap.fill(item.text, width=params.wrap_width)
|
|
103
113
|
else:
|
|
104
|
-
|
|
105
|
-
|
|
114
|
+
text_part = item.text
|
|
115
|
+
|
|
116
|
+
if text_part:
|
|
117
|
+
text_res = create_ser_result(text=text_part, span_source=item)
|
|
118
|
+
res_parts.append(text_res)
|
|
106
119
|
|
|
107
120
|
if isinstance(item, FloatingItem):
|
|
108
|
-
|
|
109
|
-
if
|
|
110
|
-
|
|
121
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
122
|
+
if cap_res.text:
|
|
123
|
+
res_parts.append(cap_res)
|
|
111
124
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
text=
|
|
125
|
+
text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
|
|
126
|
+
text = doc_serializer.post_process(
|
|
127
|
+
text=text,
|
|
115
128
|
escape_html=escape_html,
|
|
116
129
|
escape_underscores=escape_underscores,
|
|
117
130
|
formatting=item.formatting,
|
|
118
131
|
hyperlink=item.hyperlink,
|
|
119
132
|
)
|
|
120
|
-
return
|
|
133
|
+
return create_ser_result(text=text, span_source=res_parts)
|
|
121
134
|
|
|
122
135
|
|
|
123
136
|
class MarkdownTableSerializer(BaseTableSerializer):
|
|
@@ -133,14 +146,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
133
146
|
**kwargs,
|
|
134
147
|
) -> SerializationResult:
|
|
135
148
|
"""Serializes the passed item."""
|
|
136
|
-
|
|
149
|
+
res_parts: list[SerializationResult] = []
|
|
137
150
|
|
|
138
151
|
cap_res = doc_serializer.serialize_captions(
|
|
139
152
|
item=item,
|
|
140
153
|
**kwargs,
|
|
141
154
|
)
|
|
142
155
|
if cap_res.text:
|
|
143
|
-
|
|
156
|
+
res_parts.append(cap_res)
|
|
144
157
|
|
|
145
158
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
146
159
|
rows = [
|
|
@@ -165,11 +178,11 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
165
178
|
else:
|
|
166
179
|
table_text = ""
|
|
167
180
|
if table_text:
|
|
168
|
-
|
|
181
|
+
res_parts.append(create_ser_result(text=table_text, span_source=item))
|
|
169
182
|
|
|
170
|
-
text_res = "\n\n".join(
|
|
183
|
+
text_res = "\n\n".join([r.text for r in res_parts])
|
|
171
184
|
|
|
172
|
-
return
|
|
185
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
173
186
|
|
|
174
187
|
|
|
175
188
|
class MarkdownPictureSerializer(BasePictureSerializer):
|
|
@@ -187,14 +200,14 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
187
200
|
"""Serializes the passed item."""
|
|
188
201
|
params = MarkdownParams(**kwargs)
|
|
189
202
|
|
|
190
|
-
|
|
203
|
+
res_parts: list[SerializationResult] = []
|
|
191
204
|
|
|
192
205
|
cap_res = doc_serializer.serialize_captions(
|
|
193
206
|
item=item,
|
|
194
207
|
**kwargs,
|
|
195
208
|
)
|
|
196
209
|
if cap_res.text:
|
|
197
|
-
|
|
210
|
+
res_parts.append(cap_res)
|
|
198
211
|
|
|
199
212
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
200
213
|
img_res = self._serialize_image_part(
|
|
@@ -204,11 +217,28 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
204
217
|
image_placeholder=params.image_placeholder,
|
|
205
218
|
)
|
|
206
219
|
if img_res.text:
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
220
|
+
res_parts.append(img_res)
|
|
221
|
+
|
|
222
|
+
if params.enable_chart_tables:
|
|
223
|
+
# Check if picture has attached PictureTabularChartData
|
|
224
|
+
tabular_chart_annotations = [
|
|
225
|
+
ann
|
|
226
|
+
for ann in item.annotations
|
|
227
|
+
if isinstance(ann, PictureTabularChartData)
|
|
228
|
+
]
|
|
229
|
+
if len(tabular_chart_annotations) > 0:
|
|
230
|
+
temp_doc = DoclingDocument(name="temp")
|
|
231
|
+
temp_table = temp_doc.add_table(
|
|
232
|
+
data=tabular_chart_annotations[0].chart_data
|
|
233
|
+
)
|
|
234
|
+
md_table_content = temp_table.export_to_markdown(temp_doc)
|
|
235
|
+
if len(md_table_content) > 0:
|
|
236
|
+
res_parts.append(
|
|
237
|
+
create_ser_result(text=md_table_content, span_source=item)
|
|
238
|
+
)
|
|
239
|
+
text_res = "\n\n".join([r.text for r in res_parts])
|
|
210
240
|
|
|
211
|
-
return
|
|
241
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
212
242
|
|
|
213
243
|
def _serialize_image_part(
|
|
214
244
|
self,
|
|
@@ -255,7 +285,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
255
285
|
else:
|
|
256
286
|
text_res = image_placeholder
|
|
257
287
|
|
|
258
|
-
return
|
|
288
|
+
return create_ser_result(text=text_res, span_source=item)
|
|
259
289
|
|
|
260
290
|
|
|
261
291
|
class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
|
|
@@ -272,12 +302,13 @@ class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
272
302
|
) -> SerializationResult:
|
|
273
303
|
"""Serializes the passed item."""
|
|
274
304
|
# TODO add actual implementation
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
305
|
+
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
306
|
+
return create_ser_result(
|
|
307
|
+
text="<!-- missing-key-value-item -->",
|
|
308
|
+
span_source=item,
|
|
309
|
+
)
|
|
310
|
+
else:
|
|
311
|
+
return create_ser_result()
|
|
281
312
|
|
|
282
313
|
|
|
283
314
|
class MarkdownFormSerializer(BaseFormSerializer):
|
|
@@ -294,12 +325,13 @@ class MarkdownFormSerializer(BaseFormSerializer):
|
|
|
294
325
|
) -> SerializationResult:
|
|
295
326
|
"""Serializes the passed item."""
|
|
296
327
|
# TODO add actual implementation
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
328
|
+
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
329
|
+
return create_ser_result(
|
|
330
|
+
text="<!-- missing-form-item -->",
|
|
331
|
+
span_source=item,
|
|
332
|
+
)
|
|
333
|
+
else:
|
|
334
|
+
return create_ser_result()
|
|
303
335
|
|
|
304
336
|
|
|
305
337
|
class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
@@ -319,7 +351,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
319
351
|
) -> SerializationResult:
|
|
320
352
|
"""Serializes the passed item."""
|
|
321
353
|
params = MarkdownParams(**kwargs)
|
|
322
|
-
my_visited = visited
|
|
354
|
+
my_visited = visited if visited is not None else set()
|
|
323
355
|
parts = doc_serializer.get_parts(
|
|
324
356
|
item=item,
|
|
325
357
|
list_level=list_level + 1,
|
|
@@ -332,6 +364,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
332
364
|
for p in parts:
|
|
333
365
|
if p.text and p.text[0] == " " and my_parts:
|
|
334
366
|
my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
|
|
367
|
+
my_parts[-1].spans.extend(p.spans)
|
|
335
368
|
else:
|
|
336
369
|
my_parts.append(p)
|
|
337
370
|
|
|
@@ -343,12 +376,16 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
343
376
|
(
|
|
344
377
|
c.text
|
|
345
378
|
if c.text and c.text[0] == " "
|
|
346
|
-
else
|
|
379
|
+
else (
|
|
380
|
+
f"{indent_str}"
|
|
381
|
+
f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}" # noqa: E501
|
|
382
|
+
f"{c.text}"
|
|
383
|
+
)
|
|
347
384
|
)
|
|
348
385
|
for i, c in enumerate(my_parts)
|
|
349
386
|
]
|
|
350
387
|
)
|
|
351
|
-
return
|
|
388
|
+
return create_ser_result(text=text_res, span_source=my_parts)
|
|
352
389
|
|
|
353
390
|
|
|
354
391
|
class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
@@ -366,15 +403,16 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
|
366
403
|
**kwargs,
|
|
367
404
|
) -> SerializationResult:
|
|
368
405
|
"""Serializes the passed item."""
|
|
369
|
-
my_visited = visited
|
|
406
|
+
my_visited = visited if visited is not None else set()
|
|
370
407
|
parts = doc_serializer.get_parts(
|
|
371
408
|
item=item,
|
|
372
409
|
list_level=list_level,
|
|
373
410
|
is_inline_scope=True,
|
|
374
411
|
visited=my_visited,
|
|
412
|
+
**kwargs,
|
|
375
413
|
)
|
|
376
414
|
text_res = " ".join([p.text for p in parts if p.text])
|
|
377
|
-
return
|
|
415
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
378
416
|
|
|
379
417
|
|
|
380
418
|
class MarkdownFallbackSerializer(BaseFallbackSerializer):
|
|
@@ -391,10 +429,12 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
|
|
|
391
429
|
) -> SerializationResult:
|
|
392
430
|
"""Serializes the passed item."""
|
|
393
431
|
if isinstance(item, DocItem):
|
|
394
|
-
|
|
432
|
+
return create_ser_result(
|
|
433
|
+
text="<!-- missing-text -->",
|
|
434
|
+
span_source=item,
|
|
435
|
+
)
|
|
395
436
|
else:
|
|
396
|
-
|
|
397
|
-
return SerializationResult(text=text_res)
|
|
437
|
+
return create_ser_result()
|
|
398
438
|
|
|
399
439
|
|
|
400
440
|
class MarkdownDocSerializer(DocSerializer):
|
|
@@ -472,7 +512,7 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
472
512
|
params = self.params.merge_with_patch(patch=kwargs)
|
|
473
513
|
if escape_underscores and params.escape_underscores:
|
|
474
514
|
res = self._escape_underscores(text)
|
|
475
|
-
if escape_html:
|
|
515
|
+
if escape_html and params.escape_html:
|
|
476
516
|
res = html.escape(res, quote=False)
|
|
477
517
|
res = super().post_process(
|
|
478
518
|
text=res,
|
|
@@ -482,17 +522,19 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
482
522
|
return res
|
|
483
523
|
|
|
484
524
|
@override
|
|
485
|
-
def
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
525
|
+
def serialize_doc(
|
|
526
|
+
self, *, parts: list[SerializationResult], **kwargs
|
|
527
|
+
) -> SerializationResult:
|
|
528
|
+
"""Serialize a document out of its parts."""
|
|
529
|
+
text_res = "\n\n".join([p.text for p in parts if p.text])
|
|
530
|
+
if self.params.page_break_placeholder:
|
|
531
|
+
page_sep = self.params.page_break_placeholder or ""
|
|
532
|
+
for full_match, _, _ in self._get_page_breaks(text=text_res):
|
|
533
|
+
text_res = text_res.replace(full_match, page_sep)
|
|
534
|
+
|
|
535
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
489
536
|
|
|
490
537
|
@override
|
|
491
|
-
def
|
|
492
|
-
"""
|
|
493
|
-
|
|
494
|
-
sep = f"\n\n{self.params.page_break_placeholder}\n\n"
|
|
495
|
-
text_res = sep.join([p.text for p in pages if p.text])
|
|
496
|
-
return SerializationResult(text=text_res)
|
|
497
|
-
else:
|
|
498
|
-
return self.serialize_page(parts=pages)
|
|
538
|
+
def requires_page_break(self):
|
|
539
|
+
"""Whether to add page breaks."""
|
|
540
|
+
return self.params.page_break_placeholder is not None
|
|
@@ -9,6 +9,7 @@ from abc import ABC, abstractmethod
|
|
|
9
9
|
from typing import Any, ClassVar, Iterator
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel
|
|
12
|
+
from typing_extensions import deprecated
|
|
12
13
|
|
|
13
14
|
from docling_core.types.doc import DoclingDocument as DLDocument
|
|
14
15
|
|
|
@@ -65,8 +66,8 @@ class BaseChunker(BaseModel, ABC):
|
|
|
65
66
|
"""
|
|
66
67
|
raise NotImplementedError()
|
|
67
68
|
|
|
68
|
-
def
|
|
69
|
-
"""
|
|
69
|
+
def contextualize(self, chunk: BaseChunk) -> str:
|
|
70
|
+
"""Contextualize the given chunk. This implementation is embedding-targeted.
|
|
70
71
|
|
|
71
72
|
Args:
|
|
72
73
|
chunk: chunk to serialize
|
|
@@ -93,3 +94,8 @@ class BaseChunker(BaseModel, ABC):
|
|
|
93
94
|
items.append(chunk.text)
|
|
94
95
|
|
|
95
96
|
return self.delim.join(items)
|
|
97
|
+
|
|
98
|
+
@deprecated("Use contextualize() instead.")
|
|
99
|
+
def serialize(self, chunk: BaseChunk) -> str:
|
|
100
|
+
"""Contextualize the given chunk. This implementation is embedding-targeted."""
|
|
101
|
+
return self.contextualize(chunk=chunk)
|