docling-core 2.47.0__py3-none-any.whl → 2.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hierarchical_chunker.py +1 -1
- docling_core/transforms/serializer/common.py +1 -0
- docling_core/transforms/serializer/doctags.py +25 -9
- docling_core/transforms/serializer/html.py +89 -84
- docling_core/transforms/serializer/markdown.py +23 -21
- docling_core/types/doc/document.py +2 -1
- {docling_core-2.47.0.dist-info → docling_core-2.48.0.dist-info}/METADATA +1 -1
- {docling_core-2.47.0.dist-info → docling_core-2.48.0.dist-info}/RECORD +12 -12
- {docling_core-2.47.0.dist-info → docling_core-2.48.0.dist-info}/WHEEL +0 -0
- {docling_core-2.47.0.dist-info → docling_core-2.48.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.47.0.dist-info → docling_core-2.48.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.47.0.dist-info → docling_core-2.48.0.dist-info}/top_level.txt +0 -0
|
@@ -145,7 +145,7 @@ class TripletTableSerializer(BaseTableSerializer):
|
|
|
145
145
|
parts.append(cap_res)
|
|
146
146
|
|
|
147
147
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
148
|
-
table_df = item.export_to_dataframe()
|
|
148
|
+
table_df = item.export_to_dataframe(doc)
|
|
149
149
|
if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
|
|
150
150
|
|
|
151
151
|
# copy header as first row and shift all rows by one
|
|
@@ -32,6 +32,7 @@ from docling_core.types.doc.document import (
|
|
|
32
32
|
DoclingDocument,
|
|
33
33
|
FloatingItem,
|
|
34
34
|
FormItem,
|
|
35
|
+
GroupItem,
|
|
35
36
|
InlineGroup,
|
|
36
37
|
KeyValueItem,
|
|
37
38
|
ListGroup,
|
|
@@ -42,6 +43,7 @@ from docling_core.types.doc.document import (
|
|
|
42
43
|
PictureMoleculeData,
|
|
43
44
|
PictureTabularChartData,
|
|
44
45
|
ProvenanceItem,
|
|
46
|
+
SectionHeaderItem,
|
|
45
47
|
TableItem,
|
|
46
48
|
TextItem,
|
|
47
49
|
)
|
|
@@ -94,11 +96,11 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
94
96
|
item: TextItem,
|
|
95
97
|
doc_serializer: BaseDocSerializer,
|
|
96
98
|
doc: DoclingDocument,
|
|
99
|
+
visited: Optional[set[str]] = None,
|
|
97
100
|
**kwargs: Any,
|
|
98
101
|
) -> SerializationResult:
|
|
99
102
|
"""Serializes the passed item."""
|
|
100
|
-
|
|
101
|
-
|
|
103
|
+
my_visited = visited if visited is not None else set()
|
|
102
104
|
params = DocTagsParams(**kwargs)
|
|
103
105
|
wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
|
|
104
106
|
label=item.label,
|
|
@@ -116,12 +118,21 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
116
118
|
parts.append(location)
|
|
117
119
|
|
|
118
120
|
if params.add_content:
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
121
|
+
if (
|
|
122
|
+
item.text == ""
|
|
123
|
+
and len(item.children) == 1
|
|
124
|
+
and isinstance(
|
|
125
|
+
(child_group := item.children[0].resolve(doc)), InlineGroup
|
|
126
|
+
)
|
|
127
|
+
):
|
|
128
|
+
ser_res = doc_serializer.serialize(item=child_group, visited=my_visited)
|
|
129
|
+
text_part = ser_res.text
|
|
130
|
+
else:
|
|
131
|
+
text_part = doc_serializer.post_process(
|
|
132
|
+
text=item.text,
|
|
133
|
+
formatting=item.formatting,
|
|
134
|
+
hyperlink=item.hyperlink,
|
|
135
|
+
)
|
|
125
136
|
|
|
126
137
|
if isinstance(item, CodeItem):
|
|
127
138
|
language_token = DocumentToken.get_code_language_token(
|
|
@@ -506,7 +517,12 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
506
517
|
**kwargs: Any,
|
|
507
518
|
) -> SerializationResult:
|
|
508
519
|
"""Serializes the passed item."""
|
|
509
|
-
|
|
520
|
+
if isinstance(item, GroupItem):
|
|
521
|
+
parts = doc_serializer.get_parts(item=item, **kwargs)
|
|
522
|
+
text_res = "\n".join([p.text for p in parts if p.text])
|
|
523
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
524
|
+
else:
|
|
525
|
+
return create_ser_result()
|
|
510
526
|
|
|
511
527
|
|
|
512
528
|
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
|
|
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
|
|
|
55
55
|
FormItem,
|
|
56
56
|
FormulaItem,
|
|
57
57
|
GraphData,
|
|
58
|
+
GroupItem,
|
|
58
59
|
ImageRef,
|
|
59
60
|
InlineGroup,
|
|
60
61
|
KeyValueItem,
|
|
@@ -139,21 +140,34 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
139
140
|
res_parts: list[SerializationResult] = []
|
|
140
141
|
post_processed = False
|
|
141
142
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
143
|
+
has_inline_repr = (
|
|
144
|
+
item.text == ""
|
|
145
|
+
and len(item.children) == 1
|
|
146
|
+
and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
|
|
147
|
+
)
|
|
148
|
+
if has_inline_repr:
|
|
149
|
+
text = doc_serializer.serialize(item=child_group, visited=my_visited).text
|
|
150
|
+
post_processed = True
|
|
151
|
+
else:
|
|
152
|
+
text = item.text
|
|
153
|
+
if not isinstance(item, (CodeItem, FormulaItem)):
|
|
154
|
+
text = html.escape(text, quote=False)
|
|
155
|
+
text = text.replace("\n", "<br>")
|
|
146
156
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
157
|
+
# Prepare the HTML based on item type
|
|
158
|
+
if isinstance(item, (TitleItem, SectionHeaderItem)):
|
|
159
|
+
section_level = (
|
|
160
|
+
min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
|
|
161
|
+
)
|
|
150
162
|
text = get_html_tag_with_text_direction(
|
|
151
|
-
html_tag=f"h{section_level}", text=
|
|
163
|
+
html_tag=f"h{section_level}", text=text
|
|
152
164
|
)
|
|
153
165
|
|
|
154
166
|
elif isinstance(item, FormulaItem):
|
|
155
167
|
text = self._process_formula(
|
|
156
168
|
item=item,
|
|
169
|
+
text=text,
|
|
170
|
+
orig=item.orig,
|
|
157
171
|
doc=doc,
|
|
158
172
|
image_mode=params.image_mode,
|
|
159
173
|
formula_to_mathml=params.formula_to_mathml,
|
|
@@ -161,19 +175,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
161
175
|
)
|
|
162
176
|
|
|
163
177
|
elif isinstance(item, CodeItem):
|
|
164
|
-
text =
|
|
178
|
+
text = (
|
|
179
|
+
f"<code>{text}</code>"
|
|
180
|
+
if is_inline_scope
|
|
181
|
+
else f"<pre><code>{text}</code></pre>"
|
|
182
|
+
)
|
|
165
183
|
|
|
166
184
|
elif isinstance(item, ListItem):
|
|
167
185
|
# List items are handled by list serializer
|
|
168
186
|
text_parts: list[str] = []
|
|
169
|
-
if
|
|
170
|
-
|
|
171
|
-
text=
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
187
|
+
if text:
|
|
188
|
+
if has_inline_repr:
|
|
189
|
+
text = f"\n{text}\n"
|
|
190
|
+
else:
|
|
191
|
+
text = doc_serializer.post_process(
|
|
192
|
+
text=text,
|
|
193
|
+
formatting=item.formatting,
|
|
194
|
+
hyperlink=item.hyperlink,
|
|
195
|
+
)
|
|
196
|
+
post_processed = True
|
|
197
|
+
text_parts.append(text)
|
|
177
198
|
nested_parts = [
|
|
178
199
|
r.text
|
|
179
200
|
for r in doc_serializer.get_parts(
|
|
@@ -184,29 +205,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
184
205
|
)
|
|
185
206
|
]
|
|
186
207
|
text_parts.extend(nested_parts)
|
|
187
|
-
|
|
208
|
+
text = "\n".join(text_parts)
|
|
188
209
|
if nested_parts:
|
|
189
|
-
|
|
210
|
+
text = f"\n{text}\n"
|
|
190
211
|
text = (
|
|
191
212
|
get_html_tag_with_text_direction(
|
|
192
213
|
html_tag="li",
|
|
193
|
-
text=
|
|
214
|
+
text=text,
|
|
194
215
|
attrs=(
|
|
195
216
|
{"style": f"list-style-type: '{item.marker} ';"}
|
|
196
217
|
if params.show_original_list_item_marker and item.marker
|
|
197
218
|
else {}
|
|
198
219
|
),
|
|
199
220
|
)
|
|
200
|
-
if
|
|
221
|
+
if text
|
|
201
222
|
else ""
|
|
202
223
|
)
|
|
203
224
|
|
|
204
|
-
elif is_inline_scope:
|
|
205
|
-
text = self._prepare_content(item.text)
|
|
206
|
-
else:
|
|
225
|
+
elif not is_inline_scope:
|
|
207
226
|
# Regular text item
|
|
208
|
-
|
|
209
|
-
text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
|
|
227
|
+
text = get_html_tag_with_text_direction(html_tag="p", text=text)
|
|
210
228
|
|
|
211
229
|
# Apply formatting and hyperlinks
|
|
212
230
|
if not post_processed:
|
|
@@ -227,66 +245,44 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
227
245
|
|
|
228
246
|
return create_ser_result(text=text, span_source=res_parts)
|
|
229
247
|
|
|
230
|
-
def _prepare_content(
|
|
231
|
-
self, text: str, do_escape_html=True, do_replace_newline=True
|
|
232
|
-
) -> str:
|
|
233
|
-
"""Prepare text content for HTML inclusion."""
|
|
234
|
-
if do_escape_html:
|
|
235
|
-
text = html.escape(text, quote=False)
|
|
236
|
-
if do_replace_newline:
|
|
237
|
-
text = text.replace("\n", "<br>")
|
|
238
|
-
return text
|
|
239
|
-
|
|
240
|
-
def _process_code(
|
|
241
|
-
self,
|
|
242
|
-
item: CodeItem,
|
|
243
|
-
is_inline_scope: bool,
|
|
244
|
-
) -> str:
|
|
245
|
-
code_text = self._prepare_content(
|
|
246
|
-
item.text, do_escape_html=False, do_replace_newline=False
|
|
247
|
-
)
|
|
248
|
-
if is_inline_scope:
|
|
249
|
-
text = f"<code>{code_text}</code>"
|
|
250
|
-
else:
|
|
251
|
-
text = f"<pre><code>{code_text}</code></pre>"
|
|
252
|
-
|
|
253
|
-
return text
|
|
254
|
-
|
|
255
248
|
def _process_formula(
|
|
256
249
|
self,
|
|
257
|
-
|
|
250
|
+
*,
|
|
251
|
+
item: DocItem,
|
|
252
|
+
text: str,
|
|
253
|
+
orig: str,
|
|
258
254
|
doc: DoclingDocument,
|
|
259
255
|
image_mode: ImageRefMode,
|
|
260
256
|
formula_to_mathml: bool,
|
|
261
257
|
is_inline_scope: bool,
|
|
262
258
|
) -> str:
|
|
263
259
|
"""Process a formula item to HTML/MathML."""
|
|
264
|
-
math_formula = self._prepare_content(
|
|
265
|
-
item.text, do_escape_html=False, do_replace_newline=False
|
|
266
|
-
)
|
|
267
|
-
|
|
268
260
|
# If formula is empty, try to use an image fallback
|
|
269
|
-
if
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
261
|
+
if (
|
|
262
|
+
text == ""
|
|
263
|
+
and orig != ""
|
|
264
|
+
and len(item.prov) > 0
|
|
265
|
+
and image_mode == ImageRefMode.EMBEDDED
|
|
266
|
+
and (
|
|
267
|
+
img_fallback := self._get_formula_image_fallback(
|
|
268
|
+
item=item, orig=orig, doc=doc
|
|
269
|
+
)
|
|
270
|
+
)
|
|
271
|
+
):
|
|
272
|
+
return img_fallback
|
|
277
273
|
|
|
278
274
|
# Try to generate MathML
|
|
279
|
-
|
|
275
|
+
elif formula_to_mathml and text:
|
|
280
276
|
try:
|
|
281
277
|
# Set display mode based on context
|
|
282
278
|
display_mode = "inline" if is_inline_scope else "block"
|
|
283
279
|
mathml_element = latex2mathml.converter.convert_to_element(
|
|
284
|
-
|
|
280
|
+
text, display=display_mode
|
|
285
281
|
)
|
|
286
282
|
annotation = SubElement(
|
|
287
283
|
mathml_element, "annotation", dict(encoding="TeX")
|
|
288
284
|
)
|
|
289
|
-
annotation.text =
|
|
285
|
+
annotation.text = text
|
|
290
286
|
mathml = unescape(tostring(mathml_element, encoding="unicode"))
|
|
291
287
|
|
|
292
288
|
# Don't wrap in div for inline formulas
|
|
@@ -296,40 +292,40 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
296
292
|
return f"<div>{mathml}</div>"
|
|
297
293
|
|
|
298
294
|
except Exception:
|
|
299
|
-
img_fallback = self._get_formula_image_fallback(
|
|
295
|
+
img_fallback = self._get_formula_image_fallback(
|
|
296
|
+
item=item, orig=orig, doc=doc
|
|
297
|
+
)
|
|
300
298
|
if (
|
|
301
299
|
image_mode == ImageRefMode.EMBEDDED
|
|
302
300
|
and len(item.prov) > 0
|
|
303
301
|
and img_fallback
|
|
304
302
|
):
|
|
305
303
|
return img_fallback
|
|
306
|
-
elif
|
|
307
|
-
return f"<pre>{
|
|
304
|
+
elif text:
|
|
305
|
+
return f"<pre>{text}</pre>"
|
|
308
306
|
else:
|
|
309
307
|
return "<pre>Formula not decoded</pre>"
|
|
310
308
|
|
|
311
309
|
_logger.warning("Could not parse formula with MathML")
|
|
312
310
|
|
|
313
311
|
# Fallback options if we got here
|
|
314
|
-
if
|
|
315
|
-
return f"<code>{
|
|
316
|
-
elif
|
|
317
|
-
f"<pre>{
|
|
312
|
+
if text and is_inline_scope:
|
|
313
|
+
return f"<code>{text}</code>"
|
|
314
|
+
elif text and (not is_inline_scope):
|
|
315
|
+
f"<pre>{text}</pre>"
|
|
318
316
|
elif is_inline_scope:
|
|
319
317
|
return '<span class="formula-not-decoded">Formula not decoded</span>'
|
|
320
318
|
|
|
321
319
|
return '<div class="formula-not-decoded">Formula not decoded</div>'
|
|
322
320
|
|
|
323
321
|
def _get_formula_image_fallback(
|
|
324
|
-
self, item:
|
|
322
|
+
self, *, item: DocItem, orig: str, doc: DoclingDocument
|
|
325
323
|
) -> Optional[str]:
|
|
326
324
|
"""Try to get an image fallback for a formula."""
|
|
327
325
|
item_image = item.get_image(doc=doc)
|
|
328
326
|
if item_image is not None:
|
|
329
327
|
img_ref = ImageRef.from_pil(item_image, dpi=72)
|
|
330
|
-
return
|
|
331
|
-
"<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
|
|
332
|
-
)
|
|
328
|
+
return "<figure>" f'<img src="{img_ref.uri}" alt="{orig}" />' "</figure>"
|
|
333
329
|
return None
|
|
334
330
|
|
|
335
331
|
|
|
@@ -792,21 +788,30 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
792
788
|
"""HTML-specific fallback serializer."""
|
|
793
789
|
|
|
794
790
|
@override
|
|
795
|
-
def serialize(
|
|
791
|
+
def serialize(
|
|
792
|
+
self,
|
|
793
|
+
*,
|
|
794
|
+
item: NodeItem,
|
|
795
|
+
doc_serializer: "BaseDocSerializer",
|
|
796
|
+
doc: DoclingDocument,
|
|
797
|
+
**kwargs: Any,
|
|
798
|
+
) -> SerializationResult:
|
|
796
799
|
"""Fallback serializer for items not handled by other serializers."""
|
|
797
|
-
if isinstance(item,
|
|
800
|
+
if isinstance(item, GroupItem):
|
|
801
|
+
parts = doc_serializer.get_parts(item=item, **kwargs)
|
|
802
|
+
text_res = "\n".join([p.text for p in parts if p.text])
|
|
803
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
804
|
+
else:
|
|
798
805
|
return create_ser_result(
|
|
799
806
|
text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
|
|
800
|
-
span_source=item,
|
|
807
|
+
span_source=item if isinstance(item, DocItem) else [],
|
|
801
808
|
)
|
|
802
|
-
else:
|
|
803
|
-
# For group items, we don't generate any markup
|
|
804
|
-
return create_ser_result()
|
|
805
809
|
|
|
806
810
|
|
|
807
811
|
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
808
812
|
"""HTML-specific annotation serializer."""
|
|
809
813
|
|
|
814
|
+
@override
|
|
810
815
|
def serialize(
|
|
811
816
|
self,
|
|
812
817
|
*,
|
|
@@ -45,6 +45,7 @@ from docling_core.types.doc.document import (
|
|
|
45
45
|
Formatting,
|
|
46
46
|
FormItem,
|
|
47
47
|
FormulaItem,
|
|
48
|
+
GroupItem,
|
|
48
49
|
ImageRef,
|
|
49
50
|
InlineGroup,
|
|
50
51
|
KeyValueItem,
|
|
@@ -124,26 +125,24 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
124
125
|
my_visited = visited if visited is not None else set()
|
|
125
126
|
params = MarkdownParams(**kwargs)
|
|
126
127
|
res_parts: list[SerializationResult] = []
|
|
127
|
-
text = item.text
|
|
128
128
|
escape_html = True
|
|
129
129
|
escape_underscores = True
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
130
|
+
|
|
131
|
+
has_inline_repr = (
|
|
132
|
+
item.text == ""
|
|
133
|
+
and len(item.children) == 1
|
|
134
|
+
and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
|
|
135
|
+
)
|
|
136
|
+
if has_inline_repr:
|
|
137
|
+
text = doc_serializer.serialize(item=child_group, visited=my_visited).text
|
|
133
138
|
processing_pending = False
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
# case of inline within heading / list item
|
|
142
|
-
ser_res = doc_serializer.serialize(item=child_group)
|
|
143
|
-
text = ser_res.text
|
|
144
|
-
for span in ser_res.spans:
|
|
145
|
-
my_visited.add(span.item.self_ref)
|
|
146
|
-
else:
|
|
139
|
+
else:
|
|
140
|
+
text = item.text
|
|
141
|
+
processing_pending = True
|
|
142
|
+
|
|
143
|
+
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
|
|
144
|
+
if not has_inline_repr:
|
|
145
|
+
# case where processing/formatting should be applied first (in inner scope)
|
|
147
146
|
text = doc_serializer.post_process(
|
|
148
147
|
text=text,
|
|
149
148
|
escape_html=escape_html,
|
|
@@ -151,6 +150,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
151
150
|
formatting=item.formatting,
|
|
152
151
|
hyperlink=item.hyperlink,
|
|
153
152
|
)
|
|
153
|
+
processing_pending = False
|
|
154
154
|
|
|
155
155
|
if isinstance(item, ListItem):
|
|
156
156
|
pieces: list[str] = []
|
|
@@ -600,13 +600,15 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
|
|
|
600
600
|
**kwargs: Any,
|
|
601
601
|
) -> SerializationResult:
|
|
602
602
|
"""Serializes the passed item."""
|
|
603
|
-
if isinstance(item,
|
|
603
|
+
if isinstance(item, GroupItem):
|
|
604
|
+
parts = doc_serializer.get_parts(item=item, **kwargs)
|
|
605
|
+
text_res = "\n\n".join([p.text for p in parts if p.text])
|
|
606
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
607
|
+
else:
|
|
604
608
|
return create_ser_result(
|
|
605
609
|
text="<!-- missing-text -->",
|
|
606
|
-
span_source=item,
|
|
610
|
+
span_source=item if isinstance(item, DocItem) else [],
|
|
607
611
|
)
|
|
608
|
-
else:
|
|
609
|
-
return create_ser_result()
|
|
610
612
|
|
|
611
613
|
|
|
612
614
|
class MarkdownDocSerializer(DocSerializer):
|
|
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
|
|
|
60
60
|
|
|
61
61
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
62
62
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
63
|
-
CURRENT_VERSION: Final = "1.
|
|
63
|
+
CURRENT_VERSION: Final = "1.7.0"
|
|
64
64
|
|
|
65
65
|
DEFAULT_EXPORT_LABELS = {
|
|
66
66
|
DocItemLabel.TITLE,
|
|
@@ -310,6 +310,7 @@ class TableCell(BaseModel):
|
|
|
310
310
|
column_header: bool = False
|
|
311
311
|
row_header: bool = False
|
|
312
312
|
row_section: bool = False
|
|
313
|
+
fillable: bool = False
|
|
313
314
|
|
|
314
315
|
@model_validator(mode="before")
|
|
315
316
|
@classmethod
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.48.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -19,7 +19,7 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
|
|
|
19
19
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
20
20
|
docling_core/transforms/chunker/__init__.py,sha256=Qg5RhC-2QqdXKEfjzNGJaVi0NqBCL3xAhKWJGOlrE3M,375
|
|
21
21
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
22
|
-
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=
|
|
22
|
+
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=qc-gnuxji-2lrlZCRr34VubBciBTE4ClZ3QplgNpUx8,8246
|
|
23
23
|
docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
|
|
24
24
|
docling_core/transforms/chunker/page_chunker.py,sha256=gLUlqA_klK-rkuPVYuJKi3ZuTIGdd2HD7ces72AiZ2U,2018
|
|
25
25
|
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
@@ -28,11 +28,11 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZ
|
|
|
28
28
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
29
29
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
30
30
|
docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
|
|
31
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
32
|
-
docling_core/transforms/serializer/doctags.py,sha256=
|
|
33
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
31
|
+
docling_core/transforms/serializer/common.py,sha256=vfJhu0b4vAcIres85PX774RQSTKu9RueBOWMO95FQyc,19186
|
|
32
|
+
docling_core/transforms/serializer/doctags.py,sha256=9_aV_ffTOTtQKZQTKz_I3kRTQ_GXHCePKwXnR-rnggA,20644
|
|
33
|
+
docling_core/transforms/serializer/html.py,sha256=h0yiDgTNIeOS-rJaMRfinUFgrZygd3MjheM7pjLw5F0,38380
|
|
34
34
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
35
|
-
docling_core/transforms/serializer/markdown.py,sha256=
|
|
35
|
+
docling_core/transforms/serializer/markdown.py,sha256=9Sy7xWSegX0zdQb9vPzEUFucyGQUA4TcQxMfE70SJsk,24354
|
|
36
36
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
37
37
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
38
38
|
docling_core/transforms/visualizer/key_value_visualizer.py,sha256=fp7nFLy4flOSiavdRgg5y1Mu7WVLIDGh1zEHsq8kgVM,8979
|
|
@@ -43,7 +43,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
43
43
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
44
44
|
docling_core/types/doc/__init__.py,sha256=Vsl3oJV3_BLpS7rIwvahhcWOwmEBvj7ZbQzQCCl-IQk,1678
|
|
45
45
|
docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
|
|
46
|
-
docling_core/types/doc/document.py,sha256=
|
|
46
|
+
docling_core/types/doc/document.py,sha256=sZsLV6GfFF8TzTgD6C47a9YrurLZFhwqt8I9PZmYkJY,202734
|
|
47
47
|
docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
|
|
48
48
|
docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
|
|
49
49
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -76,9 +76,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
76
76
|
docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
|
|
77
77
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
78
78
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
83
|
-
docling_core-2.
|
|
84
|
-
docling_core-2.
|
|
79
|
+
docling_core-2.48.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
80
|
+
docling_core-2.48.0.dist-info/METADATA,sha256=WybgSJP5TG0mMu5sA2bN0pVKCoZxKCf4KR70MGK3904,6453
|
|
81
|
+
docling_core-2.48.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
82
|
+
docling_core-2.48.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
83
|
+
docling_core-2.48.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
84
|
+
docling_core-2.48.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|