docling-core 2.47.0__tar.gz → 2.48.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.47.0 → docling_core-2.48.1}/PKG-INFO +1 -1
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/hierarchical_chunker.py +1 -1
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/common.py +1 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/doctags.py +25 -9
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/html.py +89 -84
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/markdown.py +24 -22
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/document.py +2 -1
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.47.0 → docling_core-2.48.1}/pyproject.toml +1 -1
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_docling_doc.py +21 -2
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_serialization.py +128 -66
- {docling_core-2.47.0 → docling_core-2.48.1}/LICENSE +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/README.md +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/py.typed +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/package.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/validators.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/setup.cfg +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_collection.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_base.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_schema.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doctags_load.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_nlp_qa.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_page.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_page_chunker.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_rec_schema.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_search_meta.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_utils.py +0 -0
- {docling_core-2.47.0 → docling_core-2.48.1}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.48.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
@@ -145,7 +145,7 @@ class TripletTableSerializer(BaseTableSerializer):
|
|
|
145
145
|
parts.append(cap_res)
|
|
146
146
|
|
|
147
147
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
148
|
-
table_df = item.export_to_dataframe()
|
|
148
|
+
table_df = item.export_to_dataframe(doc)
|
|
149
149
|
if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
|
|
150
150
|
|
|
151
151
|
# copy header as first row and shift all rows by one
|
|
@@ -32,6 +32,7 @@ from docling_core.types.doc.document import (
|
|
|
32
32
|
DoclingDocument,
|
|
33
33
|
FloatingItem,
|
|
34
34
|
FormItem,
|
|
35
|
+
GroupItem,
|
|
35
36
|
InlineGroup,
|
|
36
37
|
KeyValueItem,
|
|
37
38
|
ListGroup,
|
|
@@ -42,6 +43,7 @@ from docling_core.types.doc.document import (
|
|
|
42
43
|
PictureMoleculeData,
|
|
43
44
|
PictureTabularChartData,
|
|
44
45
|
ProvenanceItem,
|
|
46
|
+
SectionHeaderItem,
|
|
45
47
|
TableItem,
|
|
46
48
|
TextItem,
|
|
47
49
|
)
|
|
@@ -94,11 +96,11 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
94
96
|
item: TextItem,
|
|
95
97
|
doc_serializer: BaseDocSerializer,
|
|
96
98
|
doc: DoclingDocument,
|
|
99
|
+
visited: Optional[set[str]] = None,
|
|
97
100
|
**kwargs: Any,
|
|
98
101
|
) -> SerializationResult:
|
|
99
102
|
"""Serializes the passed item."""
|
|
100
|
-
|
|
101
|
-
|
|
103
|
+
my_visited = visited if visited is not None else set()
|
|
102
104
|
params = DocTagsParams(**kwargs)
|
|
103
105
|
wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
|
|
104
106
|
label=item.label,
|
|
@@ -116,12 +118,21 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
116
118
|
parts.append(location)
|
|
117
119
|
|
|
118
120
|
if params.add_content:
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
121
|
+
if (
|
|
122
|
+
item.text == ""
|
|
123
|
+
and len(item.children) == 1
|
|
124
|
+
and isinstance(
|
|
125
|
+
(child_group := item.children[0].resolve(doc)), InlineGroup
|
|
126
|
+
)
|
|
127
|
+
):
|
|
128
|
+
ser_res = doc_serializer.serialize(item=child_group, visited=my_visited)
|
|
129
|
+
text_part = ser_res.text
|
|
130
|
+
else:
|
|
131
|
+
text_part = doc_serializer.post_process(
|
|
132
|
+
text=item.text,
|
|
133
|
+
formatting=item.formatting,
|
|
134
|
+
hyperlink=item.hyperlink,
|
|
135
|
+
)
|
|
125
136
|
|
|
126
137
|
if isinstance(item, CodeItem):
|
|
127
138
|
language_token = DocumentToken.get_code_language_token(
|
|
@@ -506,7 +517,12 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
506
517
|
**kwargs: Any,
|
|
507
518
|
) -> SerializationResult:
|
|
508
519
|
"""Serializes the passed item."""
|
|
509
|
-
|
|
520
|
+
if isinstance(item, GroupItem):
|
|
521
|
+
parts = doc_serializer.get_parts(item=item, **kwargs)
|
|
522
|
+
text_res = "\n".join([p.text for p in parts if p.text])
|
|
523
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
524
|
+
else:
|
|
525
|
+
return create_ser_result()
|
|
510
526
|
|
|
511
527
|
|
|
512
528
|
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
|
|
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
|
|
|
55
55
|
FormItem,
|
|
56
56
|
FormulaItem,
|
|
57
57
|
GraphData,
|
|
58
|
+
GroupItem,
|
|
58
59
|
ImageRef,
|
|
59
60
|
InlineGroup,
|
|
60
61
|
KeyValueItem,
|
|
@@ -139,21 +140,34 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
139
140
|
res_parts: list[SerializationResult] = []
|
|
140
141
|
post_processed = False
|
|
141
142
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
143
|
+
has_inline_repr = (
|
|
144
|
+
item.text == ""
|
|
145
|
+
and len(item.children) == 1
|
|
146
|
+
and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
|
|
147
|
+
)
|
|
148
|
+
if has_inline_repr:
|
|
149
|
+
text = doc_serializer.serialize(item=child_group, visited=my_visited).text
|
|
150
|
+
post_processed = True
|
|
151
|
+
else:
|
|
152
|
+
text = item.text
|
|
153
|
+
if not isinstance(item, (CodeItem, FormulaItem)):
|
|
154
|
+
text = html.escape(text, quote=False)
|
|
155
|
+
text = text.replace("\n", "<br>")
|
|
146
156
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
157
|
+
# Prepare the HTML based on item type
|
|
158
|
+
if isinstance(item, (TitleItem, SectionHeaderItem)):
|
|
159
|
+
section_level = (
|
|
160
|
+
min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
|
|
161
|
+
)
|
|
150
162
|
text = get_html_tag_with_text_direction(
|
|
151
|
-
html_tag=f"h{section_level}", text=
|
|
163
|
+
html_tag=f"h{section_level}", text=text
|
|
152
164
|
)
|
|
153
165
|
|
|
154
166
|
elif isinstance(item, FormulaItem):
|
|
155
167
|
text = self._process_formula(
|
|
156
168
|
item=item,
|
|
169
|
+
text=text,
|
|
170
|
+
orig=item.orig,
|
|
157
171
|
doc=doc,
|
|
158
172
|
image_mode=params.image_mode,
|
|
159
173
|
formula_to_mathml=params.formula_to_mathml,
|
|
@@ -161,19 +175,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
161
175
|
)
|
|
162
176
|
|
|
163
177
|
elif isinstance(item, CodeItem):
|
|
164
|
-
text =
|
|
178
|
+
text = (
|
|
179
|
+
f"<code>{text}</code>"
|
|
180
|
+
if is_inline_scope
|
|
181
|
+
else f"<pre><code>{text}</code></pre>"
|
|
182
|
+
)
|
|
165
183
|
|
|
166
184
|
elif isinstance(item, ListItem):
|
|
167
185
|
# List items are handled by list serializer
|
|
168
186
|
text_parts: list[str] = []
|
|
169
|
-
if
|
|
170
|
-
|
|
171
|
-
text=
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
187
|
+
if text:
|
|
188
|
+
if has_inline_repr:
|
|
189
|
+
text = f"\n{text}\n"
|
|
190
|
+
else:
|
|
191
|
+
text = doc_serializer.post_process(
|
|
192
|
+
text=text,
|
|
193
|
+
formatting=item.formatting,
|
|
194
|
+
hyperlink=item.hyperlink,
|
|
195
|
+
)
|
|
196
|
+
post_processed = True
|
|
197
|
+
text_parts.append(text)
|
|
177
198
|
nested_parts = [
|
|
178
199
|
r.text
|
|
179
200
|
for r in doc_serializer.get_parts(
|
|
@@ -184,29 +205,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
184
205
|
)
|
|
185
206
|
]
|
|
186
207
|
text_parts.extend(nested_parts)
|
|
187
|
-
|
|
208
|
+
text = "\n".join(text_parts)
|
|
188
209
|
if nested_parts:
|
|
189
|
-
|
|
210
|
+
text = f"\n{text}\n"
|
|
190
211
|
text = (
|
|
191
212
|
get_html_tag_with_text_direction(
|
|
192
213
|
html_tag="li",
|
|
193
|
-
text=
|
|
214
|
+
text=text,
|
|
194
215
|
attrs=(
|
|
195
216
|
{"style": f"list-style-type: '{item.marker} ';"}
|
|
196
217
|
if params.show_original_list_item_marker and item.marker
|
|
197
218
|
else {}
|
|
198
219
|
),
|
|
199
220
|
)
|
|
200
|
-
if
|
|
221
|
+
if text
|
|
201
222
|
else ""
|
|
202
223
|
)
|
|
203
224
|
|
|
204
|
-
elif is_inline_scope:
|
|
205
|
-
text = self._prepare_content(item.text)
|
|
206
|
-
else:
|
|
225
|
+
elif not is_inline_scope:
|
|
207
226
|
# Regular text item
|
|
208
|
-
|
|
209
|
-
text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
|
|
227
|
+
text = get_html_tag_with_text_direction(html_tag="p", text=text)
|
|
210
228
|
|
|
211
229
|
# Apply formatting and hyperlinks
|
|
212
230
|
if not post_processed:
|
|
@@ -227,66 +245,44 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
227
245
|
|
|
228
246
|
return create_ser_result(text=text, span_source=res_parts)
|
|
229
247
|
|
|
230
|
-
def _prepare_content(
|
|
231
|
-
self, text: str, do_escape_html=True, do_replace_newline=True
|
|
232
|
-
) -> str:
|
|
233
|
-
"""Prepare text content for HTML inclusion."""
|
|
234
|
-
if do_escape_html:
|
|
235
|
-
text = html.escape(text, quote=False)
|
|
236
|
-
if do_replace_newline:
|
|
237
|
-
text = text.replace("\n", "<br>")
|
|
238
|
-
return text
|
|
239
|
-
|
|
240
|
-
def _process_code(
|
|
241
|
-
self,
|
|
242
|
-
item: CodeItem,
|
|
243
|
-
is_inline_scope: bool,
|
|
244
|
-
) -> str:
|
|
245
|
-
code_text = self._prepare_content(
|
|
246
|
-
item.text, do_escape_html=False, do_replace_newline=False
|
|
247
|
-
)
|
|
248
|
-
if is_inline_scope:
|
|
249
|
-
text = f"<code>{code_text}</code>"
|
|
250
|
-
else:
|
|
251
|
-
text = f"<pre><code>{code_text}</code></pre>"
|
|
252
|
-
|
|
253
|
-
return text
|
|
254
|
-
|
|
255
248
|
def _process_formula(
|
|
256
249
|
self,
|
|
257
|
-
|
|
250
|
+
*,
|
|
251
|
+
item: DocItem,
|
|
252
|
+
text: str,
|
|
253
|
+
orig: str,
|
|
258
254
|
doc: DoclingDocument,
|
|
259
255
|
image_mode: ImageRefMode,
|
|
260
256
|
formula_to_mathml: bool,
|
|
261
257
|
is_inline_scope: bool,
|
|
262
258
|
) -> str:
|
|
263
259
|
"""Process a formula item to HTML/MathML."""
|
|
264
|
-
math_formula = self._prepare_content(
|
|
265
|
-
item.text, do_escape_html=False, do_replace_newline=False
|
|
266
|
-
)
|
|
267
|
-
|
|
268
260
|
# If formula is empty, try to use an image fallback
|
|
269
|
-
if
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
261
|
+
if (
|
|
262
|
+
text == ""
|
|
263
|
+
and orig != ""
|
|
264
|
+
and len(item.prov) > 0
|
|
265
|
+
and image_mode == ImageRefMode.EMBEDDED
|
|
266
|
+
and (
|
|
267
|
+
img_fallback := self._get_formula_image_fallback(
|
|
268
|
+
item=item, orig=orig, doc=doc
|
|
269
|
+
)
|
|
270
|
+
)
|
|
271
|
+
):
|
|
272
|
+
return img_fallback
|
|
277
273
|
|
|
278
274
|
# Try to generate MathML
|
|
279
|
-
|
|
275
|
+
elif formula_to_mathml and text:
|
|
280
276
|
try:
|
|
281
277
|
# Set display mode based on context
|
|
282
278
|
display_mode = "inline" if is_inline_scope else "block"
|
|
283
279
|
mathml_element = latex2mathml.converter.convert_to_element(
|
|
284
|
-
|
|
280
|
+
text, display=display_mode
|
|
285
281
|
)
|
|
286
282
|
annotation = SubElement(
|
|
287
283
|
mathml_element, "annotation", dict(encoding="TeX")
|
|
288
284
|
)
|
|
289
|
-
annotation.text =
|
|
285
|
+
annotation.text = text
|
|
290
286
|
mathml = unescape(tostring(mathml_element, encoding="unicode"))
|
|
291
287
|
|
|
292
288
|
# Don't wrap in div for inline formulas
|
|
@@ -296,40 +292,40 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
296
292
|
return f"<div>{mathml}</div>"
|
|
297
293
|
|
|
298
294
|
except Exception:
|
|
299
|
-
img_fallback = self._get_formula_image_fallback(
|
|
295
|
+
img_fallback = self._get_formula_image_fallback(
|
|
296
|
+
item=item, orig=orig, doc=doc
|
|
297
|
+
)
|
|
300
298
|
if (
|
|
301
299
|
image_mode == ImageRefMode.EMBEDDED
|
|
302
300
|
and len(item.prov) > 0
|
|
303
301
|
and img_fallback
|
|
304
302
|
):
|
|
305
303
|
return img_fallback
|
|
306
|
-
elif
|
|
307
|
-
return f"<pre>{
|
|
304
|
+
elif text:
|
|
305
|
+
return f"<pre>{text}</pre>"
|
|
308
306
|
else:
|
|
309
307
|
return "<pre>Formula not decoded</pre>"
|
|
310
308
|
|
|
311
309
|
_logger.warning("Could not parse formula with MathML")
|
|
312
310
|
|
|
313
311
|
# Fallback options if we got here
|
|
314
|
-
if
|
|
315
|
-
return f"<code>{
|
|
316
|
-
elif
|
|
317
|
-
f"<pre>{
|
|
312
|
+
if text and is_inline_scope:
|
|
313
|
+
return f"<code>{text}</code>"
|
|
314
|
+
elif text and (not is_inline_scope):
|
|
315
|
+
f"<pre>{text}</pre>"
|
|
318
316
|
elif is_inline_scope:
|
|
319
317
|
return '<span class="formula-not-decoded">Formula not decoded</span>'
|
|
320
318
|
|
|
321
319
|
return '<div class="formula-not-decoded">Formula not decoded</div>'
|
|
322
320
|
|
|
323
321
|
def _get_formula_image_fallback(
|
|
324
|
-
self, item:
|
|
322
|
+
self, *, item: DocItem, orig: str, doc: DoclingDocument
|
|
325
323
|
) -> Optional[str]:
|
|
326
324
|
"""Try to get an image fallback for a formula."""
|
|
327
325
|
item_image = item.get_image(doc=doc)
|
|
328
326
|
if item_image is not None:
|
|
329
327
|
img_ref = ImageRef.from_pil(item_image, dpi=72)
|
|
330
|
-
return
|
|
331
|
-
"<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
|
|
332
|
-
)
|
|
328
|
+
return "<figure>" f'<img src="{img_ref.uri}" alt="{orig}" />' "</figure>"
|
|
333
329
|
return None
|
|
334
330
|
|
|
335
331
|
|
|
@@ -792,21 +788,30 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
|
|
|
792
788
|
"""HTML-specific fallback serializer."""
|
|
793
789
|
|
|
794
790
|
@override
|
|
795
|
-
def serialize(
|
|
791
|
+
def serialize(
|
|
792
|
+
self,
|
|
793
|
+
*,
|
|
794
|
+
item: NodeItem,
|
|
795
|
+
doc_serializer: "BaseDocSerializer",
|
|
796
|
+
doc: DoclingDocument,
|
|
797
|
+
**kwargs: Any,
|
|
798
|
+
) -> SerializationResult:
|
|
796
799
|
"""Fallback serializer for items not handled by other serializers."""
|
|
797
|
-
if isinstance(item,
|
|
800
|
+
if isinstance(item, GroupItem):
|
|
801
|
+
parts = doc_serializer.get_parts(item=item, **kwargs)
|
|
802
|
+
text_res = "\n".join([p.text for p in parts if p.text])
|
|
803
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
804
|
+
else:
|
|
798
805
|
return create_ser_result(
|
|
799
806
|
text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
|
|
800
|
-
span_source=item,
|
|
807
|
+
span_source=item if isinstance(item, DocItem) else [],
|
|
801
808
|
)
|
|
802
|
-
else:
|
|
803
|
-
# For group items, we don't generate any markup
|
|
804
|
-
return create_ser_result()
|
|
805
809
|
|
|
806
810
|
|
|
807
811
|
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
|
|
808
812
|
"""HTML-specific annotation serializer."""
|
|
809
813
|
|
|
814
|
+
@override
|
|
810
815
|
def serialize(
|
|
811
816
|
self,
|
|
812
817
|
*,
|
|
@@ -45,6 +45,7 @@ from docling_core.types.doc.document import (
|
|
|
45
45
|
Formatting,
|
|
46
46
|
FormItem,
|
|
47
47
|
FormulaItem,
|
|
48
|
+
GroupItem,
|
|
48
49
|
ImageRef,
|
|
49
50
|
InlineGroup,
|
|
50
51
|
KeyValueItem,
|
|
@@ -124,26 +125,24 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
124
125
|
my_visited = visited if visited is not None else set()
|
|
125
126
|
params = MarkdownParams(**kwargs)
|
|
126
127
|
res_parts: list[SerializationResult] = []
|
|
127
|
-
text = item.text
|
|
128
128
|
escape_html = True
|
|
129
129
|
escape_underscores = True
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
130
|
+
|
|
131
|
+
has_inline_repr = (
|
|
132
|
+
item.text == ""
|
|
133
|
+
and len(item.children) == 1
|
|
134
|
+
and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
|
|
135
|
+
)
|
|
136
|
+
if has_inline_repr:
|
|
137
|
+
text = doc_serializer.serialize(item=child_group, visited=my_visited).text
|
|
133
138
|
processing_pending = False
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
# case of inline within heading / list item
|
|
142
|
-
ser_res = doc_serializer.serialize(item=child_group)
|
|
143
|
-
text = ser_res.text
|
|
144
|
-
for span in ser_res.spans:
|
|
145
|
-
my_visited.add(span.item.self_ref)
|
|
146
|
-
else:
|
|
139
|
+
else:
|
|
140
|
+
text = item.text
|
|
141
|
+
processing_pending = True
|
|
142
|
+
|
|
143
|
+
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
|
|
144
|
+
if not has_inline_repr:
|
|
145
|
+
# case where processing/formatting should be applied first (in inner scope)
|
|
147
146
|
text = doc_serializer.post_process(
|
|
148
147
|
text=text,
|
|
149
148
|
escape_html=escape_html,
|
|
@@ -151,6 +150,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
151
150
|
formatting=item.formatting,
|
|
152
151
|
hyperlink=item.hyperlink,
|
|
153
152
|
)
|
|
153
|
+
processing_pending = False
|
|
154
154
|
|
|
155
155
|
if isinstance(item, ListItem):
|
|
156
156
|
pieces: list[str] = []
|
|
@@ -332,7 +332,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
332
332
|
]
|
|
333
333
|
for row in item.data.grid
|
|
334
334
|
]
|
|
335
|
-
if len(rows) >
|
|
335
|
+
if len(rows) > 0:
|
|
336
336
|
try:
|
|
337
337
|
table_text = tabulate(rows[1:], headers=rows[0], tablefmt="github")
|
|
338
338
|
except ValueError:
|
|
@@ -600,13 +600,15 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
|
|
|
600
600
|
**kwargs: Any,
|
|
601
601
|
) -> SerializationResult:
|
|
602
602
|
"""Serializes the passed item."""
|
|
603
|
-
if isinstance(item,
|
|
603
|
+
if isinstance(item, GroupItem):
|
|
604
|
+
parts = doc_serializer.get_parts(item=item, **kwargs)
|
|
605
|
+
text_res = "\n\n".join([p.text for p in parts if p.text])
|
|
606
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
607
|
+
else:
|
|
604
608
|
return create_ser_result(
|
|
605
609
|
text="<!-- missing-text -->",
|
|
606
|
-
span_source=item,
|
|
610
|
+
span_source=item if isinstance(item, DocItem) else [],
|
|
607
611
|
)
|
|
608
|
-
else:
|
|
609
|
-
return create_ser_result()
|
|
610
612
|
|
|
611
613
|
|
|
612
614
|
class MarkdownDocSerializer(DocSerializer):
|
|
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
|
|
|
60
60
|
|
|
61
61
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
62
62
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
63
|
-
CURRENT_VERSION: Final = "1.
|
|
63
|
+
CURRENT_VERSION: Final = "1.7.0"
|
|
64
64
|
|
|
65
65
|
DEFAULT_EXPORT_LABELS = {
|
|
66
66
|
DocItemLabel.TITLE,
|
|
@@ -310,6 +310,7 @@ class TableCell(BaseModel):
|
|
|
310
310
|
column_header: bool = False
|
|
311
311
|
row_header: bool = False
|
|
312
312
|
row_section: bool = False
|
|
313
|
+
fillable: bool = False
|
|
313
314
|
|
|
314
315
|
@model_validator(mode="before")
|
|
315
316
|
@classmethod
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.48.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.
|
|
3
|
+
version = "2.48.1" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -734,7 +734,7 @@ def _test_export_methods(
|
|
|
734
734
|
for table in doc.tables:
|
|
735
735
|
table.export_to_markdown()
|
|
736
736
|
table.export_to_html(doc)
|
|
737
|
-
table.export_to_dataframe()
|
|
737
|
+
table.export_to_dataframe(doc)
|
|
738
738
|
table.export_to_doctags(doc)
|
|
739
739
|
|
|
740
740
|
# Test Images export ...
|
|
@@ -2102,7 +2102,7 @@ def _construct_rich_table_doc():
|
|
|
2102
2102
|
|
|
2103
2103
|
table_item = doc.add_table(
|
|
2104
2104
|
data=TableData(
|
|
2105
|
-
num_rows=
|
|
2105
|
+
num_rows=5,
|
|
2106
2106
|
num_cols=2,
|
|
2107
2107
|
),
|
|
2108
2108
|
)
|
|
@@ -2121,6 +2121,17 @@ def _construct_rich_table_doc():
|
|
|
2121
2121
|
rich_item_3 = doc.add_table(
|
|
2122
2122
|
data=TableData(num_rows=2, num_cols=3), parent=table_item
|
|
2123
2123
|
)
|
|
2124
|
+
|
|
2125
|
+
rich_item_4 = doc.add_group(parent=table_item, label=GroupLabel.UNSPECIFIED)
|
|
2126
|
+
doc.add_text(
|
|
2127
|
+
parent=rich_item_4,
|
|
2128
|
+
text="Some text in a generic group.",
|
|
2129
|
+
label=DocItemLabel.TEXT,
|
|
2130
|
+
)
|
|
2131
|
+
doc.add_text(
|
|
2132
|
+
parent=rich_item_4, text="More text in the group.", label=DocItemLabel.TEXT
|
|
2133
|
+
)
|
|
2134
|
+
|
|
2124
2135
|
for i in range(rich_item_3.data.num_rows):
|
|
2125
2136
|
for j in range(rich_item_3.data.num_cols):
|
|
2126
2137
|
cell = TableCell(
|
|
@@ -2158,6 +2169,14 @@ def _construct_rich_table_doc():
|
|
|
2158
2169
|
end_col_offset_idx=j + 1,
|
|
2159
2170
|
ref=rich_item_3.get_ref(),
|
|
2160
2171
|
)
|
|
2172
|
+
elif i == 4 and j == 0:
|
|
2173
|
+
cell = RichTableCell(
|
|
2174
|
+
start_row_offset_idx=i,
|
|
2175
|
+
end_row_offset_idx=i + 1,
|
|
2176
|
+
start_col_offset_idx=j,
|
|
2177
|
+
end_col_offset_idx=j + 1,
|
|
2178
|
+
ref=rich_item_4.get_ref(),
|
|
2179
|
+
)
|
|
2161
2180
|
else:
|
|
2162
2181
|
cell = TableCell(
|
|
2163
2182
|
start_row_offset_idx=i,
|
|
@@ -25,7 +25,13 @@ from docling_core.transforms.serializer.markdown import (
|
|
|
25
25
|
)
|
|
26
26
|
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
|
27
27
|
from docling_core.types.doc.base import ImageRefMode
|
|
28
|
-
from docling_core.types.doc.document import
|
|
28
|
+
from docling_core.types.doc.document import (
|
|
29
|
+
DoclingDocument,
|
|
30
|
+
MiscAnnotation,
|
|
31
|
+
TableCell,
|
|
32
|
+
TableData,
|
|
33
|
+
TableItem,
|
|
34
|
+
)
|
|
29
35
|
from docling_core.types.doc.labels import DocItemLabel
|
|
30
36
|
|
|
31
37
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
@@ -85,6 +91,11 @@ def verify(exp_file: Path, actual: str):
|
|
|
85
91
|
assert expected == actual
|
|
86
92
|
|
|
87
93
|
|
|
94
|
+
# ===============================
|
|
95
|
+
# Markdown tests
|
|
96
|
+
# ===============================
|
|
97
|
+
|
|
98
|
+
|
|
88
99
|
def test_md_cross_page_list_page_break():
|
|
89
100
|
src = Path("./test/data/doc/activities.json")
|
|
90
101
|
doc = DoclingDocument.load_from_json(src)
|
|
@@ -99,7 +110,7 @@ def test_md_cross_page_list_page_break():
|
|
|
99
110
|
),
|
|
100
111
|
)
|
|
101
112
|
actual = ser.serialize().text
|
|
102
|
-
verify(exp_file=src.
|
|
113
|
+
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
|
|
103
114
|
|
|
104
115
|
|
|
105
116
|
def test_md_cross_page_list_page_break_none():
|
|
@@ -170,20 +181,6 @@ def test_md_cross_page_list_page_break_p2():
|
|
|
170
181
|
verify(exp_file=src.parent / f"{src.stem}_p2.gt.md", actual=actual)
|
|
171
182
|
|
|
172
183
|
|
|
173
|
-
def test_html_charts():
|
|
174
|
-
src = Path("./test/data/doc/barchart.json")
|
|
175
|
-
doc = DoclingDocument.load_from_json(src)
|
|
176
|
-
|
|
177
|
-
ser = HTMLDocSerializer(
|
|
178
|
-
doc=doc,
|
|
179
|
-
params=HTMLParams(
|
|
180
|
-
image_mode=ImageRefMode.PLACEHOLDER,
|
|
181
|
-
),
|
|
182
|
-
)
|
|
183
|
-
actual = ser.serialize().text
|
|
184
|
-
verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
|
|
185
|
-
|
|
186
|
-
|
|
187
184
|
def test_md_charts():
|
|
188
185
|
src = Path("./test/data/doc/barchart.json")
|
|
189
186
|
doc = DoclingDocument.load_from_json(src)
|
|
@@ -195,7 +192,7 @@ def test_md_charts():
|
|
|
195
192
|
),
|
|
196
193
|
)
|
|
197
194
|
actual = ser.serialize().text
|
|
198
|
-
verify(exp_file=src.
|
|
195
|
+
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
|
|
199
196
|
|
|
200
197
|
|
|
201
198
|
def test_md_inline_and_formatting():
|
|
@@ -209,51 +206,7 @@ def test_md_inline_and_formatting():
|
|
|
209
206
|
),
|
|
210
207
|
)
|
|
211
208
|
actual = ser.serialize().text
|
|
212
|
-
verify(exp_file=src.
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
def test_html_cross_page_list_page_break():
|
|
216
|
-
src = Path("./test/data/doc/activities.json")
|
|
217
|
-
doc = DoclingDocument.load_from_json(src)
|
|
218
|
-
|
|
219
|
-
ser = HTMLDocSerializer(
|
|
220
|
-
doc=doc,
|
|
221
|
-
params=HTMLParams(
|
|
222
|
-
image_mode=ImageRefMode.PLACEHOLDER,
|
|
223
|
-
),
|
|
224
|
-
)
|
|
225
|
-
actual = ser.serialize().text
|
|
226
|
-
verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
def test_html_cross_page_list_page_break_p1():
|
|
230
|
-
src = Path("./test/data/doc/activities.json")
|
|
231
|
-
doc = DoclingDocument.load_from_json(src)
|
|
232
|
-
|
|
233
|
-
ser = HTMLDocSerializer(
|
|
234
|
-
doc=doc,
|
|
235
|
-
params=HTMLParams(
|
|
236
|
-
image_mode=ImageRefMode.PLACEHOLDER,
|
|
237
|
-
pages={1},
|
|
238
|
-
),
|
|
239
|
-
)
|
|
240
|
-
actual = ser.serialize().text
|
|
241
|
-
verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
def test_html_cross_page_list_page_break_p2():
|
|
245
|
-
src = Path("./test/data/doc/activities.json")
|
|
246
|
-
doc = DoclingDocument.load_from_json(src)
|
|
247
|
-
|
|
248
|
-
ser = HTMLDocSerializer(
|
|
249
|
-
doc=doc,
|
|
250
|
-
params=HTMLParams(
|
|
251
|
-
image_mode=ImageRefMode.PLACEHOLDER,
|
|
252
|
-
pages={2},
|
|
253
|
-
),
|
|
254
|
-
)
|
|
255
|
-
actual = ser.serialize().text
|
|
256
|
-
verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
|
|
209
|
+
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
|
|
257
210
|
|
|
258
211
|
|
|
259
212
|
def test_md_pb_placeholder_and_page_filter():
|
|
@@ -269,7 +222,7 @@ def test_md_pb_placeholder_and_page_filter():
|
|
|
269
222
|
),
|
|
270
223
|
)
|
|
271
224
|
actual = ser.serialize().text
|
|
272
|
-
verify(exp_file=src.
|
|
225
|
+
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
|
|
273
226
|
|
|
274
227
|
|
|
275
228
|
def test_md_list_item_markers():
|
|
@@ -358,7 +311,7 @@ def test_md_nested_lists():
|
|
|
358
311
|
|
|
359
312
|
ser = MarkdownDocSerializer(doc=doc)
|
|
360
313
|
actual = ser.serialize().text
|
|
361
|
-
verify(exp_file=src.
|
|
314
|
+
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
|
|
362
315
|
|
|
363
316
|
|
|
364
317
|
def test_md_rich_table():
|
|
@@ -370,6 +323,92 @@ def test_md_rich_table():
|
|
|
370
323
|
verify(exp_file=exp_file, actual=actual)
|
|
371
324
|
|
|
372
325
|
|
|
326
|
+
def test_md_single_row_table():
|
|
327
|
+
exp_file = Path("./test/data/doc/single_row_table.gt.md")
|
|
328
|
+
words = ["foo", "bar"]
|
|
329
|
+
doc = DoclingDocument(name="")
|
|
330
|
+
row_idx = 0
|
|
331
|
+
table = doc.add_table(data=TableData(num_rows=1, num_cols=len(words)))
|
|
332
|
+
for col_idx, word in enumerate(words):
|
|
333
|
+
doc.add_table_cell(
|
|
334
|
+
table_item=table,
|
|
335
|
+
cell=TableCell(
|
|
336
|
+
start_row_offset_idx=row_idx,
|
|
337
|
+
end_row_offset_idx=row_idx + 1,
|
|
338
|
+
start_col_offset_idx=col_idx,
|
|
339
|
+
end_col_offset_idx=col_idx + 1,
|
|
340
|
+
text=word,
|
|
341
|
+
),
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
ser = MarkdownDocSerializer(doc=doc)
|
|
345
|
+
actual = ser.serialize().text
|
|
346
|
+
verify(exp_file=exp_file, actual=actual)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
# ===============================
|
|
350
|
+
# HTML tests
|
|
351
|
+
# ===============================
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def test_html_charts():
|
|
355
|
+
src = Path("./test/data/doc/barchart.json")
|
|
356
|
+
doc = DoclingDocument.load_from_json(src)
|
|
357
|
+
|
|
358
|
+
ser = HTMLDocSerializer(
|
|
359
|
+
doc=doc,
|
|
360
|
+
params=HTMLParams(
|
|
361
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
|
362
|
+
),
|
|
363
|
+
)
|
|
364
|
+
actual = ser.serialize().text
|
|
365
|
+
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def test_html_cross_page_list_page_break():
|
|
369
|
+
src = Path("./test/data/doc/activities.json")
|
|
370
|
+
doc = DoclingDocument.load_from_json(src)
|
|
371
|
+
|
|
372
|
+
ser = HTMLDocSerializer(
|
|
373
|
+
doc=doc,
|
|
374
|
+
params=HTMLParams(
|
|
375
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
|
376
|
+
),
|
|
377
|
+
)
|
|
378
|
+
actual = ser.serialize().text
|
|
379
|
+
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def test_html_cross_page_list_page_break_p1():
|
|
383
|
+
src = Path("./test/data/doc/activities.json")
|
|
384
|
+
doc = DoclingDocument.load_from_json(src)
|
|
385
|
+
|
|
386
|
+
ser = HTMLDocSerializer(
|
|
387
|
+
doc=doc,
|
|
388
|
+
params=HTMLParams(
|
|
389
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
|
390
|
+
pages={1},
|
|
391
|
+
),
|
|
392
|
+
)
|
|
393
|
+
actual = ser.serialize().text
|
|
394
|
+
verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def test_html_cross_page_list_page_break_p2():
|
|
398
|
+
src = Path("./test/data/doc/activities.json")
|
|
399
|
+
doc = DoclingDocument.load_from_json(src)
|
|
400
|
+
|
|
401
|
+
ser = HTMLDocSerializer(
|
|
402
|
+
doc=doc,
|
|
403
|
+
params=HTMLParams(
|
|
404
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
|
405
|
+
pages={2},
|
|
406
|
+
),
|
|
407
|
+
)
|
|
408
|
+
actual = ser.serialize().text
|
|
409
|
+
verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
|
|
410
|
+
|
|
411
|
+
|
|
373
412
|
def test_html_split_page():
|
|
374
413
|
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
|
375
414
|
doc = DoclingDocument.load_from_json(src)
|
|
@@ -506,7 +545,7 @@ def test_html_nested_lists():
|
|
|
506
545
|
|
|
507
546
|
ser = HTMLDocSerializer(doc=doc)
|
|
508
547
|
actual = ser.serialize().text
|
|
509
|
-
verify(exp_file=src.
|
|
548
|
+
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
|
|
510
549
|
|
|
511
550
|
|
|
512
551
|
def test_html_rich_table():
|
|
@@ -518,13 +557,27 @@ def test_html_rich_table():
|
|
|
518
557
|
verify(exp_file=exp_file, actual=actual)
|
|
519
558
|
|
|
520
559
|
|
|
560
|
+
def test_html_inline_and_formatting():
|
|
561
|
+
src = Path("./test/data/doc/inline_and_formatting.yaml")
|
|
562
|
+
doc = DoclingDocument.load_from_yaml(src)
|
|
563
|
+
|
|
564
|
+
ser = HTMLDocSerializer(doc=doc)
|
|
565
|
+
actual = ser.serialize().text
|
|
566
|
+
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
# ===============================
|
|
570
|
+
# DocTags tests
|
|
571
|
+
# ===============================
|
|
572
|
+
|
|
573
|
+
|
|
521
574
|
def test_doctags_inline_loc_tags():
|
|
522
575
|
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
|
523
576
|
doc = DoclingDocument.load_from_json(src)
|
|
524
577
|
|
|
525
578
|
ser = DocTagsDocSerializer(doc=doc)
|
|
526
579
|
actual = ser.serialize().text
|
|
527
|
-
verify(exp_file=src.
|
|
580
|
+
verify(exp_file=src.with_suffix(".out.dt"), actual=actual)
|
|
528
581
|
|
|
529
582
|
|
|
530
583
|
def test_doctags_rich_table():
|
|
@@ -535,3 +588,12 @@ def test_doctags_rich_table():
|
|
|
535
588
|
ser = DocTagsDocSerializer(doc=doc)
|
|
536
589
|
actual = ser.serialize().text
|
|
537
590
|
verify(exp_file=exp_file, actual=actual)
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def test_doctags_inline_and_formatting():
|
|
594
|
+
src = Path("./test/data/doc/inline_and_formatting.yaml")
|
|
595
|
+
doc = DoclingDocument.load_from_yaml(src)
|
|
596
|
+
|
|
597
|
+
ser = DocTagsDocSerializer(doc=doc)
|
|
598
|
+
actual = ser.serialize().text
|
|
599
|
+
verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/table_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|